polars-df 0.23.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +127 -1
  3. data/Cargo.lock +72 -58
  4. data/README.md +31 -27
  5. data/ext/polars/Cargo.toml +15 -6
  6. data/ext/polars/src/batched_csv.rs +35 -39
  7. data/ext/polars/src/c_api/allocator.rs +7 -0
  8. data/ext/polars/src/c_api/mod.rs +1 -0
  9. data/ext/polars/src/catalog/unity.rs +123 -101
  10. data/ext/polars/src/conversion/any_value.rs +13 -17
  11. data/ext/polars/src/conversion/chunked_array.rs +5 -5
  12. data/ext/polars/src/conversion/datetime.rs +3 -2
  13. data/ext/polars/src/conversion/mod.rs +50 -45
  14. data/ext/polars/src/dataframe/export.rs +13 -13
  15. data/ext/polars/src/dataframe/general.rs +223 -223
  16. data/ext/polars/src/dataframe/io.rs +27 -141
  17. data/ext/polars/src/dataframe/mod.rs +13 -5
  18. data/ext/polars/src/dataframe/serde.rs +1 -1
  19. data/ext/polars/src/error.rs +44 -7
  20. data/ext/polars/src/exceptions.rs +45 -12
  21. data/ext/polars/src/expr/array.rs +12 -0
  22. data/ext/polars/src/expr/datatype.rs +2 -2
  23. data/ext/polars/src/expr/datetime.rs +4 -5
  24. data/ext/polars/src/expr/general.rs +49 -13
  25. data/ext/polars/src/expr/list.rs +4 -0
  26. data/ext/polars/src/expr/meta.rs +8 -3
  27. data/ext/polars/src/expr/mod.rs +22 -6
  28. data/ext/polars/src/expr/name.rs +19 -8
  29. data/ext/polars/src/expr/rolling.rs +50 -1
  30. data/ext/polars/src/expr/string.rs +0 -1
  31. data/ext/polars/src/expr/struct.rs +7 -2
  32. data/ext/polars/src/file.rs +136 -103
  33. data/ext/polars/src/functions/aggregation.rs +9 -8
  34. data/ext/polars/src/functions/io.rs +81 -10
  35. data/ext/polars/src/functions/lazy.rs +95 -21
  36. data/ext/polars/src/functions/mod.rs +2 -0
  37. data/ext/polars/src/functions/range.rs +19 -3
  38. data/ext/polars/src/functions/strings.rs +6 -0
  39. data/ext/polars/src/functions/utils.rs +6 -0
  40. data/ext/polars/src/interop/arrow/mod.rs +50 -1
  41. data/ext/polars/src/interop/arrow/{to_ruby.rs → to_rb.rs} +30 -0
  42. data/ext/polars/src/interop/arrow/to_rust.rs +43 -0
  43. data/ext/polars/src/interop/numo/to_numo_df.rs +1 -1
  44. data/ext/polars/src/interop/numo/to_numo_series.rs +1 -1
  45. data/ext/polars/src/lazyframe/exitable.rs +39 -0
  46. data/ext/polars/src/lazyframe/general.rs +340 -236
  47. data/ext/polars/src/lazyframe/mod.rs +46 -10
  48. data/ext/polars/src/lazyframe/optflags.rs +5 -4
  49. data/ext/polars/src/lazyframe/serde.rs +11 -3
  50. data/ext/polars/src/lazyframe/sink.rs +10 -5
  51. data/ext/polars/src/lazygroupby.rs +6 -7
  52. data/ext/polars/src/lib.rs +141 -76
  53. data/ext/polars/src/map/dataframe.rs +12 -12
  54. data/ext/polars/src/map/lazy.rs +7 -5
  55. data/ext/polars/src/map/mod.rs +15 -8
  56. data/ext/polars/src/map/series.rs +3 -3
  57. data/ext/polars/src/on_startup.rs +16 -8
  58. data/ext/polars/src/prelude.rs +1 -0
  59. data/ext/polars/src/rb_modules.rs +19 -49
  60. data/ext/polars/src/series/aggregation.rs +79 -140
  61. data/ext/polars/src/series/arithmetic.rs +16 -22
  62. data/ext/polars/src/series/comparison.rs +101 -222
  63. data/ext/polars/src/series/construction.rs +17 -18
  64. data/ext/polars/src/series/export.rs +1 -1
  65. data/ext/polars/src/series/general.rs +254 -289
  66. data/ext/polars/src/series/import.rs +17 -0
  67. data/ext/polars/src/series/map.rs +178 -160
  68. data/ext/polars/src/series/mod.rs +28 -12
  69. data/ext/polars/src/series/scatter.rs +12 -9
  70. data/ext/polars/src/sql.rs +16 -9
  71. data/ext/polars/src/testing/frame.rs +31 -0
  72. data/ext/polars/src/testing/mod.rs +5 -0
  73. data/ext/polars/src/testing/series.rs +31 -0
  74. data/ext/polars/src/timeout.rs +105 -0
  75. data/ext/polars/src/utils.rs +159 -1
  76. data/lib/polars/array_expr.rb +81 -12
  77. data/lib/polars/array_name_space.rb +74 -7
  78. data/lib/polars/batched_csv_reader.rb +21 -21
  79. data/lib/polars/binary_name_space.rb +1 -1
  80. data/lib/polars/cat_expr.rb +7 -7
  81. data/lib/polars/config.rb +1 -1
  82. data/lib/polars/convert.rb +189 -34
  83. data/lib/polars/data_frame.rb +1066 -831
  84. data/lib/polars/data_frame_plot.rb +173 -0
  85. data/lib/polars/data_type_group.rb +1 -0
  86. data/lib/polars/data_types.rb +31 -12
  87. data/lib/polars/date_time_expr.rb +51 -69
  88. data/lib/polars/date_time_name_space.rb +80 -112
  89. data/lib/polars/dynamic_group_by.rb +7 -7
  90. data/lib/polars/exceptions.rb +50 -10
  91. data/lib/polars/expr.rb +470 -517
  92. data/lib/polars/functions/aggregation/horizontal.rb +0 -1
  93. data/lib/polars/functions/aggregation/vertical.rb +2 -3
  94. data/lib/polars/functions/as_datatype.rb +290 -8
  95. data/lib/polars/functions/eager.rb +204 -10
  96. data/lib/polars/functions/escape_regex.rb +21 -0
  97. data/lib/polars/functions/lazy.rb +409 -169
  98. data/lib/polars/functions/lit.rb +17 -1
  99. data/lib/polars/functions/range/int_range.rb +74 -2
  100. data/lib/polars/functions/range/linear_space.rb +77 -0
  101. data/lib/polars/functions/range/time_range.rb +1 -1
  102. data/lib/polars/functions/repeat.rb +3 -12
  103. data/lib/polars/functions/whenthen.rb +2 -2
  104. data/lib/polars/group_by.rb +72 -20
  105. data/lib/polars/iceberg_dataset.rb +1 -6
  106. data/lib/polars/in_process_query.rb +37 -0
  107. data/lib/polars/io/cloud.rb +18 -0
  108. data/lib/polars/io/csv.rb +265 -126
  109. data/lib/polars/io/database.rb +0 -1
  110. data/lib/polars/io/delta.rb +15 -7
  111. data/lib/polars/io/ipc.rb +24 -17
  112. data/lib/polars/io/ndjson.rb +161 -24
  113. data/lib/polars/io/parquet.rb +101 -38
  114. data/lib/polars/lazy_frame.rb +849 -558
  115. data/lib/polars/lazy_group_by.rb +327 -2
  116. data/lib/polars/list_expr.rb +94 -16
  117. data/lib/polars/list_name_space.rb +88 -24
  118. data/lib/polars/meta_expr.rb +42 -1
  119. data/lib/polars/name_expr.rb +41 -4
  120. data/lib/polars/query_opt_flags.rb +198 -2
  121. data/lib/polars/rolling_group_by.rb +3 -3
  122. data/lib/polars/schema.rb +21 -3
  123. data/lib/polars/selector.rb +37 -2
  124. data/lib/polars/selectors.rb +45 -9
  125. data/lib/polars/series.rb +1156 -728
  126. data/lib/polars/series_plot.rb +72 -0
  127. data/lib/polars/slice.rb +1 -1
  128. data/lib/polars/sql_context.rb +11 -4
  129. data/lib/polars/string_expr.rb +59 -68
  130. data/lib/polars/string_name_space.rb +51 -87
  131. data/lib/polars/struct_expr.rb +36 -18
  132. data/lib/polars/testing.rb +24 -273
  133. data/lib/polars/utils/constants.rb +2 -0
  134. data/lib/polars/utils/construction/data_frame.rb +410 -0
  135. data/lib/polars/utils/construction/series.rb +364 -0
  136. data/lib/polars/utils/construction/utils.rb +9 -0
  137. data/lib/polars/utils/deprecation.rb +11 -0
  138. data/lib/polars/utils/serde.rb +8 -3
  139. data/lib/polars/utils/unstable.rb +19 -0
  140. data/lib/polars/utils/various.rb +59 -0
  141. data/lib/polars/utils.rb +46 -47
  142. data/lib/polars/version.rb +1 -1
  143. data/lib/polars.rb +47 -1
  144. metadata +25 -6
  145. data/ext/polars/src/allocator.rs +0 -13
  146. data/lib/polars/plot.rb +0 -109
@@ -1,8 +1,6 @@
1
1
  module Polars
2
2
  # Two-dimensional data structure representing data as a table with rows and columns.
3
3
  class DataFrame
4
- include Plot
5
-
6
4
  # @private
7
5
  attr_accessor :_df
8
6
 
@@ -43,24 +41,24 @@ module Polars
43
41
  # @param infer_schema_length [Integer]
44
42
  # The maximum number of rows to scan for schema inference. If set to `nil`, the
45
43
  # full data may be scanned *(this can be slow)*. This parameter only applies if
46
- # the input data is a sequence or generator of rows; other input is read as-is.
44
+ # the input data is an array or generator of rows; other input is read as-is.
47
45
  # @param nan_to_null [Boolean]
48
46
  # If the data comes from one or more Numo arrays, can optionally convert input
49
47
  # data NaN values to null instead. This is a no-op for all other input data.
50
- def initialize(data = nil, schema: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: 100, nan_to_null: false)
48
+ def initialize(data = nil, schema: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: N_INFER_DEFAULT, nan_to_null: false)
51
49
  if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
52
50
  raise ArgumentError, "Use read_database instead"
53
51
  end
54
52
 
55
53
  if data.nil?
56
- self._df = self.class.hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
54
+ self._df = Utils.hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
57
55
  elsif data.is_a?(Hash)
58
56
  data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
59
- self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null)
57
+ self._df = Utils.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null)
60
58
  elsif data.is_a?(::Array)
61
- self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, orient: orient, infer_schema_length: infer_schema_length)
59
+ self._df = Utils.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, orient: orient, infer_schema_length: infer_schema_length)
62
60
  elsif data.is_a?(Series)
63
- self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict)
61
+ self._df = Utils.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict)
64
62
  elsif data.respond_to?(:arrow_c_stream)
65
63
  # This uses the fact that RbSeries.from_arrow_c_stream will create a
66
64
  # struct-typed Series. Then we unpack that to a DataFrame.
@@ -116,6 +114,45 @@ module Polars
116
114
  df
117
115
  end
118
116
 
117
+ # Plot data.
118
+ #
119
+ # @return [Object]
120
+ def plot(x = nil, y = nil, type: nil, group: nil, stacked: nil)
121
+ plot = DataFramePlot.new(self)
122
+ return plot if x.nil? && y.nil?
123
+
124
+ raise ArgumentError, "Must specify columns" if x.nil? || y.nil?
125
+ type ||= begin
126
+ if self[x].dtype.numeric? && self[y].dtype.numeric?
127
+ "scatter"
128
+ elsif self[x].dtype == String && self[y].dtype.numeric?
129
+ "column"
130
+ elsif (self[x].dtype == Date || self[x].dtype == Datetime) && self[y].dtype.numeric?
131
+ "line"
132
+ else
133
+ raise "Cannot determine type. Use the type option."
134
+ end
135
+ end
136
+
137
+ case type
138
+ when "line"
139
+ plot.line(x, y, color: group)
140
+ when "area"
141
+ plot.area(x, y, color: group)
142
+ when "pie"
143
+ raise ArgumentError, "Cannot use group option with pie chart" unless group.nil?
144
+ plot.pie(x, y)
145
+ when "column"
146
+ plot.column(x, y, color: group, stacked: stacked)
147
+ when "bar"
148
+ plot.bar(x, y, color: group, stacked: stacked)
149
+ when "scatter"
150
+ plot.scatter(x, y, color: group)
151
+ else
152
+ raise ArgumentError, "Invalid type: #{type}"
153
+ end
154
+ end
155
+
119
156
  # Get the shape of the DataFrame.
120
157
  #
121
158
  # @return [Array]
@@ -244,9 +281,9 @@ module Polars
244
281
  # }
245
282
  # )
246
283
  # df.schema
247
- # # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String}
284
+ # # => Polars::Schema({"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String})
248
285
  def schema
249
- columns.zip(dtypes).to_h
286
+ Schema.new(columns.zip(dtypes).to_h)
250
287
  end
251
288
 
252
289
  # Equal.
@@ -383,142 +420,243 @@ module Polars
383
420
  # Returns subset of the DataFrame.
384
421
  #
385
422
  # @return [Object]
386
- def [](*args)
387
- if args.size == 2
388
- row_selection, col_selection = args
389
-
390
- # df[.., unknown]
391
- if row_selection.is_a?(Range)
392
-
393
- # multiple slices
394
- # df[.., ..]
395
- if col_selection.is_a?(Range)
396
- raise Todo
397
- end
398
- end
399
-
400
- # df[2, ..] (select row as df)
401
- if row_selection.is_a?(Integer)
402
- if col_selection.is_a?(::Array)
403
- df = self[0.., col_selection]
404
- return df.slice(row_selection, 1)
405
- end
406
- # df[2, "a"]
407
- if col_selection.is_a?(::String) || col_selection.is_a?(Symbol)
408
- return self[col_selection][row_selection]
409
- end
410
- end
411
-
412
- # column selection can be "a" and ["a", "b"]
413
- if col_selection.is_a?(::String) || col_selection.is_a?(Symbol)
414
- col_selection = [col_selection]
415
- end
416
-
417
- # df[.., 1]
418
- if col_selection.is_a?(Integer)
419
- series = to_series(col_selection)
420
- return series[row_selection]
421
- end
422
-
423
- if col_selection.is_a?(::Array)
424
- # df[.., [1, 2]]
425
- if Utils.is_int_sequence(col_selection)
426
- series_list = col_selection.map { |i| to_series(i) }
427
- df = self.class.new(series_list)
428
- return df[row_selection]
429
- end
430
- end
431
-
432
- df = self[col_selection]
433
- return df[row_selection]
434
- elsif args.size == 1
435
- item = args[0]
436
-
437
- # select single column
438
- # df["foo"]
439
- if item.is_a?(::String) || item.is_a?(Symbol)
440
- return Utils.wrap_s(_df.get_column(item.to_s))
441
- end
442
-
443
- # df[idx]
444
- if item.is_a?(Integer)
445
- return slice(_pos_idx(item, 0), 1)
446
- end
447
-
448
- # df[..]
449
- if item.is_a?(Range)
450
- return Slice.new(self).apply(item)
451
- end
452
-
453
- if item.is_a?(::Array) && item.all? { |v| Utils.strlike?(v) }
454
- # select multiple columns
455
- # df[["foo", "bar"]]
456
- return _from_rbdf(_df.select(item.map(&:to_s)))
457
- end
458
-
459
- if Utils.is_int_sequence(item)
460
- item = Series.new("", item)
461
- end
462
-
463
- if item.is_a?(Series)
464
- dtype = item.dtype
465
- if dtype == String
466
- return _from_rbdf(_df.select(item))
467
- elsif dtype == UInt32
468
- return _from_rbdf(_df.take_with_series(item._s))
469
- elsif [UInt8, UInt16, UInt64, Int8, Int16, Int32, Int64].include?(dtype)
470
- return _from_rbdf(
471
- _df.take_with_series(_pos_idxs(item, 0)._s)
472
- )
473
- end
474
- end
475
- end
476
-
477
- # Ruby-specific
478
- if item.is_a?(Expr) || item.is_a?(Series)
479
- return filter(item)
480
- end
481
-
482
- raise ArgumentError, "Cannot get item of type: #{item.class.name}"
423
+ #
424
+ # @example
425
+ # df = Polars::DataFrame.new(
426
+ # {"a" => [1, 2, 3], "d" => [4, 5, 6], "c" => [1, 3, 2], "b" => [7, 8, 9]}
427
+ # )
428
+ # df[0]
429
+ # # =>
430
+ # # shape: (1, 4)
431
+ # # ┌─────┬─────┬─────┬─────┐
432
+ # # │ a ┆ d ┆ c ┆ b │
433
+ # # │ --- ┆ --- ┆ --- ┆ --- │
434
+ # # │ i64 ┆ i64 ┆ i64 ┆ i64 │
435
+ # # ╞═════╪═════╪═════╪═════╡
436
+ # # │ 1 ┆ 4 ┆ 1 ┆ 7 │
437
+ # # └─────┴─────┴─────┴─────┘
438
+ #
439
+ # @example
440
+ # df[0, "a"]
441
+ # # => 1
442
+ #
443
+ # @example
444
+ # df["a"]
445
+ # # =>
446
+ # # shape: (3,)
447
+ # # Series: 'a' [i64]
448
+ # # [
449
+ # # 1
450
+ # # 2
451
+ # # 3
452
+ # # ]
453
+ #
454
+ # @example
455
+ # df[0..1]
456
+ # # =>
457
+ # # shape: (2, 4)
458
+ # # ┌─────┬─────┬─────┬─────┐
459
+ # # │ a ┆ d ┆ c ┆ b │
460
+ # # │ --- ┆ --- ┆ --- ┆ --- │
461
+ # # │ i64 ┆ i64 ┆ i64 ┆ i64 │
462
+ # # ╞═════╪═════╪═════╪═════╡
463
+ # # │ 1   ┆ 4   ┆ 1   ┆ 7   │
464
+ # # │ 2   ┆ 5   ┆ 3   ┆ 8   │
465
+ # # └─────┴─────┴─────┴─────┘
466
+ #
467
+ # @example
468
+ # df[0..1, "a"]
469
+ # # =>
470
+ # # shape: (2,)
471
+ # # Series: 'a' [i64]
472
+ # # [
473
+ # # 1
474
+ # # 2
475
+ # # ]
476
+ #
477
+ # @example
478
+ # df[0..1, 0]
479
+ # # =>
480
+ # # shape: (2,)
481
+ # # Series: 'a' [i64]
482
+ # # [
483
+ # # 1
484
+ # # 2
485
+ # # ]
486
+ #
487
+ # @example
488
+ # df[[0, 1], [0, 1, 2]]
489
+ # # =>
490
+ # # shape: (2, 3)
491
+ # # ┌─────┬─────┬─────┐
492
+ # # │ a   ┆ d   ┆ c   │
493
+ # # │ --- ┆ --- ┆ --- │
494
+ # # │ i64 ┆ i64 ┆ i64 │
495
+ # # ╞═════╪═════╪═════╡
496
+ # # │ 1 ┆ 4 ┆ 1 │
497
+ # # │ 2   ┆ 5   ┆ 3   │
498
+ # # └─────┴─────┴─────┘
499
+ #
500
+ # @example
501
+ # df[0..1, ["a", "c"]]
502
+ # # =>
503
+ # # shape: (2, 2)
504
+ # # ┌─────┬─────┐
505
+ # # │ a ┆ c │
506
+ # # │ --- ┆ --- │
507
+ # # │ i64 ┆ i64 │
508
+ # # ╞═════╪═════╡
509
+ # # │ 1 ┆ 1 │
510
+ # # │ 2 ┆ 3 │
511
+ # # └─────┴─────┘
512
+ #
513
+ # @example
514
+ # df[0.., 0..1]
515
+ # # =>
516
+ # # shape: (3, 2)
517
+ # # ┌─────┬─────┐
518
+ # # │ a ┆ d │
519
+ # # │ --- ┆ --- │
520
+ # # │ i64 ┆ i64 │
521
+ # # ╞═════╪═════╡
522
+ # # │ 1 ┆ 4 │
523
+ # # │ 2 ┆ 5 │
524
+ # # │ 3 ┆ 6 │
525
+ # # └─────┴─────┘
526
+ #
527
+ # @example
528
+ # df[0.., "a".."c"]
529
+ # # =>
530
+ # # shape: (3, 3)
531
+ # # ┌─────┬─────┬─────┐
532
+ # # │ a ┆ d ┆ c │
533
+ # # │ --- ┆ --- ┆ --- │
534
+ # # │ i64 ┆ i64 ┆ i64 │
535
+ # # ╞═════╪═════╪═════╡
536
+ # # │ 1 ┆ 4 ┆ 1 │
537
+ # # │ 2 ┆ 5 ┆ 3 │
538
+ # # │ 3 ┆ 6 ┆ 2 │
539
+ # # └─────┴─────┴─────┘
540
+ def [](*key)
541
+ get_df_item_by_key(self, key)
483
542
  end
484
543
 
485
544
  # Set item.
486
545
  #
487
546
  # @return [Object]
547
+ #
548
+ # @example `df[["a", "b"]] = value`:
549
+ # df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [4, 5, 6]})
550
+ # df[["a", "b"]] = [[10, 40], [20, 50], [30, 60]]
551
+ # df
552
+ # # =>
553
+ # # shape: (3, 2)
554
+ # # ┌─────┬─────┐
555
+ # # │ a ┆ b │
556
+ # # │ --- ┆ --- │
557
+ # # │ i64 ┆ i64 │
558
+ # # ╞═════╪═════╡
559
+ # # │ 10 ┆ 40 │
560
+ # # │ 20 ┆ 50 │
561
+ # # │ 30 ┆ 60 │
562
+ # # └─────┴─────┘
563
+ #
564
+ # @example `df[row_idx, "a"] = value`:
565
+ # df[1, "a"] = 100
566
+ # df
567
+ # # =>
568
+ # # shape: (3, 2)
569
+ # # ┌─────┬─────┐
570
+ # # │ a ┆ b │
571
+ # # │ --- ┆ --- │
572
+ # # │ i64 ┆ i64 │
573
+ # # ╞═════╪═════╡
574
+ # # │ 10 ┆ 40 │
575
+ # # │ 100 ┆ 50 │
576
+ # # │ 30 ┆ 60 │
577
+ # # └─────┴─────┘
578
+ #
579
+ # @example `df[row_idx, col_idx] = value`:
580
+ # df[0, 1] = 30
581
+ # df
582
+ # # =>
583
+ # # shape: (3, 2)
584
+ # # ┌─────┬─────┐
585
+ # # │ a ┆ b │
586
+ # # │ --- ┆ --- │
587
+ # # │ i64 ┆ i64 │
588
+ # # ╞═════╪═════╡
589
+ # # │ 10 ┆ 30 │
590
+ # # │ 100 ┆ 50 │
591
+ # # │ 30 ┆ 60 │
592
+ # # └─────┴─────┘
488
593
  def []=(*key, value)
489
- if key.length == 1
490
- key = key.first
491
- elsif key.length != 2
594
+ if key.empty? || key.length > 2
492
595
  raise ArgumentError, "wrong number of arguments (given #{key.length + 1}, expected 2..3)"
493
596
  end
494
597
 
495
- if Utils.strlike?(key)
598
+ if key.length == 1 && Utils.strlike?(key[0])
599
+ key = key[0]
600
+
496
601
  if value.is_a?(::Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
497
602
  value = Series.new(value)
498
603
  elsif !value.is_a?(Series)
499
604
  value = Polars.lit(value)
500
605
  end
501
- self._df = with_column(value.alias(key.to_s))._df
502
- elsif key.is_a?(::Array)
606
+ self._df = with_columns(value.alias(key.to_s))._df
607
+
608
+ # df[["C", "D"]]
609
+ elsif key.length == 1 && key[0].is_a?(::Array)
610
+ key = key[0]
611
+
612
+ if !value.is_a?(::Array) || !value.all? { |v| v.is_a?(::Array) }
613
+ msg = "can only set multiple columns with 2D matrix"
614
+ raise ArgumentError, msg
615
+ end
616
+ if value.any? { |v| v.size != key.length }
617
+ msg = "matrix columns should be equal to list used to determine column names"
618
+ raise ArgumentError, msg
619
+ end
620
+
621
+ columns = []
622
+ key.each_with_index do |name, i|
623
+ columns << Series.new(name, value.map { |v| v[i] })
624
+ end
625
+ self._df = with_columns(columns)._df
626
+
627
+ # df[a, b]
628
+ else
503
629
  row_selection, col_selection = key
504
630
 
631
+ if (row_selection.is_a?(Series) && row_selection.dtype == Boolean) || Utils.is_bool_sequence(row_selection)
632
+ msg = (
633
+ "not allowed to set DataFrame by boolean mask in the row position" +
634
+ "\n\nConsider using `DataFrame.with_columns`."
635
+ )
636
+ raise TypeError, msg
637
+ end
638
+
639
+ # get series column selection
505
640
  if Utils.strlike?(col_selection)
506
641
  s = self[col_selection]
507
642
  elsif col_selection.is_a?(Integer)
508
- raise Todo
643
+ s = self[0.., col_selection]
509
644
  else
510
- raise ArgumentError, "column selection not understood: #{col_selection}"
645
+ msg = "unexpected column selection #{col_selection.inspect}"
646
+ raise TypeError, msg
511
647
  end
512
648
 
649
+ # dispatch to []= of Series to do modification
513
650
  s[row_selection] = value
514
651
 
652
+ # now find the location to place series
653
+ # df[idx]
515
654
  if col_selection.is_a?(Integer)
516
655
  replace_column(col_selection, s)
656
+ # df["foo"]
517
657
  elsif Utils.strlike?(col_selection)
518
- replace(col_selection, s)
658
+ _replace(col_selection.to_s, s)
519
659
  end
520
- else
521
- raise Todo
522
660
  end
523
661
  end
524
662
 
@@ -566,22 +704,55 @@ module Polars
566
704
  Schema.new(columns.zip(dtypes), check_dtypes: false)
567
705
  end
568
706
 
569
- # Return the dataframe as a scalar.
707
+ # Return the DataFrame as a scalar, or return the element at the given row/column.
570
708
  #
571
- # Equivalent to `df[0,0]`, with a check that the shape is (1,1).
709
+ # @param row [Integer]
710
+ # Optional row index.
711
+ # @param column [Integer, String]
712
+ # Optional column index or name.
572
713
  #
573
714
  # @return [Object]
574
715
  #
716
+ # @note
717
+ # If row/col not provided, this is equivalent to `df[0,0]`, with a check that
718
+ # the shape is (1,1). With row/col, this is equivalent to `df[row,col]`.
719
+ #
575
720
  # @example
576
721
  # df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [4, 5, 6]})
577
- # result = df.select((Polars.col("a") * Polars.col("b")).sum)
578
- # result.item
722
+ # df.select((Polars.col("a") * Polars.col("b")).sum).item
579
723
  # # => 32
580
- def item
581
- if shape != [1, 1]
582
- raise ArgumentError, "Can only call .item if the dataframe is of shape (1,1), dataframe is of shape #{shape}"
724
+ #
725
+ # @example
726
+ # df.item(1, 1)
727
+ # # => 5
728
+ #
729
+ # @example
730
+ # df.item(2, "b")
731
+ # # => 6
732
+ def item(row = nil, column = nil)
733
+ if row.nil? && column.nil?
734
+ if shape != [1, 1]
735
+ msg = (
736
+ "can only call `.item()` if the dataframe is of shape (1, 1)," +
737
+ " or if explicit row/col values are provided;" +
738
+ " frame has shape #{shape.inspect}"
739
+ )
740
+ raise ArgumentError, msg
741
+ end
742
+ return _df.to_series(0).get_index(0)
743
+
744
+ elsif row.nil? || column.nil?
745
+ msg = "cannot call `.item()` with only one of `row` or `column`"
746
+ raise ArgumentError, msg
583
747
  end
584
- self[0, 0]
748
+
749
+ s =
750
+ if column.is_a?(Integer)
751
+ _df.to_series(column)
752
+ else
753
+ _df.get_column(column)
754
+ end
755
+ s.get_index_signed(row)
585
756
  end
586
757
 
587
758
  # no to_arrow
@@ -661,7 +832,7 @@ module Polars
661
832
  if index < 0
662
833
  index = columns.length + index
663
834
  end
664
- Utils.wrap_s(_df.select_at_idx(index))
835
+ Utils.wrap_s(_df.to_series(index))
665
836
  end
666
837
 
667
838
  # Serialize this DataFrame to a file or string.
@@ -758,25 +929,26 @@ module Polars
758
929
  # df.write_ndjson
759
930
  # # => "{\"foo\":1,\"bar\":6}\n{\"foo\":2,\"bar\":7}\n{\"foo\":3,\"bar\":8}\n"
760
931
  def write_ndjson(file = nil)
761
- if Utils.pathlike?(file)
762
- file = Utils.normalize_filepath(file)
932
+ should_return_buffer = false
933
+ target = nil
934
+ if file.nil?
935
+ target = StringIO.new
936
+ target.set_encoding(Encoding::BINARY)
937
+ should_return_buffer = true
938
+ elsif Utils.pathlike?(file)
939
+ target = Utils.normalize_filepath(file)
940
+ else
941
+ target = file
763
942
  end
764
- to_string_io = !file.nil? && file.is_a?(StringIO)
765
- if file.nil? || to_string_io
766
- buf = StringIO.new
767
- buf.set_encoding(Encoding::BINARY)
768
- _df.write_ndjson(buf)
769
- json_bytes = buf.string
770
943
 
771
- json_str = json_bytes.force_encoding(Encoding::UTF_8)
772
- if to_string_io
773
- file.write(json_str)
774
- else
775
- return json_str
776
- end
777
- else
778
- _df.write_ndjson(file)
944
+ lazy.sink_ndjson(
945
+ target
946
+ )
947
+
948
+ if should_return_buffer
949
+ return target.string.force_encoding(Encoding::UTF_8)
779
950
  end
951
+
780
952
  nil
781
953
  end
782
954
 
@@ -787,9 +959,9 @@ module Polars
787
959
  # (default), the output is returned as a string instead.
788
960
  # @param include_header [Boolean]
789
961
  # Whether to include header in the CSV output.
790
- # @param sep [String]
962
+ # @param separator [String]
791
963
  # Separate CSV fields with this symbol.
792
- # @param quote [String]
964
+ # @param quote_char [String]
793
965
  # Byte to use as quoting character.
794
966
  # @param batch_size [Integer]
795
967
  # Number of rows that will be processed per thread.
@@ -808,8 +980,8 @@ module Polars
808
980
  # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
809
981
  # Rust crate.
810
982
  # @param float_precision [Integer, nil]
811
- # Number of decimal places to write, applied to both `:f32` and
812
- # `:f64` datatypes.
983
+ # Number of decimal places to write, applied to both `Float32` and
984
+ # `Float64` datatypes.
813
985
  # @param null_value [String, nil]
814
986
  # A string representing null values (defaulting to the empty string).
815
987
  #
@@ -826,38 +998,52 @@ module Polars
826
998
  # df.write_csv("file.csv")
827
999
  def write_csv(
828
1000
  file = nil,
1001
+ include_bom: false,
829
1002
  include_header: true,
830
- sep: ",",
831
- quote: '"',
1003
+ separator: ",",
1004
+ line_terminator: "\n",
1005
+ quote_char: '"',
832
1006
  batch_size: 1024,
833
1007
  datetime_format: nil,
834
1008
  date_format: nil,
835
1009
  time_format: nil,
1010
+ float_scientific: nil,
836
1011
  float_precision: nil,
837
- null_value: nil
1012
+ decimal_comma: false,
1013
+ null_value: nil,
1014
+ quote_style: nil,
1015
+ storage_options: nil,
1016
+ credential_provider: "auto",
1017
+ retries: 2
838
1018
  )
839
- if sep.length > 1
840
- raise ArgumentError, "only single byte separator is allowed"
841
- elsif quote.length > 1
842
- raise ArgumentError, "only single byte quote char is allowed"
843
- elsif null_value == ""
1019
+ Utils._check_arg_is_1byte("separator", separator, false)
1020
+ Utils._check_arg_is_1byte("quote_char", quote_char, true)
1021
+ if null_value == ""
844
1022
  null_value = nil
845
1023
  end
846
1024
 
847
1025
  if file.nil?
848
1026
  buffer = StringIO.new
849
1027
  buffer.set_encoding(Encoding::BINARY)
850
- _df.write_csv(
1028
+ lazy.sink_csv(
851
1029
  buffer,
852
- include_header,
853
- sep.ord,
854
- quote.ord,
855
- batch_size,
856
- datetime_format,
857
- date_format,
858
- time_format,
859
- float_precision,
860
- null_value
1030
+ include_bom: include_bom,
1031
+ include_header: include_header,
1032
+ separator: separator,
1033
+ line_terminator: line_terminator,
1034
+ quote_char: quote_char,
1035
+ batch_size: batch_size,
1036
+ datetime_format: datetime_format,
1037
+ date_format: date_format,
1038
+ time_format: time_format,
1039
+ float_scientific: float_scientific,
1040
+ float_precision: float_precision,
1041
+ decimal_comma: decimal_comma,
1042
+ null_value: null_value,
1043
+ quote_style: quote_style,
1044
+ storage_options: storage_options,
1045
+ credential_provider: credential_provider,
1046
+ retries: retries
861
1047
  )
862
1048
  return buffer.string.force_encoding(Encoding::UTF_8)
863
1049
  end
@@ -866,17 +1052,25 @@ module Polars
866
1052
  file = Utils.normalize_filepath(file)
867
1053
  end
868
1054
 
869
- _df.write_csv(
1055
+ lazy.sink_csv(
870
1056
  file,
871
- include_header,
872
- sep.ord,
873
- quote.ord,
874
- batch_size,
875
- datetime_format,
876
- date_format,
877
- time_format,
878
- float_precision,
879
- null_value,
1057
+ include_bom: include_bom,
1058
+ include_header: include_header,
1059
+ separator: separator,
1060
+ line_terminator: line_terminator,
1061
+ quote_char: quote_char,
1062
+ batch_size: batch_size,
1063
+ datetime_format: datetime_format,
1064
+ date_format: date_format,
1065
+ time_format: time_format,
1066
+ float_scientific: float_scientific,
1067
+ float_precision: float_precision,
1068
+ decimal_comma: decimal_comma,
1069
+ null_value: null_value,
1070
+ quote_style: quote_style,
1071
+ storage_options: storage_options,
1072
+ credential_provider: credential_provider,
1073
+ retries: retries
880
1074
  )
881
1075
  nil
882
1076
  end
@@ -934,6 +1128,10 @@ module Polars
934
1128
  #
935
1129
  # If `storage_options` is not provided, Polars will try to infer the
936
1130
  # information from environment variables.
1131
+ # @param credential_provider [Object]
1132
+ # Provide a function that can be called to provide cloud storage
1133
+ # credentials. The function is expected to return a hash of
1134
+ # credential keys along with an optional credential expiry time.
937
1135
  # @param retries [Integer]
938
1136
  # Number of retries if accessing a cloud instance fails.
939
1137
  #
@@ -943,33 +1141,27 @@ module Polars
943
1141
  compression: "uncompressed",
944
1142
  compat_level: nil,
945
1143
  storage_options: nil,
1144
+ credential_provider: "auto",
946
1145
  retries: 2
947
1146
  )
948
1147
  return_bytes = file.nil?
949
- if return_bytes
950
- file = StringIO.new
951
- file.set_encoding(Encoding::BINARY)
952
- end
953
- if Utils.pathlike?(file)
954
- file = Utils.normalize_filepath(file)
955
- end
956
-
957
- if compat_level.nil?
958
- compat_level = true
959
- end
960
-
961
- if compression.nil?
962
- compression = "uncompressed"
963
- end
964
-
965
- if storage_options&.any?
966
- storage_options = storage_options.to_a
1148
+ target = nil
1149
+ if file.nil?
1150
+ target = StringIO.new
1151
+ target.set_encoding(Encoding::BINARY)
967
1152
  else
968
- storage_options = nil
1153
+ target = file
969
1154
  end
970
1155
 
971
- _df.write_ipc(file, compression, compat_level, storage_options, retries)
972
- return_bytes ? file.string : nil
1156
+ lazy.sink_ipc(
1157
+ target,
1158
+ compression: compression,
1159
+ compat_level: compat_level,
1160
+ storage_options: storage_options,
1161
+ credential_provider: credential_provider,
1162
+ retries: retries
1163
+ )
1164
+ return_bytes ? target.string : nil
973
1165
  end
974
1166
 
975
1167
  # Write to Arrow IPC record batch stream.
@@ -1049,9 +1241,16 @@ module Polars
1049
1241
  file,
1050
1242
  compression: "zstd",
1051
1243
  compression_level: nil,
1052
- statistics: false,
1244
+ statistics: true,
1053
1245
  row_group_size: nil,
1054
- data_page_size: nil
1246
+ data_page_size: nil,
1247
+ partition_by: nil,
1248
+ partition_chunk_size_bytes: 4_294_967_296,
1249
+ storage_options: nil,
1250
+ credential_provider: "auto",
1251
+ retries: 2,
1252
+ metadata: nil,
1253
+ mkdir: false
1055
1254
  )
1056
1255
  if compression.nil?
1057
1256
  compression = "uncompressed"
@@ -1060,26 +1259,23 @@ module Polars
1060
1259
  file = Utils.normalize_filepath(file)
1061
1260
  end
1062
1261
 
1063
- if statistics == true
1064
- statistics = {
1065
- min: true,
1066
- max: true,
1067
- distinct_count: false,
1068
- null_count: true
1069
- }
1070
- elsif statistics == false
1071
- statistics = {}
1072
- elsif statistics == "full"
1073
- statistics = {
1074
- min: true,
1075
- max: true,
1076
- distinct_count: true,
1077
- null_count: true
1078
- }
1262
+ target = file
1263
+ if !partition_by.nil?
1264
+ raise Todo
1079
1265
  end
1080
1266
 
1081
- _df.write_parquet(
1082
- file, compression, compression_level, statistics, row_group_size, data_page_size
1267
+ lazy.sink_parquet(
1268
+ target,
1269
+ compression: compression,
1270
+ compression_level: compression_level,
1271
+ statistics: statistics,
1272
+ row_group_size: row_group_size,
1273
+ data_page_size: data_page_size,
1274
+ storage_options: storage_options,
1275
+ credential_provider: credential_provider,
1276
+ retries: retries,
1277
+ metadata: metadata,
1278
+ mkdir: mkdir
1083
1279
  )
1084
1280
  end
1085
1281
 
@@ -1332,7 +1528,7 @@ module Polars
1332
1528
  # "y" => 1_000_000.times.map { |v| v / 1000.0 },
1333
1529
  # "z" => 1_000_000.times.map(&:to_s)
1334
1530
  # },
1335
- # schema: {"x" => :u32, "y" => :f64, "z" => :str}
1531
+ # schema: {"x" => Polars::UInt32, "y" => Polars::Float64, "z" => Polars::String}
1336
1532
  # )
1337
1533
  # df.estimated_size
1338
1534
  # # => 25888898
@@ -1464,14 +1660,14 @@ module Polars
1464
1660
  # # │ 3 ┆ 8 ┆ c │
1465
1661
  # # └───────┴─────┴─────┘
1466
1662
  def rename(mapping, strict: true)
1467
- lazy.rename(mapping, strict: strict).collect(no_optimization: true)
1663
+ lazy.rename(mapping, strict: strict).collect(optimizations: QueryOptFlags._eager)
1468
1664
  end
1469
1665
 
1470
1666
  # Insert a Series at a certain column index. This operation is in place.
1471
1667
  #
1472
1668
  # @param index [Integer]
1473
1669
  # Column to insert the new `Series` column.
1474
- # @param series [Series]
1670
+ # @param column [Series]
1475
1671
  # `Series` to insert.
1476
1672
  #
1477
1673
  # @return [DataFrame]
@@ -1514,19 +1710,22 @@ module Polars
1514
1710
  # # │ 3 ┆ 10.0 ┆ false ┆ 20.5 │
1515
1711
  # # │ 4 ┆ 13.0 ┆ true ┆ 0.0 │
1516
1712
  # # └─────┴──────┴───────┴──────┘
1517
- def insert_column(index, series)
1713
+ def insert_column(index, column)
1518
1714
  if index < 0
1519
- index = columns.length + index
1715
+ index = width + index
1520
1716
  end
1521
- _df.insert_column(index, series._s)
1717
+ _df.insert_column(index, column._s)
1522
1718
  self
1523
1719
  end
1524
- alias_method :insert_at_idx, :insert_column
1525
1720
 
1526
1721
  # Filter the rows in the DataFrame based on a predicate expression.
1527
1722
  #
1528
- # @param predicate [Expr]
1529
- # Expression that evaluates to a boolean Series.
1723
+ # @param predicates [Array]
1724
+ # Expression(s) that evaluate to a boolean Series.
1725
+ # @param constraints [Hash]
1726
+ # Column filters; use `name = value` to filter columns by the supplied value.
1727
+ # Each constraint will behave the same as `Polars.col(name).eq(value)`, and
1728
+ # be implicitly joined with the other filter conditions using `&`.
1530
1729
  #
1531
1730
  # @return [DataFrame]
1532
1731
  #
@@ -1561,15 +1760,15 @@ module Polars
1561
1760
  # # ╞═════╪═════╪═════╡
1562
1761
  # # │ 1 ┆ 6 ┆ a │
1563
1762
  # # └─────┴─────┴─────┘
1564
- def filter(predicate)
1565
- lazy.filter(predicate).collect
1763
+ def filter(*predicates, **constraints)
1764
+ lazy.filter(*predicates, **constraints).collect(optimizations: QueryOptFlags._eager)
1566
1765
  end
1567
1766
 
1568
1767
  # Remove rows, dropping those that match the given predicate expression(s).
1569
1768
  #
1570
1769
  # The original order of the remaining rows is preserved.
1571
1770
  #
1572
- # Rows where the filter predicate does not evaluate to True are retained
1771
+ # Rows where the filter predicate does not evaluate to true are retained
1573
1772
  # (this includes rows where the predicate evaluates as `null`).
1574
1773
  #
1575
1774
  # @param predicates [Array]
@@ -1682,77 +1881,178 @@ module Polars
1682
1881
  )
1683
1882
  lazy
1684
1883
  .remove(*predicates, **constraints)
1685
- .collect(_eager: true)
1884
+ .collect(optimizations: QueryOptFlags._eager)
1686
1885
  end
1687
1886
 
1688
- # Summary statistics for a DataFrame.
1887
+ # Return a dense preview of the DataFrame.
1689
1888
  #
1690
- # @return [DataFrame]
1889
+ # The formatting shows one line per column so that wide dataframes display
1890
+ # cleanly. Each line shows the column name, the data type, and the first
1891
+ # few values.
1691
1892
  #
1692
- # @example
1893
+ # @param max_items_per_column [Integer]
1894
+ # Maximum number of items to show per column.
1895
+ # @param max_colname_length [Integer]
1896
+ # Maximum length of the displayed column names; values that exceed
1897
+ # this value are truncated with a trailing ellipsis.
1898
+ # @param return_type [nil, 'self', 'frame', 'string']
1899
+ # Modify the return format:
1900
+ #
1901
+ # - `nil` (default): Print the glimpse output to stdout, returning `nil`.
1902
+ # - `"self"`: Print the glimpse output to stdout, returning the *original* frame.
1903
+ # - `"frame"`: Return the glimpse output as a new DataFrame.
1904
+ # - `"string"`: Return the glimpse output as a string.
1905
+ #
1906
+ # @return [Object]
1907
+ #
1908
+ # @example Return the glimpse output as a DataFrame:
1693
1909
  # df = Polars::DataFrame.new(
1694
1910
  # {
1695
1911
  # "a" => [1.0, 2.8, 3.0],
1696
1912
  # "b" => [4, 5, nil],
1697
1913
  # "c" => [true, false, true],
1698
1914
  # "d" => [nil, "b", "c"],
1699
- # "e" => ["usd", "eur", nil]
1915
+ # "e" => ["usd", "eur", nil],
1916
+ # "f" => [Date.new(2020, 1, 1), Date.new(2021, 1, 2), Date.new(2022, 1, 1)]
1700
1917
  # }
1701
1918
  # )
1702
- # df.describe
1919
+ # df.glimpse(return_type: "frame")
1703
1920
  # # =>
1704
- # # shape: (7, 6)
1705
- # # ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┐
1706
- # # │ describe ┆ a b ┆ c d ┆ e
1707
- # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ ---
1708
- # # │ str f64 ┆ f64 ┆ f64 ┆ str ┆ str
1709
- # # ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╡
1710
- # # │ count3.0 3.0 3.0 ┆ 3 ┆ 3
1711
- # # │ null_count ┆ 0.01.0 0.0 ┆ 1 ┆ 1
1712
- # # │ mean ┆ 2.266667 ┆ 4.50.666667 null null
1713
- # # │ std 1.101514 0.707107 ┆ 0.57735 ┆ null null
1714
- # # │ min ┆ 1.04.0 0.0 ┆ b ┆ eur
1715
- # # │ max ┆ 3.05.0 1.0 ┆ c ┆ usd
1716
- # # │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null │
1717
- # # └────────────┴──────────┴──────────┴──────────┴──────┴──────┘
1718
- def describe
1719
- describe_cast = lambda do |stat|
1720
- columns = []
1721
- self.columns.each_with_index do |s, i|
1722
- if self[s].is_numeric || self[s].is_boolean
1723
- columns << stat[0.., i].cast(:f64)
1724
- else
1725
- # for dates, strings, etc, we cast to string so that all
1726
- # statistics can be shown
1727
- columns << stat[0.., i].cast(:str)
1728
- end
1921
+ # # shape: (6, 3)
1922
+ # # ┌────────┬───────┬─────────────────────────────────┐
1923
+ # # │ columndtypevalues
1924
+ # # │ --- ┆ --- ┆ ---
1925
+ # # │ str ┆ str list[str]
1926
+ # # ╞════════╪═══════╪═════════════════════════════════╡
1927
+ # # │ af64 ["1.0", "2.8", "3.0"]
1928
+ # # │ bi64 ["4", "5", null]
1929
+ # # │ cbool ["true", "false", "true"]
1930
+ # # │ d str [null, ""b"", ""c""]
1931
+ # # │ estr [""usd"", ""eur"", null]
1932
+ # # │ fdate ["2020-01-01", "2021-01-02", "…
1933
+ # # └────────┴───────┴─────────────────────────────────┘
1934
+ def glimpse(
1935
+ max_items_per_column: 10,
1936
+ max_colname_length: 50,
1937
+ return_type: nil
1938
+ )
1939
+ if return_type.nil?
1940
+ return_frame = false
1941
+ else
1942
+ return_frame = return_type == "frame"
1943
+ if !return_frame && !["self", "string"].include?(return_type)
1944
+ msg = "invalid `return_type`; found #{return_type.inspect}, expected one of 'string', 'frame', 'self', or nil"
1945
+ raise ArgumentError, msg
1729
1946
  end
1730
- self.class.new(columns)
1731
1947
  end
1732
1948
 
1733
- summary = _from_rbdf(
1734
- Polars.concat(
1735
- [
1736
- describe_cast.(
1737
- self.class.new(columns.to_h { |c| [c, [height]] })
1738
- ),
1739
- describe_cast.(null_count),
1740
- describe_cast.(mean),
1741
- describe_cast.(std),
1742
- describe_cast.(min),
1743
- describe_cast.(max),
1744
- describe_cast.(median)
1745
- ]
1746
- )._df
1747
- )
1748
- summary.insert_column(
1749
- 0,
1750
- Polars::Series.new(
1751
- "describe",
1752
- ["count", "null_count", "mean", "std", "min", "max", "median"],
1949
+ # always print at most this number of values (mainly ensures that
1950
+ # we do not cast long arrays to strings, which would be slow)
1951
+ max_n_values = [max_items_per_column, height].min
1952
+ schema = self.schema
1953
+
1954
+ _column_to_row_output = lambda do |col_name, dtype|
1955
+ fn = schema[col_name] == String ? :inspect : :to_s
1956
+ values = self[0...max_n_values, col_name].to_a
1957
+ if col_name.length > max_colname_length
1958
+ col_name = col_name[0...(max_colname_length - 1)] + "…"
1959
+ end
1960
+ dtype_str = Plr.dtype_str_repr(dtype)
1961
+ if !return_frame
1962
+ dtype_str = "<#{dtype_str}>"
1963
+ end
1964
+ [col_name, dtype_str, values.map { |v| !v.nil? ? v.send(fn) : nil }]
1965
+ end
1966
+
1967
+ data = self.schema.map { |s, dtype| _column_to_row_output.(s, dtype) }
1968
+
1969
+ # output one row per column
1970
+ if return_frame
1971
+ DataFrame.new(
1972
+ data,
1973
+ orient: "row",
1974
+ schema: {"column" => String, "dtype" => String, "values" => List.new(String)}
1753
1975
  )
1976
+ else
1977
+ raise Todo
1978
+ end
1979
+ end
1980
+
1981
+ # Summary statistics for a DataFrame.
1982
+ #
1983
+ # @param percentiles [Array]
1984
+ # One or more percentiles to include in the summary statistics.
1985
+ # All values must be in the range `[0, 1]`.
1986
+ # @param interpolation ['nearest', 'higher', 'lower', 'midpoint', 'linear', 'equiprobable']
1987
+ # Interpolation method used when calculating percentiles.
1988
+ #
1989
+ # @return [DataFrame]
1990
+ #
1991
+ # @example Show default frame statistics:
1992
+ # df = Polars::DataFrame.new(
1993
+ # {
1994
+ # "float" => [1.0, 2.8, 3.0],
1995
+ # "int" => [40, 50, nil],
1996
+ # "bool" => [true, false, true],
1997
+ # "str" => ["zz", "xx", "yy"],
1998
+ # "date" => [Date.new(2020, 1, 1), Date.new(2021, 7, 5), Date.new(2022, 12, 31)]
1999
+ # }
2000
+ # )
2001
+ # df.describe
2002
+ # # =>
2003
+ # # shape: (9, 6)
2004
+ # # ┌────────────┬──────────┬──────────┬──────────┬──────┬─────────────────────────┐
2005
+ # # │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date │
2006
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2007
+ # # │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str │
2008
+ # # ╞════════════╪══════════╪══════════╪══════════╪══════╪═════════════════════════╡
2009
+ # # │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 │
2010
+ # # │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 │
2011
+ # # │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 16:00:00 UTC │
2012
+ # # │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null │
2013
+ # # │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 │
2014
+ # # │ 25% ┆ 2.8 ┆ 40.0 ┆ null ┆ null ┆ 2021-07-05 │
2015
+ # # │ 50% ┆ 2.8 ┆ 50.0 ┆ null ┆ null ┆ 2021-07-05 │
2016
+ # # │ 75% ┆ 3.0 ┆ 50.0 ┆ null ┆ null ┆ 2022-12-31 │
2017
+ # # │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 │
2018
+ # # └────────────┴──────────┴──────────┴──────────┴──────┴─────────────────────────┘
2019
+ #
2020
+ # @example Customize which percentiles are displayed, applying linear interpolation:
2021
+ # df.describe(
2022
+ # percentiles: [0.1, 0.3, 0.5, 0.7, 0.9],
2023
+ # interpolation: "linear"
2024
+ # )
2025
+ # # =>
2026
+ # # shape: (11, 6)
2027
+ # # ┌────────────┬──────────┬──────────┬──────────┬──────┬─────────────────────────┐
2028
+ # # │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date │
2029
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2030
+ # # │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str │
2031
+ # # ╞════════════╪══════════╪══════════╪══════════╪══════╪═════════════════════════╡
2032
+ # # │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 │
2033
+ # # │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 │
2034
+ # # │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 16:00:00 UTC │
2035
+ # # │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null │
2036
+ # # │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 │
2037
+ # # │ … ┆ … ┆ … ┆ … ┆ … ┆ … │
2038
+ # # │ 30% ┆ 2.08 ┆ 43.0 ┆ null ┆ null ┆ 2020-11-26 │
2039
+ # # │ 50% ┆ 2.8 ┆ 45.0 ┆ null ┆ null ┆ 2021-07-05 │
2040
+ # # │ 70% ┆ 2.88 ┆ 47.0 ┆ null ┆ null ┆ 2022-02-07 │
2041
+ # # │ 90% ┆ 2.96 ┆ 49.0 ┆ null ┆ null ┆ 2022-09-13 │
2042
+ # # │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 │
2043
+ # # └────────────┴──────────┴──────────┴──────────┴──────┴─────────────────────────┘
2044
+ def describe(
2045
+ percentiles: [0.25, 0.5, 0.75],
2046
+ interpolation: "nearest"
2047
+ )
2048
+ if columns.empty?
2049
+ msg = "cannot describe a DataFrame that has no columns"
2050
+ raise TypeError, msg
2051
+ end
2052
+
2053
+ lazy.describe(
2054
+ percentiles: percentiles, interpolation: interpolation
1754
2055
  )
1755
- summary
1756
2056
  end
1757
2057
 
1758
2058
  # Find the index of a column by name.
@@ -1771,13 +2071,12 @@ module Polars
1771
2071
  def get_column_index(name)
1772
2072
  _df.get_column_index(name)
1773
2073
  end
1774
- alias_method :find_idx_by_name, :get_column_index
1775
2074
 
1776
2075
  # Replace a column at an index location.
1777
2076
  #
1778
2077
  # @param index [Integer]
1779
2078
  # Column index.
1780
- # @param series [Series]
2079
+ # @param column [Series]
1781
2080
  # Series that will replace the column.
1782
2081
  #
1783
2082
  # @return [DataFrame]
@@ -1803,23 +2102,31 @@ module Polars
1803
2102
  # # │ 20 ┆ 7 ┆ b │
1804
2103
  # # │ 30 ┆ 8 ┆ c │
1805
2104
  # # └───────┴─────┴─────┘
1806
- def replace_column(index, series)
2105
+ def replace_column(index, column)
1807
2106
  if index < 0
1808
- index = columns.length + index
2107
+ index = width + index
1809
2108
  end
1810
- _df.replace_column(index, series._s)
2109
+ _df.replace_column(index, column._s)
1811
2110
  self
1812
2111
  end
1813
- alias_method :replace_at_idx, :replace_column
1814
2112
 
1815
- # Sort the DataFrame by column.
2113
+ # Sort the dataframe by the given columns.
1816
2114
  #
1817
- # @param by [String]
1818
- # By which column to sort.
1819
- # @param reverse [Boolean]
1820
- # Reverse/descending sort.
2115
+ # @param by [Object]
2116
+ # Column(s) to sort by. Accepts expression input, including selectors. Strings
2117
+ # are parsed as column names.
2118
+ # @param more_by [Array]
2119
+ # Additional columns to sort by, specified as positional arguments.
2120
+ # @param descending [Boolean]
2121
+ # Sort in descending order. When sorting by multiple columns, can be specified
2122
+ # per column by passing an array of booleans.
1821
2123
  # @param nulls_last [Boolean]
1822
- # Place null values last. Can only be used if sorted by a single column.
2124
+ # Place null values last; can specify a single boolean applying to all columns
2125
+ # or an array of booleans for per-column control.
2126
+ # @param multithreaded [Boolean]
2127
+ # Sort using multiple threads.
2128
+ # @param maintain_order [Boolean]
2129
+ # Whether the order should be maintained if elements are equal.
1823
2130
  #
1824
2131
  # @return [DataFrame]
1825
2132
  #
@@ -1831,7 +2138,7 @@ module Polars
1831
2138
  # "ham" => ["a", "b", "c"]
1832
2139
  # }
1833
2140
  # )
1834
- # df.sort("foo", reverse: true)
2141
+ # df.sort("foo", descending: true)
1835
2142
  # # =>
1836
2143
  # # shape: (3, 3)
1837
2144
  # # ┌─────┬─────┬─────┐
@@ -1847,7 +2154,7 @@ module Polars
1847
2154
  # @example Sort by multiple columns.
1848
2155
  # df.sort(
1849
2156
  # [Polars.col("foo"), Polars.col("bar")**2],
1850
- # reverse: [true, false]
2157
+ # descending: [true, false]
1851
2158
  # )
1852
2159
  # # =>
1853
2160
  # # shape: (3, 3)
@@ -1860,24 +2167,38 @@ module Polars
1860
2167
  # # │ 2 ┆ 7.0 ┆ b │
1861
2168
  # # │ 1 ┆ 6.0 ┆ a │
1862
2169
  # # └─────┴─────┴─────┘
1863
- def sort(by, reverse: false, nulls_last: false)
2170
+ def sort(
2171
+ by,
2172
+ *more_by,
2173
+ descending: false,
2174
+ nulls_last: false,
2175
+ multithreaded: true,
2176
+ maintain_order: false
2177
+ )
1864
2178
  lazy
1865
- .sort(by, reverse: reverse, nulls_last: nulls_last)
1866
- .collect(no_optimization: true)
2179
+ .sort(
2180
+ by,
2181
+ *more_by,
2182
+ descending: descending,
2183
+ nulls_last: nulls_last,
2184
+ multithreaded: multithreaded,
2185
+ maintain_order: maintain_order
2186
+ )
2187
+ .collect(optimizations: QueryOptFlags._eager)
1867
2188
  end
1868
2189
 
1869
2190
  # Sort the DataFrame by column in-place.
1870
2191
  #
1871
2192
  # @param by [String]
1872
2193
  # By which column to sort.
1873
- # @param reverse [Boolean]
2194
+ # @param descending [Boolean]
1874
2195
  # Reverse/descending sort.
1875
2196
  # @param nulls_last [Boolean]
1876
2197
  # Place null values last. Can only be used if sorted by a single column.
1877
2198
  #
1878
2199
  # @return [DataFrame]
1879
- def sort!(by, reverse: false, nulls_last: false)
1880
- self._df = sort(by, reverse: reverse, nulls_last: nulls_last)._df
2200
+ def sort!(by, descending: false, nulls_last: false)
2201
+ self._df = sort(by, descending: descending, nulls_last: nulls_last)._df
1881
2202
  end
1882
2203
 
1883
2204
  # Execute a SQL query against the DataFrame.
@@ -1949,7 +2270,7 @@ module Polars
1949
2270
  # # │ 3 ┆ false ┆ xx:xx ┆ 2077 ┆ 0.0 │
1950
2271
  # # └─────┴───────────┴───────┴──────┴──────┘
1951
2272
  def sql(query, table_name: "self")
1952
- ctx = SQLContext.new(eager_execution: true)
2273
+ ctx = SQLContext.new(eager: true)
1953
2274
  name = table_name || "self"
1954
2275
  ctx.register(name, self)
1955
2276
  ctx.execute(query)
@@ -1969,7 +2290,7 @@ module Polars
1969
2290
  # Accepts expression input. Strings are parsed as column names.
1970
2291
  # @param reverse [Object]
1971
2292
  # Consider the `k` smallest elements of the `by` column(s) (instead of the `k`
1972
- # largest). This can be specified per column by passing a sequence of
2293
+ # largest). This can be specified per column by passing an array of
1973
2294
  # booleans.
1974
2295
  #
1975
2296
  # @return [DataFrame]
@@ -2017,12 +2338,12 @@ module Polars
2017
2338
  lazy
2018
2339
  .top_k(k, by: by, reverse: reverse)
2019
2340
  .collect(
2020
- # optimizations=QueryOptFlags(
2021
- # projection_pushdown=False,
2022
- # predicate_pushdown=False,
2023
- # comm_subplan_elim=False,
2024
- # slice_pushdown=True
2025
- # )
2341
+ optimizations: QueryOptFlags.new(
2342
+ projection_pushdown: false,
2343
+ predicate_pushdown: false,
2344
+ comm_subplan_elim: false,
2345
+ slice_pushdown: true
2346
+ )
2026
2347
  )
2027
2348
  end
2028
2349
 
@@ -2040,7 +2361,7 @@ module Polars
2040
2361
  # Accepts expression input. Strings are parsed as column names.
2041
2362
  # @param reverse [Object]
2042
2363
  # Consider the `k` largest elements of the `by` column(s) (instead of the `k`
2043
- # smallest). This can be specified per column by passing a sequence of
2364
+ # smallest). This can be specified per column by passing an array of
2044
2365
  # booleans.
2045
2366
  #
2046
2367
  # @return [DataFrame]
@@ -2088,12 +2409,12 @@ module Polars
2088
2409
  lazy
2089
2410
  .bottom_k(k, by: by, reverse: reverse)
2090
2411
  .collect(
2091
- # optimizations=QueryOptFlags(
2092
- # projection_pushdown=False,
2093
- # predicate_pushdown=False,
2094
- # comm_subplan_elim=False,
2095
- # slice_pushdown=True,
2096
- # )
2412
+ optimizations: QueryOptFlags.new(
2413
+ projection_pushdown: false,
2414
+ predicate_pushdown: false,
2415
+ comm_subplan_elim: false,
2416
+ slice_pushdown: true
2417
+ )
2097
2418
  )
2098
2419
  end
2099
2420
 
@@ -2128,36 +2449,6 @@ module Polars
2128
2449
  def equals(other, null_equal: true)
2129
2450
  _df.equals(other._df, null_equal)
2130
2451
  end
2131
- alias_method :frame_equal, :equals
2132
-
2133
- # Replace a column by a new Series.
2134
- #
2135
- # @param column [String]
2136
- # Column to replace.
2137
- # @param new_col [Series]
2138
- # New column to insert.
2139
- #
2140
- # @return [DataFrame]
2141
- #
2142
- # @example
2143
- # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
2144
- # s = Polars::Series.new([10, 20, 30])
2145
- # df.replace("foo", s)
2146
- # # =>
2147
- # # shape: (3, 2)
2148
- # # ┌─────┬─────┐
2149
- # # │ foo ┆ bar │
2150
- # # │ --- ┆ --- │
2151
- # # │ i64 ┆ i64 │
2152
- # # ╞═════╪═════╡
2153
- # # │ 10 ┆ 4 │
2154
- # # │ 20 ┆ 5 │
2155
- # # │ 30 ┆ 6 │
2156
- # # └─────┴─────┘
2157
- def replace(column, new_col)
2158
- _df.replace(column.to_s, new_col._s)
2159
- self
2160
- end
2161
2452
 
2162
2453
  # Get a slice of this DataFrame.
2163
2454
  #
@@ -2330,7 +2621,7 @@ module Polars
2330
2621
  # # │ 80.0 ┆ 25.5 ┆ null │
2331
2622
  # # └──────┴───────┴──────┘
2332
2623
  def drop_nans(subset: nil)
2333
- lazy.drop_nans(subset: subset).collect(_eager: true)
2624
+ lazy.drop_nans(subset: subset).collect(optimizations: QueryOptFlags._eager)
2334
2625
  end
2335
2626
 
2336
2627
  # Drop all rows that contain one or more null values.
@@ -2375,12 +2666,12 @@ module Polars
2375
2666
  # # │ 3 ┆ 8 ┆ null │
2376
2667
  # # └─────┴─────┴──────┘
2377
2668
  def drop_nulls(subset: nil)
2378
- lazy.drop_nulls(subset: subset).collect(_eager: true)
2669
+ lazy.drop_nulls(subset: subset).collect(optimizations: QueryOptFlags._eager)
2379
2670
  end
2380
2671
 
2381
2672
  # Offers a structured way to apply a sequence of user-defined functions (UDFs).
2382
2673
  #
2383
- # @param func [Object]
2674
+ # @param function [Object]
2384
2675
  # Callable; will receive the frame as the first parameter,
2385
2676
  # followed by any given args/kwargs.
2386
2677
  # @param args [Object]
@@ -2397,7 +2688,7 @@ module Polars
2397
2688
  #
2398
2689
  # @example
2399
2690
  # cast_str_to_int = lambda do |data, col_name:|
2400
- # data.with_column(Polars.col(col_name).cast(:i64))
2691
+ # data.with_columns(Polars.col(col_name).cast(Polars::Int64))
2401
2692
  # end
2402
2693
  #
2403
2694
  # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]})
@@ -2414,8 +2705,8 @@ module Polars
2414
2705
  # # │ 3 ┆ 30 │
2415
2706
  # # │ 4 ┆ 40 │
2416
2707
  # # └─────┴─────┘
2417
- def pipe(func, *args, **kwargs, &block)
2418
- func.call(self, *args, **kwargs, &block)
2708
+ def pipe(function, *args, **kwargs, &block)
2709
+ function.(self, *args, **kwargs, &block)
2419
2710
  end
2420
2711
 
2421
2712
  # Add a column at index 0 that counts the rows.
@@ -2449,7 +2740,6 @@ module Polars
2449
2740
  def with_row_index(name: "index", offset: 0)
2450
2741
  _from_rbdf(_df.with_row_index(name, offset))
2451
2742
  end
2452
- alias_method :with_row_count, :with_row_index
2453
2743
 
2454
2744
  # Start a group by operation.
2455
2745
  #
@@ -2459,6 +2749,9 @@ module Polars
2459
2749
  # Make sure that the order of the groups remain consistent. This is more
2460
2750
  # expensive than a default group by. Note that this only works in expression
2461
2751
  # aggregations.
2752
+ # @param named_by [Hash]
2753
+ # Additional columns to group by, specified as keyword arguments.
2754
+ # The columns will be renamed to the keyword used.
2462
2755
  #
2463
2756
  # @return [GroupBy]
2464
2757
  #
@@ -2482,23 +2775,23 @@ module Polars
2482
2775
  # # │ b ┆ 11 │
2483
2776
  # # │ c ┆ 6 │
2484
2777
  # # └─────┴─────┘
2485
- def group_by(by, maintain_order: false)
2486
- if !Utils.bool?(maintain_order)
2487
- raise TypeError, "invalid input for group_by arg `maintain_order`: #{maintain_order}."
2778
+ def group_by(by, maintain_order: false, **named_by)
2779
+ named_by.each do |_, value|
2780
+ if !(value.is_a?(::String) || value.is_a?(Expr) || value.is_a?(Series))
2781
+ msg = "Expected Polars expression or object convertible to one, got #{value.class.name}."
2782
+ raise TypeError, msg
2783
+ end
2488
2784
  end
2489
2785
  GroupBy.new(
2490
2786
  self,
2491
2787
  by,
2788
+ **named_by,
2492
2789
  maintain_order: maintain_order
2493
2790
  )
2494
2791
  end
2495
- alias_method :groupby, :group_by
2496
- alias_method :group, :group_by
2497
2792
 
2498
2793
  # Create rolling groups based on a time column.
2499
2794
  #
2500
- # Also works for index values of type `:i32` or `:i64`.
2501
- #
2502
2795
  # Different from a `dynamic_group_by` the windows are now determined by the
2503
2796
  # individual values and are not of constant intervals. For constant intervals use
2504
2797
  # *group_by_dynamic*
@@ -2532,16 +2825,16 @@ module Polars
2532
2825
  # This column must be sorted in ascending order. If not the output will not
2533
2826
  # make sense.
2534
2827
  #
2535
- # In case of a rolling group by on indices, dtype needs to be one of
2536
- # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
2537
- # performance matters use an `:i64` column.
2828
+ # In case of a rolling operation on indices, dtype needs to be one of
2829
+ # \\\\{UInt32, UInt64, Int32, Int64}. Note that the first three get temporarily
2830
+ # cast to Int64, so if performance matters use an Int64 column.
2538
2831
  # @param period [Object]
2539
2832
  # Length of the window.
2540
2833
  # @param offset [Object]
2541
2834
  # Offset of the window. Default is -period.
2542
2835
  # @param closed ["right", "left", "both", "none"]
2543
2836
  # Define whether the temporal window interval is closed or not.
2544
- # @param by [Object]
2837
+ # @param group_by [Object]
2545
2838
  # Also group by this column/these columns.
2546
2839
  #
2547
2840
  # @return [RollingGroupBy]
@@ -2555,7 +2848,7 @@ module Polars
2555
2848
  # "2020-01-03 19:45:32",
2556
2849
  # "2020-01-08 23:16:43"
2557
2850
  # ]
2558
- # df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
2851
+ # df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_columns(
2559
2852
  # Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
2560
2853
  # )
2561
2854
  # df.rolling(index_column: "dt", period: "2d").agg(
@@ -2584,14 +2877,12 @@ module Polars
2584
2877
  period:,
2585
2878
  offset: nil,
2586
2879
  closed: "right",
2587
- by: nil
2880
+ group_by: nil
2588
2881
  )
2589
- RollingGroupBy.new(self, index_column, period, offset, closed, by)
2882
+ RollingGroupBy.new(self, index_column, period, offset, closed, group_by)
2590
2883
  end
2591
- alias_method :groupby_rolling, :rolling
2592
- alias_method :group_by_rolling, :rolling
2593
2884
 
2594
- # Group based on a time value (or index value of type `:i32`, `:i64`).
2885
+ # Group based on a time value (or index value of type Int32, Int64).
2595
2886
  #
2596
2887
  # Time windows are calculated and rows are assigned to windows. Different from a
2597
2888
  # normal group by is that a row can be member of multiple groups. The time/index
@@ -2634,8 +2925,8 @@ module Polars
2634
2925
  # make sense.
2635
2926
  #
2636
2927
  # In case of a dynamic group by on indices, dtype needs to be one of
2637
- # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
2638
- # performance matters use an `:i64` column.
2928
+ # \\\\{Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if
2929
+ # performance matters use an Int64 column.
2639
2930
  # @param every
2640
2931
  # Interval of the window.
2641
2932
  # @param period
@@ -2643,15 +2934,21 @@ module Polars
2643
2934
  # @param offset
2644
2935
  # Offset of the window if nil and period is nil it will be equal to negative
2645
2936
  # `every`.
2646
- # @param truncate
2647
- # Truncate the time value to the window lower bound.
2648
2937
  # @param include_boundaries
2649
2938
  # Add the lower and upper bound of the window to the "_lower_bound" and
2650
2939
  # "_upper_bound" columns. This will impact performance because it's harder to
2651
2940
  # parallelize
2652
2941
  # @param closed ["right", "left", "both", "none"]
2653
2942
  # Define whether the temporal window interval is closed or not.
2654
- # @param by
2943
+ # @param label ['left', 'right', 'datapoint']
2944
+ # Define which label to use for the window:
2945
+ #
2946
+ # - 'left': lower boundary of the window
2947
+ # - 'right': upper boundary of the window
2948
+ # - 'datapoint': the first value of the index column in the given window.
2949
+ # If you don't need the label to be at one of the boundaries, choose this
2950
+ # option for maximum performance
2951
+ # @param group_by
2655
2952
  # Also group by this column/these columns
2656
2953
  # @param start_by ['window', 'datapoint', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
2657
2954
  # The strategy to determine the start of the first window by.
@@ -2793,7 +3090,7 @@ module Polars
2793
3090
  # "time",
2794
3091
  # every: "1h",
2795
3092
  # closed: "both",
2796
- # by: "groups",
3093
+ # group_by: "groups",
2797
3094
  # include_boundaries: true
2798
3095
  # ).agg([Polars.col("time").count.alias("time_count")])
2799
3096
  # # =>
@@ -2843,10 +3140,10 @@ module Polars
2843
3140
  every:,
2844
3141
  period: nil,
2845
3142
  offset: nil,
2846
- truncate: true,
2847
3143
  include_boundaries: false,
2848
3144
  closed: "left",
2849
- by: nil,
3145
+ label: "left",
3146
+ group_by: nil,
2850
3147
  start_by: "window"
2851
3148
  )
2852
3149
  DynamicGroupBy.new(
@@ -2855,14 +3152,13 @@ module Polars
2855
3152
  every,
2856
3153
  period,
2857
3154
  offset,
2858
- truncate,
2859
3155
  include_boundaries,
2860
3156
  closed,
2861
- by,
3157
+ label,
3158
+ group_by,
2862
3159
  start_by
2863
3160
  )
2864
3161
  end
2865
- alias_method :groupby_dynamic, :group_by_dynamic
2866
3162
 
2867
3163
  # Upsample a DataFrame at a regular frequency.
2868
3164
  #
@@ -2871,7 +3167,7 @@ module Polars
2871
3167
  # Note that this column has to be sorted for the output to make sense.
2872
3168
  # @param every [String]
2873
3169
  # interval will start 'every' duration
2874
- # @param by [Object]
3170
+ # @param group_by [Object]
2875
3171
  # First group by these columns and then upsample for every group
2876
3172
  # @param maintain_order [Boolean]
2877
3173
  # Keep the ordering predictable. This is slower.
@@ -2910,7 +3206,7 @@ module Polars
2910
3206
  # }
2911
3207
  # ).set_sorted("time")
2912
3208
  # df.upsample(
2913
- # time_column: "time", every: "1mo", by: "groups", maintain_order: true
3209
+ # time_column: "time", every: "1mo", group_by: "groups", maintain_order: true
2914
3210
  # ).select(Polars.all.forward_fill)
2915
3211
  # # =>
2916
3212
  # # shape: (7, 3)
@@ -2930,20 +3226,20 @@ module Polars
2930
3226
  def upsample(
2931
3227
  time_column:,
2932
3228
  every:,
2933
- by: nil,
3229
+ group_by: nil,
2934
3230
  maintain_order: false
2935
3231
  )
2936
- if by.nil?
2937
- by = []
3232
+ if group_by.nil?
3233
+ group_by = []
2938
3234
  end
2939
- if by.is_a?(::String)
2940
- by = [by]
3235
+ if group_by.is_a?(::String)
3236
+ group_by = [group_by]
2941
3237
  end
2942
3238
 
2943
3239
  every = Utils.parse_as_duration_string(every)
2944
3240
 
2945
3241
  _from_rbdf(
2946
- _df.upsample(by, time_column, every, maintain_order)
3242
+ _df.upsample(group_by, time_column, every, maintain_order)
2947
3243
  )
2948
3244
  end
2949
3245
 
@@ -3096,7 +3392,7 @@ module Polars
3096
3392
  allow_exact_matches: allow_exact_matches,
3097
3393
  check_sortedness: check_sortedness
3098
3394
  )
3099
- .collect(no_optimization: true)
3395
+ .collect(optimizations: QueryOptFlags._eager)
3100
3396
  end
3101
3397
 
3102
3398
  # Join in SQL-like fashion.
@@ -3119,7 +3415,7 @@ module Polars
3119
3415
  # * *one_to_one* - “1:1”: check if join keys are unique in both left and right datasets
3120
3416
  # * *one_to_many* - “1:m”: check if join keys are unique in left dataset
3121
3417
  # * *many_to_one* - “m:1”: check if join keys are unique in right dataset
3122
- # @param join_nulls [Boolean]
3418
+ # @param nulls_equal [Boolean]
3123
3419
  # Join on null values. By default null values will never produce matches.
3124
3420
  # @param coalesce [Boolean]
3125
3421
  # Coalescing behavior (merging of join columns).
@@ -3235,7 +3531,7 @@ module Polars
3235
3531
  how: "inner",
3236
3532
  suffix: "_right",
3237
3533
  validate: "m:m",
3238
- join_nulls: false,
3534
+ nulls_equal: false,
3239
3535
  coalesce: nil,
3240
3536
  maintain_order: nil
3241
3537
  )
@@ -3248,11 +3544,11 @@ module Polars
3248
3544
  how: how,
3249
3545
  suffix: suffix,
3250
3546
  validate: validate,
3251
- join_nulls: join_nulls,
3547
+ nulls_equal: nulls_equal,
3252
3548
  coalesce: coalesce,
3253
3549
  maintain_order: maintain_order
3254
3550
  )
3255
- .collect(no_optimization: true)
3551
+ .collect(optimizations: QueryOptFlags._eager)
3256
3552
  end
3257
3553
 
3258
3554
  # Perform a join based on one or multiple (in)equality predicates.
@@ -3347,7 +3643,7 @@ module Polars
3347
3643
  *predicates,
3348
3644
  suffix: suffix
3349
3645
  )
3350
- .collect(_eager: true)
3646
+ .collect(optimizations: QueryOptFlags._eager)
3351
3647
  end
3352
3648
 
3353
3649
  # Apply a custom/user-defined function (UDF) over the rows of the DataFrame.
@@ -3410,61 +3706,14 @@ module Polars
3410
3706
  # # │ 9 │
3411
3707
  # # │ 14 │
3412
3708
  # # └─────┘
3413
- def map_rows(return_dtype: nil, inference_size: 256, &f)
3414
- out, is_df = _df.map_rows(f, return_dtype, inference_size)
3709
+ def map_rows(return_dtype: nil, inference_size: 256, &function)
3710
+ out, is_df = _df.map_rows(function, return_dtype, inference_size)
3415
3711
  if is_df
3416
3712
  _from_rbdf(out)
3417
3713
  else
3418
3714
  _from_rbdf(Utils.wrap_s(out).to_frame._df)
3419
3715
  end
3420
3716
  end
3421
- alias_method :apply, :map_rows
3422
-
3423
- # Return a new DataFrame with the column added or replaced.
3424
- #
3425
- # @param column [Object]
3426
- # Series, where the name of the Series refers to the column in the DataFrame.
3427
- #
3428
- # @return [DataFrame]
3429
- #
3430
- # @example Added
3431
- # df = Polars::DataFrame.new(
3432
- # {
3433
- # "a" => [1, 3, 5],
3434
- # "b" => [2, 4, 6]
3435
- # }
3436
- # )
3437
- # df.with_column((Polars.col("b") ** 2).alias("b_squared"))
3438
- # # =>
3439
- # # shape: (3, 3)
3440
- # # ┌─────┬─────┬───────────┐
3441
- # # │ a ┆ b ┆ b_squared │
3442
- # # │ --- ┆ --- ┆ --- │
3443
- # # │ i64 ┆ i64 ┆ i64 │
3444
- # # ╞═════╪═════╪═══════════╡
3445
- # # │ 1 ┆ 2 ┆ 4 │
3446
- # # │ 3 ┆ 4 ┆ 16 │
3447
- # # │ 5 ┆ 6 ┆ 36 │
3448
- # # └─────┴─────┴───────────┘
3449
- #
3450
- # @example Replaced
3451
- # df.with_column(Polars.col("a") ** 2)
3452
- # # =>
3453
- # # shape: (3, 2)
3454
- # # ┌─────┬─────┐
3455
- # # │ a ┆ b │
3456
- # # │ --- ┆ --- │
3457
- # # │ i64 ┆ i64 │
3458
- # # ╞═════╪═════╡
3459
- # # │ 1 ┆ 2 │
3460
- # # │ 9 ┆ 4 │
3461
- # # │ 25 ┆ 6 │
3462
- # # └─────┴─────┘
3463
- def with_column(column)
3464
- lazy
3465
- .with_column(column)
3466
- .collect(no_optimization: true, string_cache: false)
3467
- end
3468
3717
 
3469
3718
  # Return a new DataFrame grown horizontally by stacking multiple Series to it.
3470
3719
  #
@@ -3510,7 +3759,7 @@ module Polars
3510
3759
 
3511
3760
  # Grow this DataFrame vertically by stacking a DataFrame to it.
3512
3761
  #
3513
- # @param df [DataFrame]
3762
+ # @param other [DataFrame]
3514
3763
  # DataFrame to stack.
3515
3764
  # @param in_place [Boolean]
3516
3765
  # Modify in place
@@ -3545,12 +3794,12 @@ module Polars
3545
3794
  # # │ 3 ┆ 8 ┆ c │
3546
3795
  # # │ 4 ┆ 9 ┆ d │
3547
3796
  # # └─────┴─────┴─────┘
3548
- def vstack(df, in_place: false)
3797
+ def vstack(other, in_place: false)
3549
3798
  if in_place
3550
- _df.vstack_mut(df._df)
3799
+ _df.vstack_mut(other._df)
3551
3800
  self
3552
3801
  else
3553
- _from_rbdf(_df.vstack(df._df))
3802
+ _from_rbdf(_df.vstack(other._df))
3554
3803
  end
3555
3804
  end
3556
3805
 
@@ -3603,6 +3852,9 @@ module Polars
3603
3852
  #
3604
3853
  # @param columns [Object]
3605
3854
  # Column(s) to drop.
3855
+ # @param strict [Boolean]
3856
+ # Validate that all column names exist in the current schema,
3857
+ # and throw an exception if any do not.
3606
3858
  #
3607
3859
  # @return [DataFrame]
3608
3860
  #
@@ -3654,8 +3906,8 @@ module Polars
3654
3906
  # # │ 7.0 │
3655
3907
  # # │ 8.0 │
3656
3908
  # # └─────┘
3657
- def drop(*columns)
3658
- lazy.drop(*columns).collect(_eager: true)
3909
+ def drop(*columns, strict: true)
3910
+ lazy.drop(*columns, strict: strict).collect(optimizations: QueryOptFlags._eager)
3659
3911
  end
3660
3912
 
3661
3913
  # Drop in place.
@@ -3768,7 +4020,7 @@ module Polars
3768
4020
  # df.cast(Polars::String).to_h(as_series: false)
3769
4021
  # # => {"foo"=>["1", "2", "3"], "bar"=>["6.0", "7.0", "8.0"], "ham"=>["2020-01-02", "2021-03-04", "2022-05-06"]}
3770
4022
  def cast(dtypes, strict: true)
3771
- lazy.cast(dtypes, strict: strict).collect(_eager: true)
4023
+ lazy.cast(dtypes, strict: strict).collect(optimizations: QueryOptFlags._eager)
3772
4024
  end
3773
4025
 
3774
4026
  # Create an empty copy of the current DataFrame.
@@ -3818,7 +4070,6 @@ module Polars
3818
4070
  clone
3819
4071
  end
3820
4072
  end
3821
- alias_method :cleared, :clear
3822
4073
 
3823
4074
  # clone handled by initialize_copy
3824
4075
 
@@ -3880,10 +4131,13 @@ module Polars
3880
4131
  _df.get_columns.map { |s| Utils.wrap_s(s) }
3881
4132
  end
3882
4133
 
3883
- # Get a single column as Series by name.
4134
+ # Get a single column by name.
3884
4135
  #
3885
4136
  # @param name [String]
3886
4137
  # Name of the column to retrieve.
4138
+ # @param default [Object]
4139
+ # Value to return if the column does not exist; if not explicitly set and
4140
+ # the column is not present a `ColumnNotFoundError` exception is raised.
3887
4141
  #
3888
4142
  # @return [Series]
3889
4143
  #
@@ -3898,8 +4152,22 @@ module Polars
3898
4152
  # # 2
3899
4153
  # # 3
3900
4154
  # # ]
3901
- def get_column(name)
3902
- self[name]
4155
+ #
4156
+ # @example
4157
+ # df.get_column("baz", default: Polars::Series.new("baz", ["?", "?", "?"]))
4158
+ # # =>
4159
+ # # shape: (3,)
4160
+ # # Series: 'baz' [str]
4161
+ # # [
4162
+ # # "?"
4163
+ # # "?"
4164
+ # # "?"
4165
+ # # ]
4166
+ def get_column(name, default: NO_DEFAULT)
4167
+ Utils.wrap_s(_df.get_column(name.to_s))
4168
+ rescue ColumnNotFoundError
4169
+ raise if default.eql?(NO_DEFAULT)
4170
+ default
3903
4171
  end
3904
4172
 
3905
4173
  # Fill null values using the specified value or strategy.
@@ -3985,14 +4253,14 @@ module Polars
3985
4253
  _from_rbdf(
3986
4254
  lazy
3987
4255
  .fill_null(value, strategy: strategy, limit: limit, matches_supertype: matches_supertype)
3988
- .collect(no_optimization: true)
4256
+ .collect(optimizations: QueryOptFlags._eager)
3989
4257
  ._df
3990
4258
  )
3991
4259
  end
3992
4260
 
3993
4261
  # Fill floating point NaN values by an Expression evaluation.
3994
4262
  #
3995
- # @param fill_value [Object]
4263
+ # @param value [Object]
3996
4264
  # Value to fill NaN with.
3997
4265
  #
3998
4266
  # @return [DataFrame]
@@ -4021,14 +4289,16 @@ module Polars
4021
4289
  # # │ 99.0 ┆ 99.0 │
4022
4290
  # # │ 4.0 ┆ 13.0 │
4023
4291
  # # └──────┴──────┘
4024
- def fill_nan(fill_value)
4025
- lazy.fill_nan(fill_value).collect(no_optimization: true)
4292
+ def fill_nan(value)
4293
+ lazy.fill_nan(value).collect(optimizations: QueryOptFlags._eager)
4026
4294
  end
4027
4295
 
4028
4296
  # Explode `DataFrame` to long format by exploding a column with Lists.
4029
4297
  #
4030
4298
  # @param columns [Object]
4031
4299
  # Column of LargeList type.
4300
+ # @param more_columns [Array]
4301
+ # Additional names of columns to explode, specified as positional arguments.
4032
4302
  #
4033
4303
  # @return [DataFrame]
4034
4304
  #
@@ -4056,8 +4326,8 @@ module Polars
4056
4326
  # # │ c ┆ 7 │
4057
4327
  # # │ c ┆ 8 │
4058
4328
  # # └─────────┴─────────┘
4059
- def explode(columns)
4060
- lazy.explode(columns).collect(no_optimization: true)
4329
+ def explode(columns, *more_columns)
4330
+ lazy.explode(columns, *more_columns).collect(optimizations: QueryOptFlags._eager)
4061
4331
  end
4062
4332
 
4063
4333
  # Create a spreadsheet-style pivot table as a DataFrame.
@@ -4202,13 +4472,12 @@ module Polars
4202
4472
  # # │ y ┆ c ┆ 4 │
4203
4473
  # # │ z ┆ c ┆ 6 │
4204
4474
  # # └─────┴──────────┴───────┘
4205
- def unpivot(on, index: nil, variable_name: nil, value_name: nil)
4475
+ def unpivot(on = nil, index: nil, variable_name: nil, value_name: nil)
4206
4476
  on = on.nil? ? [] : Utils._expand_selectors(self, on)
4207
4477
  index = index.nil? ? [] : Utils._expand_selectors(self, index)
4208
4478
 
4209
4479
  _from_rbdf(_df.unpivot(on, index, value_name, variable_name))
4210
4480
  end
4211
- alias_method :melt, :unpivot
4212
4481
 
4213
4482
  # Unstack a long table to a wide form without doing an aggregation.
4214
4483
  #
@@ -4313,7 +4582,7 @@ module Polars
4313
4582
 
4314
4583
  if how == "horizontal"
4315
4584
  df = (
4316
- df.with_column(
4585
+ df.with_columns(
4317
4586
  (Polars.arange(0, n_cols * n_rows, eager: true) % n_cols).alias(
4318
4587
  "__sort_order"
4319
4588
  )
@@ -4336,8 +4605,10 @@ module Polars
4336
4605
 
4337
4606
  # Split into multiple DataFrames partitioned by groups.
4338
4607
  #
4339
- # @param groups [Object]
4608
+ # @param by [Object]
4340
4609
  # Groups to partition by.
4610
+ # @param more_by [Array]
4611
+ # Additional names of columns to group by, specified as positional arguments.
4341
4612
  # @param maintain_order [Boolean]
4342
4613
  # Keep predictable output order. This is slower as it requires an extra sort
4343
4614
  # operation.
@@ -4387,7 +4658,7 @@ module Polars
4387
4658
  # @example
4388
4659
  # df.partition_by("foo", maintain_order: true, as_dict: true)
4389
4660
  # # =>
4390
- # # {"A"=>shape: (2, 3)
4661
+ # # {["A"]=>shape: (2, 3)
4391
4662
  # # ┌─────┬─────┬─────┐
4392
4663
  # # │ foo ┆ N ┆ bar │
4393
4664
  # # │ --- ┆ --- ┆ --- │
@@ -4395,7 +4666,7 @@ module Polars
4395
4666
  # # ╞═════╪═════╪═════╡
4396
4667
  # # │ A ┆ 1 ┆ k │
4397
4668
  # # │ A ┆ 2 ┆ l │
4398
- # # └─────┴─────┴─────┘, "B"=>shape: (2, 3)
4669
+ # # └─────┴─────┴─────┘, ["B"]=>shape: (2, 3)
4399
4670
  # # ┌─────┬─────┬─────┐
4400
4671
  # # │ foo ┆ N ┆ bar │
4401
4672
  # # │ --- ┆ --- ┆ --- │
@@ -4403,7 +4674,7 @@ module Polars
4403
4674
  # # ╞═════╪═════╪═════╡
4404
4675
  # # │ B ┆ 2 ┆ m │
4405
4676
  # # │ B ┆ 4 ┆ m │
4406
- # # └─────┴─────┴─────┘, "C"=>shape: (1, 3)
4677
+ # # └─────┴─────┴─────┘, ["C"]=>shape: (1, 3)
4407
4678
  # # ┌─────┬─────┬─────┐
4408
4679
  # # │ foo ┆ N ┆ bar │
4409
4680
  # # │ --- ┆ --- ┆ --- │
@@ -4411,30 +4682,26 @@ module Polars
4411
4682
  # # ╞═════╪═════╪═════╡
4412
4683
  # # │ C ┆ 2 ┆ l │
4413
4684
  # # └─────┴─────┴─────┘}
4414
- def partition_by(groups, maintain_order: true, include_key: true, as_dict: false)
4415
- if groups.is_a?(::String)
4416
- groups = [groups]
4417
- elsif !groups.is_a?(::Array)
4418
- groups = Array(groups)
4419
- end
4685
+ def partition_by(by, *more_by, maintain_order: true, include_key: true, as_dict: false)
4686
+ by_parsed = Utils._expand_selectors(self, by, *more_by)
4687
+
4688
+ partitions = _df.partition_by(by_parsed, maintain_order, include_key).map { |df| _from_rbdf(df) }
4420
4689
 
4421
4690
  if as_dict
4422
- out = {}
4423
- if groups.length == 1
4424
- _df.partition_by(groups, maintain_order, include_key).each do |df|
4425
- df = _from_rbdf(df)
4426
- out[df[groups][0, 0]] = df
4427
- end
4691
+ if include_key
4692
+ names = partitions.map { |p| p.select(by_parsed).row(0) }
4428
4693
  else
4429
- _df.partition_by(groups, maintain_order, include_key).each do |df|
4430
- df = _from_rbdf(df)
4431
- out[df[groups].row(0)] = df
4694
+ if !maintain_order
4695
+ msg = "cannot use `partition_by` with `maintain_order: false, include_key: false, as_dict: true`"
4696
+ raise ArgumentError, msg
4432
4697
  end
4698
+ names = select(by_parsed).unique(maintain_order: true).rows
4433
4699
  end
4434
- out
4435
- else
4436
- _df.partition_by(groups, maintain_order, include_key).map { |df| _from_rbdf(df) }
4700
+
4701
+ return names.zip(partitions).to_h
4437
4702
  end
4703
+
4704
+ partitions
4438
4705
  end
4439
4706
 
4440
4707
  # Shift values by the given period.
@@ -4480,41 +4747,8 @@ module Polars
4480
4747
  # # │ 3 ┆ 8 ┆ c │
4481
4748
  # # │ null ┆ null ┆ null │
4482
4749
  # # └──────┴──────┴──────┘
4483
- def shift(n, fill_value: nil)
4484
- lazy.shift(n, fill_value: fill_value).collect(_eager: true)
4485
- end
4486
-
4487
- # Shift the values by a given period and fill the resulting null values.
4488
- #
4489
- # @param periods [Integer]
4490
- # Number of places to shift (may be negative).
4491
- # @param fill_value [Object]
4492
- # fill nil values with this value.
4493
- #
4494
- # @return [DataFrame]
4495
- #
4496
- # @example
4497
- # df = Polars::DataFrame.new(
4498
- # {
4499
- # "foo" => [1, 2, 3],
4500
- # "bar" => [6, 7, 8],
4501
- # "ham" => ["a", "b", "c"]
4502
- # }
4503
- # )
4504
- # df.shift_and_fill(1, 0)
4505
- # # =>
4506
- # # shape: (3, 3)
4507
- # # ┌─────┬─────┬─────┐
4508
- # # │ foo ┆ bar ┆ ham │
4509
- # # │ --- ┆ --- ┆ --- │
4510
- # # │ i64 ┆ i64 ┆ str │
4511
- # # ╞═════╪═════╪═════╡
4512
- # # │ 0 ┆ 0 ┆ 0 │
4513
- # # │ 1 ┆ 6 ┆ a │
4514
- # # │ 2 ┆ 7 ┆ b │
4515
- # # └─────┴─────┴─────┘
4516
- def shift_and_fill(periods, fill_value)
4517
- shift(periods, fill_value: fill_value)
4750
+ def shift(n = 1, fill_value: nil)
4751
+ lazy.shift(n, fill_value: fill_value).collect(optimizations: QueryOptFlags._eager)
4518
4752
  end
4519
4753
 
4520
4754
  # Get a mask of all duplicated rows in this DataFrame.
@@ -4570,6 +4804,16 @@ module Polars
4570
4804
  # Start a lazy query from this point.
4571
4805
  #
4572
4806
  # @return [LazyFrame]
4807
+ #
4808
+ # @example
4809
+ # df = Polars::DataFrame.new(
4810
+ # {
4811
+ # "a" => [nil, 2, 3, 4],
4812
+ # "b" => [0.5, nil, 2.5, 13],
4813
+ # "c" => [true, true, false, nil]
4814
+ # }
4815
+ # )
4816
+ # df.lazy
4573
4817
  def lazy
4574
4818
  wrap_ldf(_df.lazy)
4575
4819
  end
@@ -4663,7 +4907,7 @@ module Polars
4663
4907
  # # │ 10 │
4664
4908
  # # └─────────┘
4665
4909
  def select(*exprs, **named_exprs)
4666
- lazy.select(*exprs, **named_exprs).collect(_eager: true)
4910
+ lazy.select(*exprs, **named_exprs).collect(optimizations: QueryOptFlags._eager)
4667
4911
  end
4668
4912
 
4669
4913
  # Select columns from this DataFrame.
@@ -4683,7 +4927,7 @@ module Polars
4683
4927
  def select_seq(*exprs, **named_exprs)
4684
4928
  lazy
4685
4929
  .select_seq(*exprs, **named_exprs)
4686
- .collect(_eager: true)
4930
+ .collect(optimizations: QueryOptFlags._eager)
4687
4931
  end
4688
4932
 
4689
4933
  # Add columns to this DataFrame.
@@ -4795,7 +5039,7 @@ module Polars
4795
5039
  # # │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │
4796
5040
  # # └─────┴──────┴───────┴──────┴───────┘
4797
5041
  def with_columns(*exprs, **named_exprs)
4798
- lazy.with_columns(*exprs, **named_exprs).collect(_eager: true)
5042
+ lazy.with_columns(*exprs, **named_exprs).collect(optimizations: QueryOptFlags._eager)
4799
5043
  end
4800
5044
 
4801
5045
  # Add columns to this DataFrame.
@@ -4820,7 +5064,7 @@ module Polars
4820
5064
  )
4821
5065
  lazy
4822
5066
  .with_columns_seq(*exprs, **named_exprs)
4823
- .collect(_eager: true)
5067
+ .collect(optimizations: QueryOptFlags._eager)
4824
5068
  end
4825
5069
 
4826
5070
  # Get number of chunks used by the ChunkedArrays of this DataFrame.
@@ -4876,7 +5120,7 @@ module Polars
4876
5120
  # # │ 3 ┆ 8 ┆ c │
4877
5121
  # # └─────┴─────┴─────┘
4878
5122
  def max
4879
- lazy.max.collect(_eager: true)
5123
+ lazy.max.collect(optimizations: QueryOptFlags._eager)
4880
5124
  end
4881
5125
 
4882
5126
  # Get the maximum value horizontally across columns.
@@ -4926,7 +5170,7 @@ module Polars
4926
5170
  # # │ 1 ┆ 6 ┆ a │
4927
5171
  # # └─────┴─────┴─────┘
4928
5172
  def min
4929
- lazy.min.collect(_eager: true)
5173
+ lazy.min.collect(optimizations: QueryOptFlags._eager)
4930
5174
  end
4931
5175
 
4932
5176
  # Get the minimum value horizontally across columns.
@@ -4976,7 +5220,7 @@ module Polars
4976
5220
  # # │ 6 ┆ 21 ┆ null │
4977
5221
  # # └─────┴─────┴──────┘
4978
5222
  def sum
4979
- lazy.sum.collect(_eager: true)
5223
+ lazy.sum.collect(optimizations: QueryOptFlags._eager)
4980
5224
  end
4981
5225
 
4982
5226
  # Sum all values horizontally across columns.
@@ -5032,7 +5276,7 @@ module Polars
5032
5276
  # # │ 2.0 ┆ 7.0 ┆ null │
5033
5277
  # # └─────┴─────┴──────┘
5034
5278
  def mean
5035
- lazy.mean.collect(_eager: true)
5279
+ lazy.mean.collect(optimizations: QueryOptFlags._eager)
5036
5280
  end
5037
5281
 
5038
5282
  # Take the mean of all values horizontally across columns.
@@ -5103,7 +5347,7 @@ module Polars
5103
5347
  # # │ 0.816497 ┆ 0.816497 ┆ null │
5104
5348
  # # └──────────┴──────────┴──────┘
5105
5349
  def std(ddof: 1)
5106
- lazy.std(ddof: ddof).collect(_eager: true)
5350
+ lazy.std(ddof: ddof).collect(optimizations: QueryOptFlags._eager)
5107
5351
  end
5108
5352
 
5109
5353
  # Aggregate the columns of this DataFrame to their variance value.
@@ -5144,7 +5388,7 @@ module Polars
5144
5388
  # # │ 0.666667 ┆ 0.666667 ┆ null │
5145
5389
  # # └──────────┴──────────┴──────┘
5146
5390
  def var(ddof: 1)
5147
- lazy.var(ddof: ddof).collect(_eager: true)
5391
+ lazy.var(ddof: ddof).collect(optimizations: QueryOptFlags._eager)
5148
5392
  end
5149
5393
 
5150
5394
  # Aggregate the columns of this DataFrame to their median value.
@@ -5170,7 +5414,7 @@ module Polars
5170
5414
  # # │ 2.0 ┆ 7.0 ┆ null │
5171
5415
  # # └─────┴─────┴──────┘
5172
5416
  def median
5173
- lazy.median.collect(_eager: true)
5417
+ lazy.median.collect(optimizations: QueryOptFlags._eager)
5174
5418
  end
5175
5419
 
5176
5420
  # Aggregate the columns of this DataFrame to their product values.
@@ -5227,7 +5471,7 @@ module Polars
5227
5471
  # # │ 2.0 ┆ 7.0 ┆ null │
5228
5472
  # # └─────┴─────┴──────┘
5229
5473
  def quantile(quantile, interpolation: "nearest")
5230
- lazy.quantile(quantile, interpolation: interpolation).collect(_eager: true)
5474
+ lazy.quantile(quantile, interpolation: interpolation).collect(optimizations: QueryOptFlags._eager)
5231
5475
  end
5232
5476
 
5233
5477
  # Get one hot encoded dummy variables.
@@ -5294,7 +5538,7 @@ module Polars
5294
5538
  # "c" => [true, true, true, false, true, true]
5295
5539
  # }
5296
5540
  # )
5297
- # df.unique
5541
+ # df.unique(maintain_order: true)
5298
5542
  # # =>
5299
5543
  # # shape: (5, 3)
5300
5544
  # # ┌─────┬─────┬───────┐
@@ -5308,11 +5552,11 @@ module Polars
5308
5552
  # # │ 4 ┆ 3.0 ┆ true │
5309
5553
  # # │ 5 ┆ 3.0 ┆ true │
5310
5554
  # # └─────┴─────┴───────┘
5311
- def unique(maintain_order: true, subset: nil, keep: "first")
5555
+ def unique(maintain_order: false, subset: nil, keep: "any")
5312
5556
  self._from_rbdf(
5313
5557
  lazy
5314
5558
  .unique(maintain_order: maintain_order, subset: subset, keep: keep)
5315
- .collect(no_optimization: true)
5559
+ .collect(optimizations: QueryOptFlags._eager)
5316
5560
  ._df
5317
5561
  )
5318
5562
  end
@@ -5405,9 +5649,9 @@ module Polars
5405
5649
  # Sample from this DataFrame.
5406
5650
  #
5407
5651
  # @param n [Integer]
5408
- # Number of items to return. Cannot be used with `frac`. Defaults to 1 if
5409
- # `frac` is nil.
5410
- # @param frac [Float]
5652
+ # Number of items to return. Cannot be used with `fraction`. Defaults to 1 if
5653
+ # `fraction` is nil.
5654
+ # @param fraction [Float]
5411
5655
  # Fraction of items to return. Cannot be used with `n`.
5412
5656
  # @param with_replacement [Boolean]
5413
5657
  # Allow values to be sampled more than once.
@@ -5440,20 +5684,20 @@ module Polars
5440
5684
  # # └─────┴─────┴─────┘
5441
5685
  def sample(
5442
5686
  n: nil,
5443
- frac: nil,
5687
+ fraction: nil,
5444
5688
  with_replacement: false,
5445
5689
  shuffle: false,
5446
5690
  seed: nil
5447
5691
  )
5448
- if !n.nil? && !frac.nil?
5449
- raise ArgumentError, "cannot specify both `n` and `frac`"
5692
+ if !n.nil? && !fraction.nil?
5693
+ raise ArgumentError, "cannot specify both `n` and `fraction`"
5450
5694
  end
5451
5695
 
5452
- if n.nil? && !frac.nil?
5453
- frac = Series.new("frac", [frac]) unless frac.is_a?(Series)
5696
+ if n.nil? && !fraction.nil?
5697
+ fraction = Series.new("fraction", [fraction]) unless fraction.is_a?(Series)
5454
5698
 
5455
5699
  return _from_rbdf(
5456
- _df.sample_frac(frac._s, with_replacement, shuffle, seed)
5700
+ _df.sample_frac(fraction._s, with_replacement, shuffle, seed)
5457
5701
  )
5458
5702
  end
5459
5703
 
@@ -5725,7 +5969,7 @@ module Polars
5725
5969
  if include_key
5726
5970
  values = self
5727
5971
  else
5728
- data_cols = schema.keys - key
5972
+ data_cols = schema.names - key
5729
5973
  values = select(data_cols)
5730
5974
  end
5731
5975
 
@@ -5768,7 +6012,7 @@ module Polars
5768
6012
  # @example
5769
6013
  # df.iter_rows(named: true).map { |row| row["b"] }
5770
6014
  # # => [2, 4, 6]
5771
- def iter_rows(named: false, buffer_size: 500, &block)
6015
+ def iter_rows(named: false, buffer_size: 512, &block)
5772
6016
  return to_enum(:iter_rows, named: named, buffer_size: buffer_size) unless block_given?
5773
6017
 
5774
6018
  # load into the local namespace for a modest performance boost in the hot loops
@@ -5939,11 +6183,10 @@ module Polars
5939
6183
  def gather_every(n, offset = 0)
5940
6184
  select(F.col("*").gather_every(n, offset))
5941
6185
  end
5942
- alias_method :take_every, :gather_every
5943
6186
 
5944
6187
  # Hash and combine the rows in this DataFrame.
5945
6188
  #
5946
- # The hash value is of type `:u64`.
6189
+ # The hash value is of type `UInt64`.
5947
6190
  #
5948
6191
  # @param seed [Integer]
5949
6192
  # Random seed parameter. Defaults to 0.
@@ -6050,7 +6293,7 @@ module Polars
6050
6293
  # # {4,"four"}
6051
6294
  # # {5,"five"}
6052
6295
  # # ]
6053
- def to_struct(name)
6296
+ def to_struct(name = "")
6054
6297
  Utils.wrap_s(_df.to_struct(name))
6055
6298
  end
6056
6299
 
@@ -6092,7 +6335,7 @@ module Polars
6092
6335
  # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
6093
6336
  # # └────────┴─────┴─────┴──────┴───────────┴───────┘
6094
6337
  def unnest(columns, *more_columns, separator: nil)
6095
- lazy.unnest(columns, *more_columns, separator: separator).collect(_eager: true)
6338
+ lazy.unnest(columns, *more_columns, separator: separator).collect(optimizations: QueryOptFlags._eager)
6096
6339
  end
6097
6340
 
6098
6341
  # Requires NumPy
@@ -6138,7 +6381,7 @@ module Polars
6138
6381
  # # │ elise ┆ 44 │
6139
6382
  # # └────────┴─────┘
6140
6383
  def merge_sorted(other, key)
6141
- lazy.merge_sorted(other.lazy, key).collect(_eager: true)
6384
+ lazy.merge_sorted(other.lazy, key).collect(optimizations: QueryOptFlags._eager)
6142
6385
  end
6143
6386
 
6144
6387
  # Flag a column as sorted.
@@ -6160,7 +6403,7 @@ module Polars
6160
6403
  )
6161
6404
  lazy
6162
6405
  .set_sorted(column, descending: descending)
6163
- .collect(no_optimization: true)
6406
+ .collect(optimizations: QueryOptFlags._eager)
6164
6407
  end
6165
6408
 
6166
6409
  # Update the values in this `DataFrame` with the values in `other`.
@@ -6291,7 +6534,7 @@ module Polars
6291
6534
  include_nulls: include_nulls,
6292
6535
  maintain_order: maintain_order
6293
6536
  )
6294
- .collect(_eager: true)
6537
+ .collect(optimizations: QueryOptFlags._eager)
6295
6538
  end
6296
6539
 
6297
6540
  private
@@ -6357,282 +6600,6 @@ module Polars
6357
6600
  raise ArgumentError, "Unsupported idxs datatype."
6358
6601
  end
6359
6602
 
6360
- # @private
6361
- def self.expand_hash_scalars(data, schema_overrides: nil, strict: true, order: nil, nan_to_null: false)
6362
- updated_data = {}
6363
- unless data.empty?
6364
- dtypes = schema_overrides || {}
6365
- array_len = data.values.map { |val| Utils.arrlen(val) || 0 }.max
6366
- if array_len > 0
6367
- data.each do |name, val|
6368
- dtype = dtypes[name]
6369
- if val.is_a?(Hash) && dtype != Struct
6370
- updated_data[name] = DataFrame.new(val, strict: strict).to_struct(name)
6371
- elsif !Utils.arrlen(val).nil?
6372
- updated_data[name] = Series.new(::String.new(name), val, dtype: dtype, strict: strict)
6373
- elsif val.nil? || [Integer, Float, TrueClass, FalseClass, ::String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
6374
- dtype = Polars::Float64 if val.nil? && dtype.nil?
6375
- updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype, strict: strict).extend_constant(val, array_len - 1)
6376
- else
6377
- raise Todo
6378
- end
6379
- end
6380
- elsif data.values.all? { |val| Utils.arrlen(val) == 0 }
6381
- data.each do |name, val|
6382
- updated_data[name] = Series.new(name, val, dtype: dtypes[name], strict: strict)
6383
- end
6384
- elsif data.values.all? { |val| Utils.arrlen(val).nil? }
6385
- data.each do |name, val|
6386
- updated_data[name] = Series.new(name, [val], dtype: dtypes[name], strict: strict)
6387
- end
6388
- end
6389
- end
6390
- updated_data
6391
- end
6392
-
6393
- # @private
6394
- def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, nan_to_null: nil)
6395
- if schema.is_a?(Hash) && !data.empty?
6396
- if !data.all? { |col, _| schema[col] }
6397
- raise ArgumentError, "The given column-schema names do not match the data dictionary"
6398
- end
6399
-
6400
- data = schema.to_h { |col| [col, data[col]] }
6401
- end
6402
-
6403
- column_names, schema_overrides = _unpack_schema(
6404
- schema, lookup_names: data.keys, schema_overrides: schema_overrides
6405
- )
6406
- if column_names.empty?
6407
- column_names = data.keys
6408
- end
6409
-
6410
- if data.empty? && !schema_overrides.empty?
6411
- data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], strict: strict, nan_to_null: nan_to_null)._s }
6412
- else
6413
- data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null).values.map(&:_s)
6414
- end
6415
-
6416
- data_series = _handle_columns_arg(data_series, columns: column_names, from_hash: true)
6417
- RbDataFrame.new(data_series)
6418
- end
6419
-
6420
- # @private
6421
- def self.include_unknowns(schema, cols)
6422
- cols.to_h { |col| [col, schema.fetch(col, Unknown)] }
6423
- end
6424
-
6425
- # @private
6426
- def self._unpack_schema(schema, schema_overrides: nil, n_expected: nil, lookup_names: nil, include_overrides_in_columns: false)
6427
- if schema.is_a?(Hash)
6428
- schema = schema.to_a
6429
- end
6430
- column_names =
6431
- (schema || []).map.with_index do |col, i|
6432
- if col.is_a?(::String)
6433
- col || "column_#{i}"
6434
- else
6435
- col[0]
6436
- end
6437
- end
6438
- if column_names.empty? && n_expected
6439
- column_names = n_expected.times.map { |i| "column_#{i}" }
6440
- end
6441
- # TODO zip_longest
6442
- lookup = column_names.zip(lookup_names || []).to_h
6443
-
6444
- column_dtypes =
6445
- (schema || []).select { |col| !col.is_a?(::String) && col[1] }.to_h do |col|
6446
- [lookup[col[0]] || col[0], col[1]]
6447
- end
6448
-
6449
- if schema_overrides && schema_overrides.any?
6450
- column_dtypes.merge!(schema_overrides)
6451
- end
6452
-
6453
- column_dtypes.each do |col, dtype|
6454
- if !Utils.is_polars_dtype(dtype, include_unknown: true) && !dtype.nil?
6455
- column_dtypes[col] = Utils.rb_type_to_dtype(dtype)
6456
- end
6457
- end
6458
-
6459
- [column_names, column_dtypes]
6460
- end
6461
-
6462
- def self._handle_columns_arg(data, columns: nil, from_hash: false)
6463
- if columns.nil? || columns.empty?
6464
- data
6465
- else
6466
- if data.empty?
6467
- columns.map { |c| Series.new(c, nil)._s }
6468
- elsif data.length == columns.length
6469
- if from_hash
6470
- series_map = data.to_h { |s| [s.name, s] }
6471
- if columns.all? { |col| series_map.key?(col) }
6472
- return columns.map { |col| series_map[col] }
6473
- end
6474
- end
6475
-
6476
- columns.each_with_index do |c, i|
6477
- # not in-place?
6478
- data[i].rename(c)
6479
- end
6480
- data
6481
- else
6482
- raise ArgumentError, "Dimensions of columns arg must match data dimensions."
6483
- end
6484
- end
6485
- end
6486
-
6487
- def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil, strict: true)
6488
- rbdf_columns = rbdf.columns
6489
- rbdf_dtypes = rbdf.dtypes
6490
- columns, dtypes = _unpack_schema(
6491
- (columns || rbdf_columns), schema_overrides: schema_overrides
6492
- )
6493
- column_subset = []
6494
- if columns != rbdf_columns
6495
- if columns.length < rbdf_columns.length && columns == rbdf_columns.first(columns.length)
6496
- column_subset = columns
6497
- else
6498
- rbdf.set_column_names(columns)
6499
- end
6500
- end
6501
-
6502
- column_casts = []
6503
- columns.each_with_index do |col, i|
6504
- if dtypes[col] == Categorical # != rbdf_dtypes[i]
6505
- column_casts << Polars.col(col).cast(Categorical, strict: strict)._rbexpr
6506
- elsif structs&.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
6507
- column_casts << Polars.col(col).cast(structs[col], strict: strict)._rbexpr
6508
- elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
6509
- column_casts << Polars.col(col).cast(dtypes[col], strict: strict)._rbexpr
6510
- end
6511
- end
6512
-
6513
- if column_casts.any? || column_subset.any?
6514
- rbdf = rbdf.lazy
6515
- if column_casts.any?
6516
- rbdf = rbdf.with_columns(column_casts)
6517
- end
6518
- if column_subset.any?
6519
- rbdf = rbdf.select(column_subset.map { |col| Polars.col(col)._rbexpr })
6520
- end
6521
- rbdf = rbdf.collect
6522
- end
6523
-
6524
- rbdf
6525
- end
6526
-
6527
- # @private
6528
- def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: 50)
6529
- columns = schema
6530
-
6531
- if data.length == 0
6532
- return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides, strict: strict)
6533
- end
6534
-
6535
- if data[0].is_a?(Series)
6536
- # series_names = data.map(&:name)
6537
- # columns, dtypes = _unpack_schema(columns || series_names, n_expected: data.length)
6538
- data_series = []
6539
- data.each do |s|
6540
- data_series << s._s
6541
- end
6542
- elsif data[0].is_a?(Hash)
6543
- column_names, dtypes = _unpack_schema(columns)
6544
- schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
6545
- rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, strict, infer_schema_length)
6546
- if column_names
6547
- rbdf = _post_apply_columns(rbdf, column_names)
6548
- end
6549
- return rbdf
6550
- elsif data[0].is_a?(::Array)
6551
- first_element = data[0]
6552
- if orient.nil? && !columns.nil?
6553
- row_types = first_element.filter_map { |value| value.class }.uniq
6554
- if row_types.include?(Integer) && row_types.include?(Float)
6555
- row_types.delete(Integer)
6556
- end
6557
- orient = row_types.length == 1 ? "col" : "row"
6558
- end
6559
-
6560
- if orient == "row"
6561
- column_names, schema_overrides = _unpack_schema(
6562
- schema, schema_overrides: schema_overrides, n_expected: first_element.length
6563
- )
6564
- local_schema_override = (
6565
- schema_overrides.any? ? _include_unknowns(schema_overrides, column_names) : {}
6566
- )
6567
- if column_names.any? && first_element.length > 0 && first_element.length != column_names.length
6568
- raise ArgumentError, "the row data does not match the number of columns"
6569
- end
6570
-
6571
- unpack_nested = false
6572
- local_schema_override.each do |col, tp|
6573
- if [Categorical, Enum].include?(tp)
6574
- local_schema_override[col] = String
6575
- elsif !unpack_nested && [Unknown, Struct].include?(tp.base_type)
6576
- raise Todo
6577
- end
6578
- end
6579
-
6580
- if unpack_nested
6581
- raise Todo
6582
- else
6583
- rbdf = RbDataFrame.from_rows(
6584
- data,
6585
- infer_schema_length,
6586
- local_schema_override.any? ? local_schema_override : nil
6587
- )
6588
- end
6589
- if column_names.any? || schema_overrides.any?
6590
- rbdf = _post_apply_columns(
6591
- rbdf, column_names, schema_overrides: schema_overrides, strict: strict
6592
- )
6593
- end
6594
- return rbdf
6595
- elsif orient == "col" || orient.nil?
6596
- column_names, schema_overrides = _unpack_schema(
6597
- schema, schema_overrides: schema_overrides, n_expected: data.length
6598
- )
6599
- data_series =
6600
- data.map.with_index do |element, i|
6601
- Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]], strict: strict)._s
6602
- end
6603
- return RbDataFrame.new(data_series)
6604
- else
6605
- raise ArgumentError, "orient must be one of {{'col', 'row', nil}}, got #{orient} instead."
6606
- end
6607
- end
6608
-
6609
- data_series = _handle_columns_arg(data_series, columns: columns)
6610
- RbDataFrame.new(data_series)
6611
- end
6612
-
6613
- # @private
6614
- def self._include_unknowns(schema, cols)
6615
- cols.to_h { |col| [col, schema[col] || Unknown] }
6616
- end
6617
-
6618
- # @private
6619
- def self.series_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true)
6620
- data_series = [data._s]
6621
- series_name = data_series.map(&:name)
6622
- column_names, schema_overrides = _unpack_schema(
6623
- schema || series_name, schema_overrides: schema_overrides, n_expected: 1
6624
- )
6625
- if schema_overrides.any?
6626
- new_dtype = schema_overrides.values[0]
6627
- if new_dtype != data.dtype
6628
- data_series[0] = data_series[0].cast(new_dtype, strict)
6629
- end
6630
- end
6631
-
6632
- data_series = _handle_columns_arg(data_series, columns: column_names)
6633
- RbDataFrame.new(data_series)
6634
- end
6635
-
6636
6603
  def wrap_ldf(ldf)
6637
6604
  LazyFrame._from_rbldf(ldf)
6638
6605
  end
@@ -6641,6 +6608,11 @@ module Polars
6641
6608
  self.class._from_rbdf(rb_df)
6642
6609
  end
6643
6610
 
6611
+ def _replace(column, new_column)
6612
+ self._df.replace(column, new_column._s)
6613
+ self
6614
+ end
6615
+
6644
6616
  def _comp(other, op)
6645
6617
  if other.is_a?(DataFrame)
6646
6618
  _compare_to_other_df(other, op)
@@ -6658,7 +6630,7 @@ module Polars
6658
6630
  end
6659
6631
 
6660
6632
  suffix = "__POLARS_CMP_OTHER"
6661
- other_renamed = other.select(Polars.all.suffix(suffix))
6633
+ other_renamed = other.select(Polars.all.name.suffix(suffix))
6662
6634
  combined = Polars.concat([self, other_renamed], how: "horizontal")
6663
6635
 
6664
6636
  expr = case op
@@ -6726,5 +6698,268 @@ module Polars
6726
6698
  yield
6727
6699
  end
6728
6700
  end
6701
+
6702
+ def get_series_item_by_key(s, key)
6703
+ if key.is_a?(Integer)
6704
+ return s._s.get_index_signed(key)
6705
+
6706
+ elsif key.is_a?(Range)
6707
+ return _select_elements_by_slice(s, key)
6708
+
6709
+ elsif key.is_a?(::Array)
6710
+ if key.empty?
6711
+ return s.clear
6712
+ end
6713
+
6714
+ first = key[0]
6715
+ if Utils.bool?(first)
6716
+ _raise_on_boolean_mask
6717
+ end
6718
+
6719
+ begin
6720
+ indices = Series.new("", key, dtype: Int64)
6721
+ rescue TypeError
6722
+ msg = "cannot select elements using Sequence with elements of type #{first.class.name.inspect}"
6723
+ raise TypeError, msg
6724
+ end
6725
+
6726
+ indices = _convert_series_to_indices(indices, s.len)
6727
+ return _select_elements_by_index(s, indices)
6728
+
6729
+ elsif key.is_a?(Series)
6730
+ indices = _convert_series_to_indices(key, s.len)
6731
+ return _select_elements_by_index(s, indices)
6732
+ end
6733
+
6734
+ msg = "cannot select elements using key of type #{key.class.name.inspect}: #{key.inspect}"
6735
+ raise TypeError, msg
6736
+ end
6737
+
6738
+ def _select_elements_by_slice(s, key)
6739
+ Slice.new(s).apply(key)
6740
+ end
6741
+
6742
+ def _select_elements_by_index(s, key)
6743
+ s.send(:_from_rbseries, s._s.gather_with_series(key._s))
6744
+ end
6745
+
6746
+ def get_df_item_by_key(df, key)
6747
+ if key.size == 2
6748
+ row_key, col_key = key
6749
+
6750
+ # Support df[True, False] and df["a", "b"] as these are not ambiguous
6751
+ if Utils.bool?(row_key) || Utils.strlike?(row_key)
6752
+ return _select_columns(df, key)
6753
+ end
6754
+
6755
+ selection = _select_columns(df, col_key)
6756
+
6757
+ if selection.is_empty
6758
+ return selection
6759
+ elsif selection.is_a?(Series)
6760
+ return get_series_item_by_key(selection, row_key)
6761
+ else
6762
+ return _select_rows(selection, row_key)
6763
+ end
6764
+ end
6765
+
6766
+ key = key[0] if key.size == 1
6767
+
6768
+ # Single string input, e.g. df["a"]
6769
+ if Utils.strlike?(key)
6770
+ # This case is required because empty strings are otherwise treated
6771
+ # as an empty Sequence in `_select_rows`
6772
+ return df.get_column(key)
6773
+ end
6774
+
6775
+ # Single input - df[1] - or multiple inputs - df["a", "b", "c"]
6776
+ begin
6777
+ _select_rows(df, key)
6778
+ rescue TypeError
6779
+ _select_columns(df, key)
6780
+ end
6781
+ end
6782
+
6783
+ def _select_columns(df, key)
6784
+ if key.is_a?(Integer)
6785
+ return df.to_series(key)
6786
+
6787
+ elsif Utils.strlike?(key)
6788
+ return df.get_column(key)
6789
+
6790
+ elsif key.is_a?(Range)
6791
+ start, stop = key.begin, key.end
6792
+ if start.is_a?(::String)
6793
+ start = df.get_column_index(start)
6794
+ stop = df.get_column_index(stop)
6795
+ rng = Range.new(start, stop, key.exclude_end?)
6796
+ return _select_columns_by_index(df, rng)
6797
+ else
6798
+ return _select_columns_by_index(df, key)
6799
+ end
6800
+
6801
+ elsif key.is_a?(::Array)
6802
+ if key.empty?
6803
+ return df.class.new
6804
+ end
6805
+ first = key[0]
6806
+ if Utils.bool?(first)
6807
+ return _select_columns_by_mask(df, key)
6808
+ elsif first.is_a?(Integer)
6809
+ return _select_columns_by_index(df, key)
6810
+ elsif Utils.strlike?(first)
6811
+ return _select_columns_by_name(df, key)
6812
+ else
6813
+ msg = "cannot select columns using Sequence with elements of type #{first.class.name.inspect}"
6814
+ raise TypeError, msg
6815
+ end
6816
+
6817
+ elsif key.is_a?(Series)
6818
+ if key.is_empty
6819
+ return df.class.new
6820
+ end
6821
+ dtype = key.dtype
6822
+ if dtype == String
6823
+ return _select_columns_by_name(df, key)
6824
+ elsif dtype.integer?
6825
+ return _select_columns_by_index(df, key)
6826
+ elsif dtype == Boolean
6827
+ return _select_columns_by_mask(df, key)
6828
+ else
6829
+ msg = "cannot select columns using Series of type #{dtype}"
6830
+ raise TypeError, msg
6831
+ end
6832
+ end
6833
+
6834
+ msg = (
6835
+ "cannot select columns using key of type #{key.class.name.inspect}: #{key.inspect}"
6836
+ )
6837
+ raise TypeError, msg
6838
+ end
6839
+
6840
+ def _select_columns_by_index(df, key)
6841
+ series = key.map { |i| df.to_series(i) }
6842
+ df.class.new(series)
6843
+ end
6844
+
6845
+ def _select_columns_by_name(df, key)
6846
+ df.send(:_from_rbdf, df._df.select(Array(key)))
6847
+ end
6848
+
6849
+ def _select_columns_by_mask(df, key)
6850
+ if key.length != df.width
6851
+ msg = "expected #{df.width} values when selecting columns by boolean mask, got #{key.length}"
6852
+ raise ArgumentError, msg
6853
+ end
6854
+
6855
+ indices = key.each_with_index.filter_map { |val, i| i if val }
6856
+ _select_columns_by_index(df, indices)
6857
+ end
6858
+
6859
+ def _select_rows(df, key)
6860
+ if key.is_a?(Integer)
6861
+ num_rows = df.height
6862
+ if key >= num_rows || key < -num_rows
6863
+ msg = "index #{key} is out of bounds for DataFrame of height #{num_rows}"
6864
+ raise IndexError, msg
6865
+ end
6866
+ return df.slice(key, 1)
6867
+ end
6868
+
6869
+ if key.is_a?(Range)
6870
+ return _select_rows_by_slice(df, key)
6871
+
6872
+ elsif key.is_a?(::Array)
6873
+ if key.empty?
6874
+ return df.clear
6875
+ end
6876
+ if Utils.bool?(key[0])
6877
+ _raise_on_boolean_mask
6878
+ end
6879
+ s = Series.new("", key, dtype: Int64)
6880
+ indices = _convert_series_to_indices(s, df.height)
6881
+ return _select_rows_by_index(df, indices)
6882
+
6883
+ elsif key.is_a?(Series)
6884
+ indices = _convert_series_to_indices(key, df.height)
6885
+ return _select_rows_by_index(df, indices)
6886
+
6887
+ else
6888
+ msg = "cannot select rows using key of type #{key.class.name.inspect}: #{key.inspect}"
6889
+ raise TypeError, msg
6890
+ end
6891
+ end
6892
+
6893
+ def _select_rows_by_slice(df, key)
6894
+ return Slice.new(df).apply(key)
6895
+ end
6896
+
6897
+ def _select_rows_by_index(df, key)
6898
+ df.send(:_from_rbdf, df._df.gather_with_series(key._s))
6899
+ end
6900
+
6901
+ def _convert_series_to_indices(s, size)
6902
+ idx_type = Plr.get_index_type
6903
+
6904
+ if s.dtype == idx_type
6905
+ return s
6906
+ end
6907
+
6908
+ if !s.dtype.integer?
6909
+ if s.dtype == Boolean
6910
+ _raise_on_boolean_mask
6911
+ else
6912
+ msg = "cannot treat Series of type #{s.dtype} as indices"
6913
+ raise TypeError, msg
6914
+ end
6915
+ end
6916
+
6917
+ if s.len == 0
6918
+ return Series.new(s.name, [], dtype: idx_type)
6919
+ end
6920
+
6921
+ if idx_type == UInt32
6922
+ if [Int64, UInt64].include?(s.dtype) && s.max >= Utils::U32_MAX
6923
+ msg = "index positions should be smaller than 2^32"
6924
+ raise ArgumentError, msg
6925
+ end
6926
+ if s.dtype == Int64 && s.min < -Utils::U32_MAX
6927
+ msg = "index positions should be greater than or equal to -2^32"
6928
+ raise ArgumentError, msg
6929
+ end
6930
+ end
6931
+
6932
+ if s.dtype.signed_integer?
6933
+ if s.min < 0
6934
+ if idx_type == UInt32
6935
+ idxs = [Int8, Int16].include?(s.dtype) ? s.cast(Int32) : s
6936
+ else
6937
+ idxs = [Int8, Int16, Int32].include?(s.dtype) ? s.cast(Int64) : s
6938
+ end
6939
+
6940
+ # Update negative indexes to absolute indexes.
6941
+ return (
6942
+ idxs.to_frame
6943
+ .select(
6944
+ F.when(F.col(idxs.name) < 0)
6945
+ .then(size + F.col(idxs.name))
6946
+ .otherwise(F.col(idxs.name))
6947
+ .cast(idx_type)
6948
+ )
6949
+ .to_series(0)
6950
+ )
6951
+ end
6952
+ end
6953
+
6954
+ s.cast(idx_type)
6955
+ end
6956
+
6957
+ def _raise_on_boolean_mask
6958
+ msg = (
6959
+ "selecting rows by passing a boolean mask to `[]` is not supported" +
6960
+ "\n\nHint: Use the `filter` method instead."
6961
+ )
6962
+ raise TypeError, msg
6963
+ end
6729
6964
  end
6730
6965
  end