polars-df 0.23.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +127 -1
  3. data/Cargo.lock +72 -58
  4. data/README.md +31 -27
  5. data/ext/polars/Cargo.toml +15 -6
  6. data/ext/polars/src/batched_csv.rs +35 -39
  7. data/ext/polars/src/c_api/allocator.rs +7 -0
  8. data/ext/polars/src/c_api/mod.rs +1 -0
  9. data/ext/polars/src/catalog/unity.rs +123 -101
  10. data/ext/polars/src/conversion/any_value.rs +13 -17
  11. data/ext/polars/src/conversion/chunked_array.rs +5 -5
  12. data/ext/polars/src/conversion/datetime.rs +3 -2
  13. data/ext/polars/src/conversion/mod.rs +50 -45
  14. data/ext/polars/src/dataframe/export.rs +13 -13
  15. data/ext/polars/src/dataframe/general.rs +223 -223
  16. data/ext/polars/src/dataframe/io.rs +27 -141
  17. data/ext/polars/src/dataframe/mod.rs +13 -5
  18. data/ext/polars/src/dataframe/serde.rs +1 -1
  19. data/ext/polars/src/error.rs +44 -7
  20. data/ext/polars/src/exceptions.rs +45 -12
  21. data/ext/polars/src/expr/array.rs +12 -0
  22. data/ext/polars/src/expr/datatype.rs +2 -2
  23. data/ext/polars/src/expr/datetime.rs +4 -5
  24. data/ext/polars/src/expr/general.rs +49 -13
  25. data/ext/polars/src/expr/list.rs +4 -0
  26. data/ext/polars/src/expr/meta.rs +8 -3
  27. data/ext/polars/src/expr/mod.rs +22 -6
  28. data/ext/polars/src/expr/name.rs +19 -8
  29. data/ext/polars/src/expr/rolling.rs +50 -1
  30. data/ext/polars/src/expr/string.rs +0 -1
  31. data/ext/polars/src/expr/struct.rs +7 -2
  32. data/ext/polars/src/file.rs +136 -103
  33. data/ext/polars/src/functions/aggregation.rs +9 -8
  34. data/ext/polars/src/functions/io.rs +81 -10
  35. data/ext/polars/src/functions/lazy.rs +95 -21
  36. data/ext/polars/src/functions/mod.rs +2 -0
  37. data/ext/polars/src/functions/range.rs +19 -3
  38. data/ext/polars/src/functions/strings.rs +6 -0
  39. data/ext/polars/src/functions/utils.rs +6 -0
  40. data/ext/polars/src/interop/arrow/mod.rs +50 -1
  41. data/ext/polars/src/interop/arrow/{to_ruby.rs → to_rb.rs} +30 -0
  42. data/ext/polars/src/interop/arrow/to_rust.rs +43 -0
  43. data/ext/polars/src/interop/numo/to_numo_df.rs +1 -1
  44. data/ext/polars/src/interop/numo/to_numo_series.rs +1 -1
  45. data/ext/polars/src/lazyframe/exitable.rs +39 -0
  46. data/ext/polars/src/lazyframe/general.rs +340 -236
  47. data/ext/polars/src/lazyframe/mod.rs +46 -10
  48. data/ext/polars/src/lazyframe/optflags.rs +5 -4
  49. data/ext/polars/src/lazyframe/serde.rs +11 -3
  50. data/ext/polars/src/lazyframe/sink.rs +10 -5
  51. data/ext/polars/src/lazygroupby.rs +6 -7
  52. data/ext/polars/src/lib.rs +141 -76
  53. data/ext/polars/src/map/dataframe.rs +12 -12
  54. data/ext/polars/src/map/lazy.rs +7 -5
  55. data/ext/polars/src/map/mod.rs +15 -8
  56. data/ext/polars/src/map/series.rs +3 -3
  57. data/ext/polars/src/on_startup.rs +16 -8
  58. data/ext/polars/src/prelude.rs +1 -0
  59. data/ext/polars/src/rb_modules.rs +19 -49
  60. data/ext/polars/src/series/aggregation.rs +79 -140
  61. data/ext/polars/src/series/arithmetic.rs +16 -22
  62. data/ext/polars/src/series/comparison.rs +101 -222
  63. data/ext/polars/src/series/construction.rs +17 -18
  64. data/ext/polars/src/series/export.rs +1 -1
  65. data/ext/polars/src/series/general.rs +254 -289
  66. data/ext/polars/src/series/import.rs +17 -0
  67. data/ext/polars/src/series/map.rs +178 -160
  68. data/ext/polars/src/series/mod.rs +28 -12
  69. data/ext/polars/src/series/scatter.rs +12 -9
  70. data/ext/polars/src/sql.rs +16 -9
  71. data/ext/polars/src/testing/frame.rs +31 -0
  72. data/ext/polars/src/testing/mod.rs +5 -0
  73. data/ext/polars/src/testing/series.rs +31 -0
  74. data/ext/polars/src/timeout.rs +105 -0
  75. data/ext/polars/src/utils.rs +159 -1
  76. data/lib/polars/array_expr.rb +81 -12
  77. data/lib/polars/array_name_space.rb +74 -7
  78. data/lib/polars/batched_csv_reader.rb +21 -21
  79. data/lib/polars/binary_name_space.rb +1 -1
  80. data/lib/polars/cat_expr.rb +7 -7
  81. data/lib/polars/config.rb +1 -1
  82. data/lib/polars/convert.rb +189 -34
  83. data/lib/polars/data_frame.rb +1066 -831
  84. data/lib/polars/data_frame_plot.rb +173 -0
  85. data/lib/polars/data_type_group.rb +1 -0
  86. data/lib/polars/data_types.rb +31 -12
  87. data/lib/polars/date_time_expr.rb +51 -69
  88. data/lib/polars/date_time_name_space.rb +80 -112
  89. data/lib/polars/dynamic_group_by.rb +7 -7
  90. data/lib/polars/exceptions.rb +50 -10
  91. data/lib/polars/expr.rb +470 -517
  92. data/lib/polars/functions/aggregation/horizontal.rb +0 -1
  93. data/lib/polars/functions/aggregation/vertical.rb +2 -3
  94. data/lib/polars/functions/as_datatype.rb +290 -8
  95. data/lib/polars/functions/eager.rb +204 -10
  96. data/lib/polars/functions/escape_regex.rb +21 -0
  97. data/lib/polars/functions/lazy.rb +409 -169
  98. data/lib/polars/functions/lit.rb +17 -1
  99. data/lib/polars/functions/range/int_range.rb +74 -2
  100. data/lib/polars/functions/range/linear_space.rb +77 -0
  101. data/lib/polars/functions/range/time_range.rb +1 -1
  102. data/lib/polars/functions/repeat.rb +3 -12
  103. data/lib/polars/functions/whenthen.rb +2 -2
  104. data/lib/polars/group_by.rb +72 -20
  105. data/lib/polars/iceberg_dataset.rb +1 -6
  106. data/lib/polars/in_process_query.rb +37 -0
  107. data/lib/polars/io/cloud.rb +18 -0
  108. data/lib/polars/io/csv.rb +265 -126
  109. data/lib/polars/io/database.rb +0 -1
  110. data/lib/polars/io/delta.rb +15 -7
  111. data/lib/polars/io/ipc.rb +24 -17
  112. data/lib/polars/io/ndjson.rb +161 -24
  113. data/lib/polars/io/parquet.rb +101 -38
  114. data/lib/polars/lazy_frame.rb +849 -558
  115. data/lib/polars/lazy_group_by.rb +327 -2
  116. data/lib/polars/list_expr.rb +94 -16
  117. data/lib/polars/list_name_space.rb +88 -24
  118. data/lib/polars/meta_expr.rb +42 -1
  119. data/lib/polars/name_expr.rb +41 -4
  120. data/lib/polars/query_opt_flags.rb +198 -2
  121. data/lib/polars/rolling_group_by.rb +3 -3
  122. data/lib/polars/schema.rb +21 -3
  123. data/lib/polars/selector.rb +37 -2
  124. data/lib/polars/selectors.rb +45 -9
  125. data/lib/polars/series.rb +1156 -728
  126. data/lib/polars/series_plot.rb +72 -0
  127. data/lib/polars/slice.rb +1 -1
  128. data/lib/polars/sql_context.rb +11 -4
  129. data/lib/polars/string_expr.rb +59 -68
  130. data/lib/polars/string_name_space.rb +51 -87
  131. data/lib/polars/struct_expr.rb +36 -18
  132. data/lib/polars/testing.rb +24 -273
  133. data/lib/polars/utils/constants.rb +2 -0
  134. data/lib/polars/utils/construction/data_frame.rb +410 -0
  135. data/lib/polars/utils/construction/series.rb +364 -0
  136. data/lib/polars/utils/construction/utils.rb +9 -0
  137. data/lib/polars/utils/deprecation.rb +11 -0
  138. data/lib/polars/utils/serde.rb +8 -3
  139. data/lib/polars/utils/unstable.rb +19 -0
  140. data/lib/polars/utils/various.rb +59 -0
  141. data/lib/polars/utils.rb +46 -47
  142. data/lib/polars/version.rb +1 -1
  143. data/lib/polars.rb +47 -1
  144. metadata +25 -6
  145. data/ext/polars/src/allocator.rs +0 -13
  146. data/lib/polars/plot.rb +0 -109
@@ -5,12 +5,13 @@ module Polars
5
5
  attr_accessor :_ldf
6
6
 
7
7
  # Create a new LazyFrame.
8
- def initialize(data = nil, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
8
+ def initialize(data = nil, schema: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: N_INFER_DEFAULT, nan_to_null: false)
9
9
  self._ldf = (
10
10
  DataFrame.new(
11
11
  data,
12
12
  schema: schema,
13
13
  schema_overrides: schema_overrides,
14
+ strict: strict,
14
15
  orient: orient,
15
16
  infer_schema_length: infer_schema_length,
16
17
  nan_to_null: nan_to_null
@@ -27,25 +28,16 @@ module Polars
27
28
  ldf
28
29
  end
29
30
 
30
- # Read a logical plan from a JSON file to construct a LazyFrame.
31
- #
32
- # @param file [String]
33
- # Path to a file or a file-like object.
34
- #
35
- # @return [LazyFrame]
36
- def self.read_json(file)
37
- if Utils.pathlike?(file)
38
- file = Utils.normalize_filepath(file)
39
- end
40
-
41
- Utils.wrap_ldf(RbLazyFrame.deserialize_json(file))
42
- end
43
-
44
31
  # Read a logical plan from a file to construct a LazyFrame.
45
32
  #
46
33
  # @param source [Object]
47
34
  # Path to a file or a file-like object (by file-like object, we refer to
48
35
  # objects that have a `read` method, such as a file handler or `StringIO`).
36
+ # @param format ['binary', 'json']
37
+ # The format with which the LazyFrame was serialized. Options:
38
+ #
39
+ # - `"binary"`: Deserialize from binary format (bytes). This is the default.
40
+ # - `"json"`: Deserialize from JSON format (string).
49
41
  #
50
42
  # @return [LazyFrame]
51
43
  #
@@ -71,14 +63,20 @@ module Polars
71
63
  # # ╞═════╡
72
64
  # # │ 6 │
73
65
  # # └─────┘
74
- def self.deserialize(source)
75
- raise Todo unless RbLazyFrame.respond_to?(:deserialize_binary)
76
-
66
+ def self.deserialize(source, format: "binary")
77
67
  if Utils.pathlike?(source)
78
68
  source = Utils.normalize_filepath(source)
79
69
  end
80
70
 
81
- deserializer = RbLazyFrame.method(:deserialize_binary)
71
+ if format == "binary"
72
+ raise Todo unless RbLazyFrame.respond_to?(:deserialize_binary)
73
+ deserializer = RbLazyFrame.method(:deserialize_binary)
74
+ elsif format == "json"
75
+ deserializer = RbLazyFrame.method(:deserialize_json)
76
+ else
77
+ msg = "`format` must be one of {{'binary', 'json'}}, got #{format.inspect}"
78
+ raise ArgumentError, msg
79
+ end
82
80
 
83
81
  _from_rbldf(deserializer.(source))
84
82
  end
@@ -170,31 +168,22 @@ module Polars
170
168
  # @return [String]
171
169
  def to_s
172
170
  <<~EOS
173
- naive plan: (run LazyFrame#describe_optimized_plan to see the optimized plan)
171
+ naive plan: (run LazyFrame#explain(optimized: true) to see the optimized plan)
174
172
 
175
- #{describe_plan}
173
+ #{explain(optimized: false)}
176
174
  EOS
177
175
  end
178
176
 
179
- # Write the logical plan of this LazyFrame to a file or string in JSON format.
180
- #
181
- # @param file [String]
182
- # File path to which the result should be written.
183
- #
184
- # @return [nil]
185
- def write_json(file)
186
- if Utils.pathlike?(file)
187
- file = Utils.normalize_filepath(file)
188
- end
189
- _ldf.write_json(file)
190
- nil
191
- end
192
-
193
177
  # Serialize the logical plan of this LazyFrame to a file or string.
194
178
  #
195
179
  # @param file [Object]
196
180
  # File path to which the result should be written. If set to `nil`
197
181
  # (default), the output is returned as a string instead.
182
+ # @param format ['binary', 'json']
183
+ # The format in which to serialize. Options:
184
+ #
185
+ # - `"binary"`: Serialize to binary format (bytes). This is the default.
186
+ # - `"json"`: Serialize to JSON format (string) (deprecated).
198
187
  #
199
188
  # @return [Object]
200
189
  #
@@ -215,16 +204,25 @@ module Polars
215
204
  # # ╞═════╡
216
205
  # # │ 6 │
217
206
  # # └─────┘
218
- def serialize(file = nil)
219
- raise Todo unless _ldf.respond_to?(:serialize_binary)
207
+ def serialize(file = nil, format: "binary")
208
+ if format == "binary"
209
+ raise Todo unless _ldf.respond_to?(:serialize_binary)
210
+ serializer = _ldf.method(:serialize_binary)
211
+ elsif format == "json"
212
+ msg = "'json' serialization format of LazyFrame is deprecated"
213
+ warn msg
214
+ serializer = _ldf.method(:serialize_json)
215
+ else
216
+ msg = "`format` must be one of {{'binary', 'json'}}, got #{format.inspect}"
217
+ raise ArgumentError, msg
218
+ end
220
219
 
221
- serializer = _ldf.method(:serialize_binary)
222
220
  Utils.serialize_polars_object(serializer, file)
223
221
  end
224
222
 
225
223
  # Offers a structured way to apply a sequence of user-defined functions (UDFs).
226
224
  #
227
- # @param func [Object]
225
+ # @param function [Object]
228
226
  # Callable; will receive the frame as the first parameter,
229
227
  # followed by any given args/kwargs.
230
228
  # @param args [Object]
@@ -236,7 +234,7 @@ module Polars
236
234
  #
237
235
  # @example
238
236
  # cast_str_to_int = lambda do |data, col_name:|
239
- # data.with_column(Polars.col(col_name).cast(:i64))
237
+ # data.with_columns(Polars.col(col_name).cast(Polars::Int64))
240
238
  # end
241
239
  #
242
240
  # df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]}).lazy
@@ -253,47 +251,344 @@ module Polars
253
251
  # # │ 3 ┆ 30 │
254
252
  # # │ 4 ┆ 40 │
255
253
  # # └─────┴─────┘
256
- def pipe(func, *args, **kwargs, &block)
257
- func.call(self, *args, **kwargs, &block)
254
+ def pipe(function, *args, **kwargs, &block)
255
+ function.(self, *args, **kwargs, &block)
258
256
  end
259
257
 
260
- # Create a string representation of the unoptimized query plan.
258
+ # Creates a summary of statistics for a LazyFrame, returning a DataFrame.
261
259
  #
262
- # @return [String]
263
- def describe_plan
264
- _ldf.describe_plan
260
+ # @param percentiles [Array]
261
+ # One or more percentiles to include in the summary statistics.
262
+ # All values must be in the range `[0, 1]`.
263
+ # @param interpolation ['nearest', 'higher', 'lower', 'midpoint', 'linear', 'equiprobable']
264
+ # Interpolation method used when calculating percentiles.
265
+ #
266
+ # @return [DataFrame]
267
+ #
268
+ # @note
269
+ # The median is included by default as the 50% percentile.
270
+ #
271
+ # @note
272
+ # This method does *not* maintain the laziness of the frame, and will `collect`
273
+ # the final result. This could potentially be an expensive operation.
274
+ #
275
+ # @note
276
+ # We do not guarantee the output of `describe` to be stable. It will show
277
+ # statistics that we deem informative, and may be updated in the future.
278
+ # Using `describe` programmatically (versus interactive exploration) is
279
+ # not recommended for this reason.
280
+ #
281
+ # @example Show default frame statistics:
282
+ # lf = Polars::LazyFrame.new(
283
+ # {
284
+ # "float" => [1.0, 2.8, 3.0],
285
+ # "int" => [40, 50, nil],
286
+ # "bool" => [true, false, true],
287
+ # "str" => ["zz", "xx", "yy"],
288
+ # "date" => [Date.new(2020, 1, 1), Date.new(2021, 7, 5), Date.new(2022, 12, 31)]
289
+ # }
290
+ # )
291
+ # lf.describe
292
+ # # =>
293
+ # # shape: (9, 6)
294
+ # # ┌────────────┬──────────┬──────────┬──────────┬──────┬─────────────────────────┐
295
+ # # │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date │
296
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
297
+ # # │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str │
298
+ # # ╞════════════╪══════════╪══════════╪══════════╪══════╪═════════════════════════╡
299
+ # # │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 │
300
+ # # │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 │
301
+ # # │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 16:00:00 UTC │
302
+ # # │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null │
303
+ # # │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 │
304
+ # # │ 25% ┆ 2.8 ┆ 40.0 ┆ null ┆ null ┆ 2021-07-05 │
305
+ # # │ 50% ┆ 2.8 ┆ 50.0 ┆ null ┆ null ┆ 2021-07-05 │
306
+ # # │ 75% ┆ 3.0 ┆ 50.0 ┆ null ┆ null ┆ 2022-12-31 │
307
+ # # │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 │
308
+ # # └────────────┴──────────┴──────────┴──────────┴──────┴─────────────────────────┘
309
+ #
310
+ # @example Customize which percentiles are displayed, applying linear interpolation:
311
+ # lf.describe(
312
+ # percentiles: [0.1, 0.3, 0.5, 0.7, 0.9],
313
+ # interpolation: "linear"
314
+ # )
315
+ # # =>
316
+ # # shape: (11, 6)
317
+ # # ┌────────────┬──────────┬──────────┬──────────┬──────┬─────────────────────────┐
318
+ # # │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date │
319
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
320
+ # # │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str │
321
+ # # ╞════════════╪══════════╪══════════╪══════════╪══════╪═════════════════════════╡
322
+ # # │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 │
323
+ # # │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 │
324
+ # # │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 16:00:00 UTC │
325
+ # # │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null │
326
+ # # │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 │
327
+ # # │ … ┆ … ┆ … ┆ … ┆ … ┆ … │
328
+ # # │ 30% ┆ 2.08 ┆ 43.0 ┆ null ┆ null ┆ 2020-11-26 │
329
+ # # │ 50% ┆ 2.8 ┆ 45.0 ┆ null ┆ null ┆ 2021-07-05 │
330
+ # # │ 70% ┆ 2.88 ┆ 47.0 ┆ null ┆ null ┆ 2022-02-07 │
331
+ # # │ 90% ┆ 2.96 ┆ 49.0 ┆ null ┆ null ┆ 2022-09-13 │
332
+ # # │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 │
333
+ # # └────────────┴──────────┴──────────┴──────────┴──────┴─────────────────────────┘
334
+ def describe(
335
+ percentiles: [0.25, 0.5, 0.75],
336
+ interpolation: "nearest"
337
+ )
338
+ schema = collect_schema.to_h
339
+
340
+ if schema.empty?
341
+ msg = "cannot describe a LazyFrame that has no columns"
342
+ raise TypeError, msg
343
+ end
344
+
345
+ # create list of metrics
346
+ metrics = ["count", "null_count", "mean", "std", "min"]
347
+ if (quantiles = Utils.parse_percentiles(percentiles)).any?
348
+ metrics.concat(quantiles.map { |q| "%g%%" % [q * 100] })
349
+ end
350
+ metrics.append("max")
351
+
352
+ skip_minmax = lambda do |dt|
353
+ dt.nested? || [Categorical, Enum, Null, Object, Unknown].include?(dt)
354
+ end
355
+
356
+ # determine which columns will produce std/mean/percentile/etc
357
+ # statistics in a single pass over the frame schema
358
+ has_numeric_result, sort_cols = Set.new, Set.new
359
+ metric_exprs = []
360
+ null = F.lit(nil)
361
+
362
+ schema.each do |c, dtype|
363
+ is_numeric = dtype.numeric?
364
+ is_temporal = !is_numeric && dtype.temporal?
365
+
366
+ # counts
367
+ count_exprs = [
368
+ F.col(c).count.name.prefix("count:"),
369
+ F.col(c).null_count.name.prefix("null_count:")
370
+ ]
371
+ # mean
372
+ mean_expr =
373
+ if is_temporal || is_numeric || dtype == Boolean
374
+ F.col(c).mean
375
+ else
376
+ null
377
+ end
378
+
379
+ # standard deviation, min, max
380
+ expr_std = is_numeric ? F.col(c).std : null
381
+ min_expr = !skip_minmax.(dtype) ? F.col(c).min : null
382
+ max_expr = !skip_minmax.(dtype) ? F.col(c).max : null
383
+
384
+ # percentiles
385
+ pct_exprs = []
386
+ quantiles.each do |p|
387
+ if is_numeric || is_temporal
388
+ pct_expr =
389
+ if is_temporal
390
+ F.col(c).to_physical.quantile(p, interpolation: interpolation).cast(dtype)
391
+ else
392
+ F.col(c).quantile(p, interpolation: interpolation)
393
+ end
394
+ sort_cols.add(c)
395
+ else
396
+ pct_expr = null
397
+ end
398
+ pct_exprs << pct_expr.alias("#{p}:#{c}")
399
+ end
400
+
401
+ if is_numeric || dtype.nested? || [Null, Boolean].include?(dtype)
402
+ has_numeric_result.add(c)
403
+ end
404
+
405
+ # add column expressions (in end-state 'metrics' list order)
406
+ metric_exprs.concat(
407
+ [
408
+ *count_exprs,
409
+ mean_expr.alias("mean:#{c}"),
410
+ expr_std.alias("std:#{c}"),
411
+ min_expr.alias("min:#{c}"),
412
+ *pct_exprs,
413
+ max_expr.alias("max:#{c}")
414
+ ]
415
+ )
416
+ end
417
+
418
+ # calculate requested metrics in parallel, then collect the result
419
+ df_metrics = (
420
+ (
421
+ # if more than one quantile, sort the relevant columns to make them O(1)
422
+ # TODO: drop sort once we have efficient retrieval of multiple quantiles
423
+ sort_cols ? with_columns(sort_cols.map { |c| F.col(c).sort }) : self
424
+ )
425
+ .select(*metric_exprs)
426
+ .collect
427
+ )
428
+
429
+ # reshape wide result
430
+ n_metrics = metrics.length
431
+ column_metrics =
432
+ schema.length.times.map do |n|
433
+ df_metrics.row(0)[(n * n_metrics)...((n + 1) * n_metrics)]
434
+ end
435
+
436
+ summary = schema.keys.zip(column_metrics).to_h
437
+
438
+ # cast by column type (numeric/bool -> float), (other -> string)
439
+ schema.each_key do |c|
440
+ summary[c] =
441
+ summary[c].map do |v|
442
+ if v.nil? || v.is_a?(Hash)
443
+ nil
444
+ else
445
+ if has_numeric_result.include?(c)
446
+ if v == true
447
+ 1.0
448
+ elsif v == false
449
+ 0.0
450
+ else
451
+ v.to_f
452
+ end
453
+ else
454
+ "#{v}"
455
+ end
456
+ end
457
+ end
458
+ end
459
+
460
+ # return results as a DataFrame
461
+ df_summary = Polars.from_hash(summary)
462
+ df_summary.insert_column(0, Polars::Series.new("statistic", metrics))
463
+ df_summary
265
464
  end
266
465
 
267
- # Create a string representation of the optimized query plan.
466
+ # Create a string representation of the query plan.
467
+ #
468
+ # Different optimizations can be turned on or off.
268
469
  #
269
470
  # @return [String]
270
- def describe_optimized_plan(
271
- type_coercion: true,
272
- predicate_pushdown: true,
273
- projection_pushdown: true,
274
- simplify_expression: true,
275
- slice_pushdown: true,
276
- common_subplan_elimination: true,
277
- comm_subexpr_elim: true,
278
- allow_streaming: false
471
+ #
472
+ # @example
473
+ # lf = Polars::LazyFrame.new(
474
+ # {
475
+ # "a" => ["a", "b", "a", "b", "b", "c"],
476
+ # "b" => [1, 2, 3, 4, 5, 6],
477
+ # "c" => [6, 5, 4, 3, 2, 1]
478
+ # }
479
+ # )
480
+ # lf.group_by("a", maintain_order: true).agg(Polars.all.sum).sort(
481
+ # "a"
482
+ # ).explain
483
+ def explain(
484
+ format: "plain",
485
+ optimized: true,
486
+ engine: "auto",
487
+ optimizations: DEFAULT_QUERY_OPT_FLAGS
279
488
  )
280
- ldf = _ldf.optimization_toggle(
281
- type_coercion,
282
- predicate_pushdown,
283
- projection_pushdown,
284
- simplify_expression,
285
- slice_pushdown,
286
- common_subplan_elimination,
287
- comm_subexpr_elim,
288
- allow_streaming,
289
- false
290
- )
489
+ engine = _select_engine(engine)
490
+
491
+ if engine == "streaming"
492
+ Utils.issue_unstable_warning("streaming mode is considered unstable.")
493
+ end
291
494
 
292
- ldf.describe_optimized_plan
495
+ if optimized
496
+ ldf = _ldf.with_optimizations(optimizations._rboptflags)
497
+ if format == "tree"
498
+ return ldf.describe_optimized_plan_tree
499
+ else
500
+ return ldf.describe_optimized_plan
501
+ end
502
+ end
503
+
504
+ if format == "tree"
505
+ _ldf.describe_plan_tree
506
+ else
507
+ _ldf.describe_plan
508
+ end
293
509
  end
294
510
 
295
- # def show_graph
296
- # end
511
+ # Show a plot of the query plan.
512
+ #
513
+ # Note that Graphviz must be installed to render the visualization (if not
514
+ # already present, you can download it here: https://graphviz.org/download).
515
+ #
516
+ # @param optimized [Boolean]
517
+ # Optimize the query plan.
518
+ # @param show [Boolean]
519
+ # Show the figure.
520
+ # @param output_path [String]
521
+ # Write the figure to disk.
522
+ # @param raw_output [Boolean]
523
+ # Return dot syntax. This cannot be combined with `show` and/or `output_path`.
524
+ # @param engine [String]
525
+ # Select the engine used to process the query, optional.
526
+ # At the moment, if set to `"auto"` (default), the query
527
+ # is run using the polars in-memory engine. Polars will also
528
+ # attempt to use the engine set by the `POLARS_ENGINE_AFFINITY`
529
+ # environment variable. If it cannot run the query using the
530
+ # selected engine, the query is run using the polars in-memory
531
+ # engine.
532
+ # @param plan_stage ['ir', 'physical']
533
+ # Select the stage to display. Currently only the streaming engine has a
534
+ # separate physical stage, for the other engines both IR and physical are the
535
+ # same.
536
+ # @param optimizations [Object]
537
+ # The set of the optimizations considered during query optimization.
538
+ #
539
+ # @return [Object]
540
+ #
541
+ # @example
542
+ # lf = Polars::LazyFrame.new(
543
+ # {
544
+ # "a" => ["a", "b", "a", "b", "b", "c"],
545
+ # "b" => [1, 2, 3, 4, 5, 6],
546
+ # "c" => [6, 5, 4, 3, 2, 1]
547
+ # }
548
+ # )
549
+ # lf.group_by("a", maintain_order: true).agg(Polars.all.sum).sort(
550
+ # "a"
551
+ # ).show_graph
552
+ def show_graph(
553
+ optimized: true,
554
+ show: true,
555
+ output_path: nil,
556
+ raw_output: false,
557
+ figsize: [16.0, 12.0],
558
+ engine: "auto",
559
+ plan_stage: "ir",
560
+ optimizations: DEFAULT_QUERY_OPT_FLAGS
561
+ )
562
+ engine = _select_engine(engine)
563
+
564
+ if engine == "streaming"
565
+ issue_unstable_warning("streaming mode is considered unstable.")
566
+ end
567
+
568
+ optimizations = optimizations.dup
569
+ optimizations._rboptflags.streaming = engine == "streaming"
570
+ _ldf = self._ldf.with_optimizations(optimizations._rboptflags)
571
+
572
+ if plan_stage == "ir"
573
+ dot = _ldf.to_dot(optimized)
574
+ elsif plan_stage == "physical"
575
+ if engine == "streaming"
576
+ dot = _ldf.to_dot_streaming_phys(optimized)
577
+ else
578
+ dot = _ldf.to_dot(optimized)
579
+ end
580
+ else
581
+ error_msg = "invalid plan stage '#{plan_stage}'"
582
+ raise TypeError, error_msg
583
+ end
584
+
585
+ Utils.display_dot_graph(
586
+ dot: dot,
587
+ show: show,
588
+ output_path: output_path,
589
+ raw_output: raw_output
590
+ )
591
+ end
297
592
 
298
593
  # Sort the DataFrame.
299
594
  #
@@ -307,7 +602,7 @@ module Polars
307
602
  # Column (expressions) to sort by.
308
603
  # @param more_by [Array]
309
604
  # Additional columns to sort by, specified as positional arguments.
310
- # @param reverse [Boolean]
605
+ # @param descending [Boolean]
311
606
  # Sort in descending order.
312
607
  # @param nulls_last [Boolean]
313
608
  # Place null values last. Can only be used if sorted by a single column.
@@ -328,7 +623,7 @@ module Polars
328
623
  # "ham" => ["a", "b", "c"]
329
624
  # }
330
625
  # ).lazy
331
- # df.sort("foo", reverse: true).collect
626
+ # df.sort("foo", descending: true).collect
332
627
  # # =>
333
628
  # # shape: (3, 3)
334
629
  # # ┌─────┬─────┬─────┐
@@ -340,21 +635,21 @@ module Polars
340
635
  # # │ 2 ┆ 7.0 ┆ b │
341
636
  # # │ 1 ┆ 6.0 ┆ a │
342
637
  # # └─────┴─────┴─────┘
343
- def sort(by, *more_by, reverse: false, nulls_last: false, maintain_order: false, multithreaded: true)
638
+ def sort(by, *more_by, descending: false, nulls_last: false, maintain_order: false, multithreaded: true)
344
639
  if by.is_a?(::String) && more_by.empty?
345
640
  return _from_rbldf(
346
641
  _ldf.sort(
347
- by, reverse, nulls_last, maintain_order, multithreaded
642
+ by, descending, nulls_last, maintain_order, multithreaded
348
643
  )
349
644
  )
350
645
  end
351
646
 
352
647
  by = Utils.parse_into_list_of_expressions(by, *more_by)
353
- reverse = Utils.extend_bool(reverse, by.length, "reverse", "by")
648
+ descending = Utils.extend_bool(descending, by.length, "descending", "by")
354
649
  nulls_last = Utils.extend_bool(nulls_last, by.length, "nulls_last", "by")
355
650
  _from_rbldf(
356
651
  _ldf.sort_by_exprs(
357
- by, reverse, nulls_last, maintain_order, multithreaded
652
+ by, descending, nulls_last, maintain_order, multithreaded
358
653
  )
359
654
  )
360
655
  end
@@ -440,7 +735,7 @@ module Polars
440
735
  # Accepts expression input. Strings are parsed as column names.
441
736
  # @param reverse [Object]
442
737
  # Consider the `k` smallest elements of the `by` column(s) (instead of the `k`
443
- # largest). This can be specified per column by passing a sequence of
738
+ # largest). This can be specified per column by passing an array of
444
739
  # booleans.
445
740
  #
446
741
  # @return [LazyFrame]
@@ -504,7 +799,7 @@ module Polars
504
799
  # Accepts expression input. Strings are parsed as column names.
505
800
  # @param reverse [Object]
506
801
  # Consider the `k` largest elements of the `by` column(s) (instead of the `k`
507
- # smallest). This can be specified per column by passing a sequence of
802
+ # smallest). This can be specified per column by passing an array of
508
803
  # booleans.
509
804
  #
510
805
  # @return [LazyFrame]
@@ -554,35 +849,90 @@ module Polars
554
849
  _from_rbldf(_ldf.bottom_k(k, by, reverse))
555
850
  end
556
851
 
557
- # def profile
558
- # end
852
+ # Profile a LazyFrame.
853
+ #
854
+ # This will run the query and return a tuple
855
+ # containing the materialized DataFrame and a DataFrame that
856
+ # contains profiling information of each node that is executed.
857
+ #
858
+ # The units of the timings are microseconds.
859
+ #
860
+ # @param engine [String]
861
+ # Select the engine used to process the query, optional.
862
+ # At the moment, if set to `"auto"` (default), the query
863
+ # is run using the polars in-memory engine. Polars will also
864
+ # attempt to use the engine set by the `POLARS_ENGINE_AFFINITY`
865
+ # environment variable. If it cannot run the query using the
866
+ # selected engine, the query is run using the polars in-memory
867
+ # engine.
868
+ # @param optimizations [Object]
869
+ # The optimization passes done during query optimization.
870
+ #
871
+ # @return [Array]
872
+ #
873
+ # @example
874
+ # lf = Polars::LazyFrame.new(
875
+ # {
876
+ # "a" => ["a", "b", "a", "b", "b", "c"],
877
+ # "b" => [1, 2, 3, 4, 5, 6],
878
+ # "c" => [6, 5, 4, 3, 2, 1]
879
+ # }
880
+ # )
881
+ # lf.group_by("a", maintain_order: true).agg(Polars.all.sum).sort(
882
+ # "a"
883
+ # ).profile
884
+ # # =>
885
+ # # [shape: (3, 3)
886
+ # # ┌─────┬─────┬─────┐
887
+ # # │ a ┆ b ┆ c │
888
+ # # │ --- ┆ --- ┆ --- │
889
+ # # │ str ┆ i64 ┆ i64 │
890
+ # # ╞═════╪═════╪═════╡
891
+ # # │ a ┆ 4 ┆ 10 │
892
+ # # │ b ┆ 11 ┆ 10 │
893
+ # # │ c ┆ 6 ┆ 1 │
894
+ # # └─────┴─────┴─────┘,
895
+ # # shape: (3, 3)
896
+ # # ┌──────────────┬───────┬─────┐
897
+ # # │ node ┆ start ┆ end │
898
+ # # │ --- ┆ --- ┆ --- │
899
+ # # │ str ┆ u64 ┆ u64 │
900
+ # # ╞══════════════╪═══════╪═════╡
901
+ # # │ optimization ┆ 0 ┆ 67 │
902
+ # # │ sort(a) ┆ 67 ┆ 79 │
903
+ # # └──────────────┴───────┴─────┘]
904
+ def profile(
905
+ engine: "auto",
906
+ optimizations: DEFAULT_QUERY_OPT_FLAGS
907
+ )
908
+ engine = _select_engine(engine)
909
+
910
+ ldf = _ldf.with_optimizations(optimizations._rboptflags)
911
+
912
+ df_rb, timings_rb = ldf.profile
913
+ [Utils.wrap_df(df_rb), Utils.wrap_df(timings_rb)]
914
+ end
559
915
 
560
- # Collect into a DataFrame.
561
- #
562
- # Note: use {#fetch} if you want to run your query on the first `n` rows
563
- # only. This can be a huge time saver in debugging queries.
564
- #
565
- # @param type_coercion [Boolean]
566
- # Do type coercion optimization.
567
- # @param predicate_pushdown [Boolean]
568
- # Do predicate pushdown optimization.
569
- # @param projection_pushdown [Boolean]
570
- # Do projection pushdown optimization.
571
- # @param simplify_expression [Boolean]
572
- # Run simplify expressions optimization.
573
- # @param string_cache [Boolean]
574
- # This argument is deprecated. Please set the string cache globally.
575
- # The argument will be ignored
576
- # @param no_optimization [Boolean]
577
- # Turn off (certain) optimizations.
578
- # @param slice_pushdown [Boolean]
579
- # Slice pushdown optimization.
580
- # @param common_subplan_elimination [Boolean]
581
- # Will try to cache branching subplans that occur on self-joins or unions.
582
- # @param comm_subexpr_elim [Boolean]
583
- # Common subexpressions will be cached and reused.
584
- # @param allow_streaming [Boolean]
585
- # Run parts of the query in a streaming fashion (this is in an alpha state)
916
+ # Materialize this LazyFrame into a DataFrame.
917
+ #
918
+ # By default, all query optimizations are enabled. Individual optimizations may
919
+ # be disabled by setting the corresponding parameter to `false`.
920
+ #
921
+ # @param engine [String]
922
+ # Select the engine used to process the query, optional.
923
+ # At the moment, if set to `"auto"` (default), the query is run
924
+ # using the polars streaming engine. Polars will also
925
+ # attempt to use the engine set by the `POLARS_ENGINE_AFFINITY`
926
+ # environment variable. If it cannot run the query using the
927
+ # selected engine, the query is run using the polars streaming
928
+ # engine.
929
+ # @param background [Boolean]
930
+ # Run the query in the background and get a handle to the query.
931
+ # This handle can be used to fetch the result or cancel the query.
932
+ # @param optimizations [Object]
933
+ # The optimization passes done during query optimization.
934
+ #
935
+ # This has no effect if `lazy` is set to `true`.
586
936
  #
587
937
  # @return [DataFrame]
588
938
  #
@@ -607,42 +957,23 @@ module Polars
607
957
  # # │ c ┆ 6 ┆ 1 │
608
958
  # # └─────┴─────┴─────┘
609
959
  def collect(
610
- type_coercion: true,
611
- predicate_pushdown: true,
612
- projection_pushdown: true,
613
- simplify_expression: true,
614
- string_cache: false,
615
- no_optimization: false,
616
- slice_pushdown: true,
617
- common_subplan_elimination: true,
618
- comm_subexpr_elim: true,
619
- allow_streaming: false,
620
- _eager: false
960
+ engine: "auto",
961
+ background: false,
962
+ optimizations: DEFAULT_QUERY_OPT_FLAGS
621
963
  )
622
- if no_optimization
623
- predicate_pushdown = false
624
- projection_pushdown = false
625
- slice_pushdown = false
626
- common_subplan_elimination = false
627
- comm_subexpr_elim = false
964
+ engine = _select_engine(engine)
965
+
966
+ if engine == "streaming"
967
+ Utils.issue_unstable_warning("streaming mode is considered unstable.")
628
968
  end
629
969
 
630
- if allow_streaming
631
- common_subplan_elimination = false
970
+ ldf = _ldf.with_optimizations(optimizations._rboptflags)
971
+ if background
972
+ Utils.issue_unstable_warning("background mode is considered unstable.")
973
+ return InProcessQuery.new(ldf.collect_concurrently)
632
974
  end
633
975
 
634
- ldf = _ldf.optimization_toggle(
635
- type_coercion,
636
- predicate_pushdown,
637
- projection_pushdown,
638
- simplify_expression,
639
- slice_pushdown,
640
- common_subplan_elimination,
641
- comm_subexpr_elim,
642
- allow_streaming,
643
- _eager
644
- )
645
- Utils.wrap_df(ldf.collect)
976
+ Utils.wrap_df(ldf.collect(engine))
646
977
  end
647
978
 
648
979
  # Resolve the schema of this LazyFrame.
@@ -705,24 +1036,12 @@ module Polars
705
1036
  # If `nil` (default), the chunks of the `DataFrame` are
706
1037
  # used. Writing in smaller chunks may reduce memory pressure and improve
707
1038
  # writing speeds.
708
- # @param data_pagesize_limit [Integer]
1039
+ # @param data_page_size [Integer]
709
1040
  # Size limit of individual data pages.
710
1041
  # If not set defaults to 1024 * 1024 bytes
711
1042
  # @param maintain_order [Boolean]
712
1043
  # Maintain the order in which data is processed.
713
1044
  # Setting this to `false` will be slightly faster.
714
- # @param type_coercion [Boolean]
715
- # Do type coercion optimization.
716
- # @param predicate_pushdown [Boolean]
717
- # Do predicate pushdown optimization.
718
- # @param projection_pushdown [Boolean]
719
- # Do projection pushdown optimization.
720
- # @param simplify_expression [Boolean]
721
- # Run simplify expressions optimization.
722
- # @param no_optimization [Boolean]
723
- # Turn off (certain) optimizations.
724
- # @param slice_pushdown [Boolean]
725
- # Slice pushdown optimization.
726
1045
  # @param storage_options [String]
727
1046
  # Options that indicate how to connect to a cloud provider.
728
1047
  #
@@ -748,6 +1067,18 @@ module Polars
748
1067
  # Recursively create all the directories in the path.
749
1068
  # @param lazy [Boolean]
750
1069
  # Wait to start execution until `collect` is called.
1070
+ # @param engine
1071
+ # Select the engine used to process the query, optional.
1072
+ # At the moment, if set to `"auto"` (default), the query is run
1073
+ # using the polars streaming engine. Polars will also
1074
+ # attempt to use the engine set by the `POLARS_ENGINE_AFFINITY`
1075
+ # environment variable. If it cannot run the query using the
1076
+ # selected engine, the query is run using the polars streaming
1077
+ # engine.
1078
+ # @param optimizations
1079
+ # The optimization passes done during query optimization.
1080
+ #
1081
+ # This has no effect if `lazy` is set to `true`.
751
1082
  #
752
1083
  # @return [DataFrame]
753
1084
  #
@@ -760,28 +1091,20 @@ module Polars
760
1091
  compression_level: nil,
761
1092
  statistics: true,
762
1093
  row_group_size: nil,
763
- data_pagesize_limit: nil,
1094
+ data_page_size: nil,
764
1095
  maintain_order: true,
765
- type_coercion: true,
766
- predicate_pushdown: true,
767
- projection_pushdown: true,
768
- simplify_expression: true,
769
- no_optimization: false,
770
- slice_pushdown: true,
771
1096
  storage_options: nil,
1097
+ credential_provider: "auto",
772
1098
  retries: 2,
773
1099
  sync_on_close: nil,
1100
+ metadata: nil,
774
1101
  mkdir: false,
775
- lazy: false
1102
+ lazy: false,
1103
+ field_overwrites: nil,
1104
+ engine: "auto",
1105
+ optimizations: DEFAULT_QUERY_OPT_FLAGS
776
1106
  )
777
- lf = _set_sink_optimizations(
778
- type_coercion: type_coercion,
779
- predicate_pushdown: predicate_pushdown,
780
- projection_pushdown: projection_pushdown,
781
- simplify_expression: simplify_expression,
782
- slice_pushdown: slice_pushdown,
783
- no_optimization: no_optimization
784
- )
1107
+ engine = _select_engine(engine, path)
785
1108
 
786
1109
  if statistics == true
787
1110
  statistics = {
@@ -801,6 +1124,12 @@ module Polars
801
1124
  }
802
1125
  end
803
1126
 
1127
+ _init_credential_provider_builder = Polars.method(:_init_credential_provider_builder)
1128
+
1129
+ credential_provider_builder = _init_credential_provider_builder.(
1130
+ credential_provider, path, storage_options, "sink_parquet"
1131
+ )
1132
+
804
1133
  if storage_options&.any?
805
1134
  storage_options = storage_options.to_a
806
1135
  else
@@ -813,24 +1142,33 @@ module Polars
813
1142
  "mkdir" => mkdir
814
1143
  }
815
1144
 
816
- lf = lf.sink_parquet(
1145
+ field_overwrites_dicts = []
1146
+ if !field_overwrites.nil?
1147
+ raise Todo
1148
+ end
1149
+
1150
+ ldf_rb = _ldf.sink_parquet(
817
1151
  path,
818
1152
  compression,
819
1153
  compression_level,
820
1154
  statistics,
821
1155
  row_group_size,
822
- data_pagesize_limit,
1156
+ data_page_size,
823
1157
  storage_options,
1158
+ credential_provider_builder,
824
1159
  retries,
825
- sink_options
1160
+ sink_options,
1161
+ metadata,
1162
+ field_overwrites_dicts
826
1163
  )
827
- lf = LazyFrame._from_rbldf(lf)
828
1164
 
829
1165
  if !lazy
830
- lf.collect
1166
+ ldf_rb = ldf_rb.with_optimizations(optimizations._rboptflags)
1167
+ ldf = LazyFrame._from_rbldf(ldf_rb)
1168
+ ldf.collect(engine: engine)
831
1169
  return nil
832
1170
  end
833
- lf
1171
+ LazyFrame._from_rbldf(ldf_rb)
834
1172
  end
835
1173
 
836
1174
  # Evaluate the query in streaming mode and write to an IPC file.
@@ -860,18 +1198,6 @@ module Polars
860
1198
  # information from environment variables.
861
1199
  # @param retries [Integer]
862
1200
  # Number of retries if accessing a cloud instance fails.
863
- # @param type_coercion [Boolean]
864
- # Do type coercion optimization.
865
- # @param predicate_pushdown [Boolean]
866
- # Do predicate pushdown optimization.
867
- # @param projection_pushdown [Boolean]
868
- # Do projection pushdown optimization.
869
- # @param simplify_expression [Boolean]
870
- # Run simplify expressions optimization.
871
- # @param slice_pushdown [Boolean]
872
- # Slice pushdown optimization.
873
- # @param no_optimization [Boolean]
874
- # Turn off (certain) optimizations.
875
1201
  # @param sync_on_close ['data', 'all']
876
1202
  # Sync to disk when before closing a file.
877
1203
  #
@@ -882,6 +1208,18 @@ module Polars
882
1208
  # Recursively create all the directories in the path.
883
1209
  # @param lazy [Boolean]
884
1210
  # Wait to start execution until `collect` is called.
1211
+ # @param engine
1212
+ # Select the engine used to process the query, optional.
1213
+ # At the moment, if set to `"auto"` (default), the query is run
1214
+ # using the polars streaming engine. Polars will also
1215
+ # attempt to use the engine set by the `POLARS_ENGINE_AFFINITY`
1216
+ # environment variable. If it cannot run the query using the
1217
+ # selected engine, the query is run using the polars streaming
1218
+ # engine.
1219
+ # @param optimizations
1220
+ # The optimization passes done during query optimization.
1221
+ #
1222
+ # This has no effect if `lazy` is set to `true`.
885
1223
  #
886
1224
  # @return [DataFrame]
887
1225
  #
@@ -890,27 +1228,24 @@ module Polars
890
1228
  # lf.sink_ipc("out.arrow")
891
1229
  def sink_ipc(
892
1230
  path,
893
- compression: "zstd",
1231
+ compression: "uncompressed",
1232
+ compat_level: nil,
894
1233
  maintain_order: true,
895
1234
  storage_options: nil,
1235
+ credential_provider: "auto",
896
1236
  retries: 2,
897
- type_coercion: true,
898
- predicate_pushdown: true,
899
- projection_pushdown: true,
900
- simplify_expression: true,
901
- slice_pushdown: true,
902
- no_optimization: false,
903
1237
  sync_on_close: nil,
904
1238
  mkdir: false,
905
- lazy: false
1239
+ lazy: false,
1240
+ engine: "auto",
1241
+ optimizations: DEFAULT_QUERY_OPT_FLAGS
906
1242
  )
907
- lf = _set_sink_optimizations(
908
- type_coercion: type_coercion,
909
- predicate_pushdown: predicate_pushdown,
910
- projection_pushdown: projection_pushdown,
911
- simplify_expression: simplify_expression,
912
- slice_pushdown: slice_pushdown,
913
- no_optimization: no_optimization
1243
+ engine = _select_engine(engine, path)
1244
+
1245
+ _init_credential_provider_builder = Polars.method(:_init_credential_provider_builder)
1246
+
1247
+ credential_provider_builder = _init_credential_provider_builder.(
1248
+ credential_provider, path, storage_options, "sink_ipc"
914
1249
  )
915
1250
 
916
1251
  if storage_options&.any?
@@ -925,20 +1260,34 @@ module Polars
925
1260
  "mkdir" => mkdir
926
1261
  }
927
1262
 
928
- lf = lf.sink_ipc(
1263
+ compat_level_rb = nil
1264
+ if compat_level.nil?
1265
+ compat_level_rb = true
1266
+ else
1267
+ raise Todo
1268
+ end
1269
+
1270
+ if compression.nil?
1271
+ compression = "uncompressed"
1272
+ end
1273
+
1274
+ ldf_rb = _ldf.sink_ipc(
929
1275
  path,
930
1276
  compression,
1277
+ compat_level_rb,
931
1278
  storage_options,
1279
+ credential_provider_builder,
932
1280
  retries,
933
1281
  sink_options
934
1282
  )
935
- lf = LazyFrame._from_rbldf(lf)
936
1283
 
937
1284
  if !lazy
938
- lf.collect
1285
+ ldf_rb = ldf_rb.with_optimizations(optimizations._rboptflags)
1286
+ ldf = LazyFrame._from_rbldf(ldf_rb)
1287
+ ldf.collect(engine: engine)
939
1288
  return nil
940
1289
  end
941
- lf
1290
+ LazyFrame._from_rbldf(ldf_rb)
942
1291
  end
943
1292
 
944
1293
  # Evaluate the query in streaming mode and write to a CSV file.
@@ -1004,32 +1353,32 @@ module Polars
1004
1353
  # @param maintain_order [Boolean]
1005
1354
  # Maintain the order in which data is processed.
1006
1355
  # Setting this to `false` will be slightly faster.
1007
- # @param type_coercion [Boolean]
1008
- # Do type coercion optimization.
1009
- # @param predicate_pushdown [Boolean]
1010
- # Do predicate pushdown optimization.
1011
- # @param projection_pushdown [Boolean]
1012
- # Do projection pushdown optimization.
1013
- # @param simplify_expression [Boolean]
1014
- # Run simplify expressions optimization.
1015
- # @param slice_pushdown [Boolean]
1016
- # Slice pushdown optimization.
1017
- # @param no_optimization [Boolean]
1018
- # Turn off (certain) optimizations.
1019
1356
  # @param storage_options [Object]
1020
1357
  # Options that indicate how to connect to a cloud provider.
1021
1358
  # @param retries [Integer]
1022
1359
  # Number of retries if accessing a cloud instance fails.
1023
1360
  # @param sync_on_close ['data', 'all']
1024
- # Sync to disk when before closing a file.
1361
+ # Sync to disk when before closing a file.
1025
1362
  #
1026
- # * `nil` does not sync.
1027
- # * `data` syncs the file contents.
1028
- # * `all` syncs the file contents and metadata.
1363
+ # * `nil` does not sync.
1364
+ # * `data` syncs the file contents.
1365
+ # * `all` syncs the file contents and metadata.
1029
1366
  # @param mkdir [Boolean]
1030
- # Recursively create all the directories in the path.
1367
+ # Recursively create all the directories in the path.
1031
1368
  # @param lazy [Boolean]
1032
- # Wait to start execution until `collect` is called.
1369
+ # Wait to start execution until `collect` is called.
1370
+ # @param engine
1371
+ # Select the engine used to process the query, optional.
1372
+ # At the moment, if set to `"auto"` (default), the query is run
1373
+ # using the polars streaming engine. Polars will also
1374
+ # attempt to use the engine set by the `POLARS_ENGINE_AFFINITY`
1375
+ # environment variable. If it cannot run the query using the
1376
+ # selected engine, the query is run using the polars streaming
1377
+ # engine.
1378
+ # @param optimizations
1379
+ # The optimization passes done during query optimization.
1380
+ #
1381
+ # This has no effect if `lazy` is set to `true`.
1033
1382
  #
1034
1383
  # @return [DataFrame]
1035
1384
  #
@@ -1053,28 +1402,23 @@ module Polars
1053
1402
  null_value: nil,
1054
1403
  quote_style: nil,
1055
1404
  maintain_order: true,
1056
- type_coercion: true,
1057
- predicate_pushdown: true,
1058
- projection_pushdown: true,
1059
- simplify_expression: true,
1060
- slice_pushdown: true,
1061
- no_optimization: false,
1062
1405
  storage_options: nil,
1406
+ credential_provider: "auto",
1063
1407
  retries: 2,
1064
1408
  sync_on_close: nil,
1065
1409
  mkdir: false,
1066
- lazy: false
1410
+ lazy: false,
1411
+ engine: "auto",
1412
+ optimizations: DEFAULT_QUERY_OPT_FLAGS
1067
1413
  )
1068
1414
  Utils._check_arg_is_1byte("separator", separator, false)
1069
1415
  Utils._check_arg_is_1byte("quote_char", quote_char, false)
1416
+ engine = _select_engine(engine, path)
1070
1417
 
1071
- lf = _set_sink_optimizations(
1072
- type_coercion: type_coercion,
1073
- predicate_pushdown: predicate_pushdown,
1074
- projection_pushdown: projection_pushdown,
1075
- simplify_expression: simplify_expression,
1076
- slice_pushdown: slice_pushdown,
1077
- no_optimization: no_optimization
1418
+ _init_credential_provider_builder = Polars.method(:_init_credential_provider_builder)
1419
+
1420
+ credential_provider_builder = _init_credential_provider_builder.(
1421
+ credential_provider, path, storage_options, "sink_csv"
1078
1422
  )
1079
1423
 
1080
1424
  if storage_options&.any?
@@ -1089,7 +1433,7 @@ module Polars
1089
1433
  "mkdir" => mkdir
1090
1434
  }
1091
1435
 
1092
- lf = lf.sink_csv(
1436
+ ldf_rb = _ldf.sink_csv(
1093
1437
  path,
1094
1438
  include_bom,
1095
1439
  include_header,
@@ -1106,16 +1450,18 @@ module Polars
1106
1450
  null_value,
1107
1451
  quote_style,
1108
1452
  storage_options,
1453
+ credential_provider_builder,
1109
1454
  retries,
1110
1455
  sink_options
1111
1456
  )
1112
- lf = LazyFrame._from_rbldf(lf)
1113
1457
 
1114
1458
  if !lazy
1115
- lf.collect
1459
+ ldf_rb = ldf_rb.with_optimizations(optimizations._rboptflags)
1460
+ ldf = LazyFrame._from_rbldf(ldf_rb)
1461
+ ldf.collect(engine: engine)
1116
1462
  return nil
1117
1463
  end
1118
- lf
1464
+ LazyFrame._from_rbldf(ldf_rb)
1119
1465
  end
1120
1466
 
1121
1467
  # Evaluate the query in streaming mode and write to an NDJSON file.
@@ -1127,18 +1473,6 @@ module Polars
1127
1473
  # @param maintain_order [Boolean]
1128
1474
  # Maintain the order in which data is processed.
1129
1475
  # Setting this to `false` will be slightly faster.
1130
- # @param type_coercion [Boolean]
1131
- # Do type coercion optimization.
1132
- # @param predicate_pushdown [Boolean]
1133
- # Do predicate pushdown optimization.
1134
- # @param projection_pushdown [Boolean]
1135
- # Do projection pushdown optimization.
1136
- # @param simplify_expression [Boolean]
1137
- # Run simplify expressions optimization.
1138
- # @param slice_pushdown [Boolean]
1139
- # Slice pushdown optimization.
1140
- # @param no_optimization [Boolean]
1141
- # Turn off (certain) optimizations.
1142
1476
  # @param storage_options [String]
1143
1477
  # Options that indicate how to connect to a cloud provider.
1144
1478
  #
@@ -1164,6 +1498,18 @@ module Polars
1164
1498
  # Recursively create all the directories in the path.
1165
1499
  # @param lazy [Boolean]
1166
1500
  # Wait to start execution until `collect` is called.
1501
+ # @param engine
1502
+ # Select the engine used to process the query, optional.
1503
+ # At the moment, if set to `"auto"` (default), the query is run
1504
+ # using the polars streaming engine. Polars will also
1505
+ # attempt to use the engine set by the `POLARS_ENGINE_AFFINITY`
1506
+ # environment variable. If it cannot run the query using the
1507
+ # selected engine, the query is run using the polars streaming
1508
+ # engine.
1509
+ # @param optimizations
1510
+ # The optimization passes done during query optimization.
1511
+ #
1512
+ # This has no effect if `lazy` is set to `true`.
1167
1513
  #
1168
1514
  # @return [DataFrame]
1169
1515
  #
@@ -1173,25 +1519,21 @@ module Polars
1173
1519
  def sink_ndjson(
1174
1520
  path,
1175
1521
  maintain_order: true,
1176
- type_coercion: true,
1177
- predicate_pushdown: true,
1178
- projection_pushdown: true,
1179
- simplify_expression: true,
1180
- slice_pushdown: true,
1181
- no_optimization: false,
1182
1522
  storage_options: nil,
1523
+ credential_provider: "auto",
1183
1524
  retries: 2,
1184
1525
  sync_on_close: nil,
1185
1526
  mkdir: false,
1186
- lazy: false
1527
+ lazy: false,
1528
+ engine: "auto",
1529
+ optimizations: DEFAULT_QUERY_OPT_FLAGS
1187
1530
  )
1188
- lf = _set_sink_optimizations(
1189
- type_coercion: type_coercion,
1190
- predicate_pushdown: predicate_pushdown,
1191
- projection_pushdown: projection_pushdown,
1192
- simplify_expression: simplify_expression,
1193
- slice_pushdown: slice_pushdown,
1194
- no_optimization: no_optimization
1531
+ engine = _select_engine(engine, path)
1532
+
1533
+ _init_credential_provider_builder = Polars.method(:_init_credential_provider_builder)
1534
+
1535
+ credential_provider_builder = _init_credential_provider_builder.(
1536
+ credential_provider, path, storage_options, "sink_ndjson"
1195
1537
  )
1196
1538
 
1197
1539
  if storage_options&.any?
@@ -1206,80 +1548,21 @@ module Polars
1206
1548
  "mkdir" => mkdir
1207
1549
  }
1208
1550
 
1209
- lf = lf.sink_json(path, storage_options, retries, sink_options)
1210
- lf = LazyFrame._from_rbldf(lf)
1551
+ ldf_rb = _ldf.sink_json(
1552
+ path,
1553
+ storage_options,
1554
+ credential_provider_builder,
1555
+ retries,
1556
+ sink_options
1557
+ )
1211
1558
 
1212
1559
  if !lazy
1213
- lf.collect
1560
+ ldf_rb = ldf_rb.with_optimizations(optimizations._rboptflags)
1561
+ ldf = LazyFrame._from_rbldf(ldf_rb)
1562
+ ldf.collect(engine: engine)
1214
1563
  return nil
1215
1564
  end
1216
- lf
1217
- end
1218
-
1219
- # @private
1220
- def _set_sink_optimizations(
1221
- type_coercion: true,
1222
- predicate_pushdown: true,
1223
- projection_pushdown: true,
1224
- simplify_expression: true,
1225
- slice_pushdown: true,
1226
- no_optimization: false
1227
- )
1228
- if no_optimization
1229
- predicate_pushdown = false
1230
- projection_pushdown = false
1231
- slice_pushdown = false
1232
- end
1233
-
1234
- _ldf.optimization_toggle(
1235
- type_coercion,
1236
- predicate_pushdown,
1237
- projection_pushdown,
1238
- simplify_expression,
1239
- slice_pushdown,
1240
- false,
1241
- false,
1242
- true,
1243
- false
1244
- )
1245
- end
1246
-
1247
- # Collect a small number of rows for debugging purposes.
1248
- #
1249
- # Fetch is like a {#collect} operation, but it overwrites the number of rows
1250
- # read by every scan operation. This is a utility that helps debug a query on a
1251
- # smaller number of rows.
1252
- #
1253
- # Note that the fetch does not guarantee the final number of rows in the
1254
- # DataFrame. Filter, join operations and a lower number of rows available in the
1255
- # scanned file influence the final number of rows.
1256
- #
1257
- # @param n_rows [Integer]
1258
- # Collect n_rows from the data sources.
1259
- #
1260
- # @return [DataFrame]
1261
- #
1262
- # @example
1263
- # df = Polars::DataFrame.new(
1264
- # {
1265
- # "a" => ["a", "b", "a", "b", "b", "c"],
1266
- # "b" => [1, 2, 3, 4, 5, 6],
1267
- # "c" => [6, 5, 4, 3, 2, 1]
1268
- # }
1269
- # ).lazy
1270
- # df.group_by("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
1271
- # # =>
1272
- # # shape: (2, 3)
1273
- # # ┌─────┬─────┬─────┐
1274
- # # │ a ┆ b ┆ c │
1275
- # # │ --- ┆ --- ┆ --- │
1276
- # # │ str ┆ i64 ┆ i64 │
1277
- # # ╞═════╪═════╪═════╡
1278
- # # │ a ┆ 4 ┆ 10 │
1279
- # # │ b ┆ 11 ┆ 10 │
1280
- # # └─────┴─────┴─────┘
1281
- def fetch(n_rows = 500, **kwargs)
1282
- head(n_rows).collect(**kwargs)
1565
+ LazyFrame._from_rbldf(ldf_rb)
1283
1566
  end
1284
1567
 
1285
1568
  # Return lazy representation, i.e. itself.
@@ -1388,7 +1671,7 @@ module Polars
1388
1671
  # "c" => [true, true, false, nil],
1389
1672
  # }
1390
1673
  # ).lazy
1391
- # lf.clear.fetch
1674
+ # lf.clear.collect
1392
1675
  # # =>
1393
1676
  # # shape: (0, 3)
1394
1677
  # # ┌─────┬─────┬──────┐
@@ -1399,7 +1682,7 @@ module Polars
1399
1682
  # # └─────┴─────┴──────┘
1400
1683
  #
1401
1684
  # @example
1402
- # lf.clear(2).fetch
1685
+ # lf.clear(2).collect
1403
1686
  # # =>
1404
1687
  # # shape: (2, 3)
1405
1688
  # # ┌──────┬──────┬──────┐
@@ -1413,24 +1696,82 @@ module Polars
1413
1696
  def clear(n = 0)
1414
1697
  DataFrame.new(schema: schema).clear(n).lazy
1415
1698
  end
1416
- alias_method :cleared, :clear
1417
1699
 
1418
1700
  # Filter the rows in the DataFrame based on a predicate expression.
1419
1701
  #
1420
- # @param predicate [Object]
1421
- # Expression that evaluates to a boolean Series.
1702
+ # @param predicates [Array]
1703
+ # Expression(s) that evaluate to a boolean Series.
1704
+ # @param constraints [Hash]
1705
+ # Column filters; use `name = value` to filter columns using the supplied
1706
+ # value. Each constraint behaves the same as `Polars.col(name).eq(value)`,
1707
+ # and is implicitly joined with the other filter conditions using `&`.
1422
1708
  #
1423
1709
  # @return [LazyFrame]
1424
1710
  #
1425
1711
  # @example Filter on one condition:
1426
- # lf = Polars::DataFrame.new(
1712
+ # lf = Polars::LazyFrame.new(
1427
1713
  # {
1428
- # "foo" => [1, 2, 3],
1429
- # "bar" => [6, 7, 8],
1430
- # "ham" => ["a", "b", "c"]
1714
+ # "foo" => [1, 2, 3, nil, 4, nil, 0],
1715
+ # "bar" => [6, 7, 8, nil, nil, 9, 0],
1716
+ # "ham" => ["a", "b", "c", nil, "d", "e", "f"]
1431
1717
  # }
1432
- # ).lazy
1433
- # lf.filter(Polars.col("foo") < 3).collect
1718
+ # )
1719
+ # lf.filter(Polars.col("foo") > 1).collect
1720
+ # # =>
1721
+ # # shape: (3, 3)
1722
+ # # ┌─────┬──────┬─────┐
1723
+ # # │ foo ┆ bar ┆ ham │
1724
+ # # │ --- ┆ --- ┆ --- │
1725
+ # # │ i64 ┆ i64 ┆ str │
1726
+ # # ╞═════╪══════╪═════╡
1727
+ # # │ 2 ┆ 7 ┆ b │
1728
+ # # │ 3 ┆ 8 ┆ c │
1729
+ # # │ 4 ┆ null ┆ d │
1730
+ # # └─────┴──────┴─────┘
1731
+ #
1732
+ # @example Filter on multiple conditions:
1733
+ # lf.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a")).collect
1734
+ # # =>
1735
+ # # shape: (1, 3)
1736
+ # # ┌─────┬─────┬─────┐
1737
+ # # │ foo ┆ bar ┆ ham │
1738
+ # # │ --- ┆ --- ┆ --- │
1739
+ # # │ i64 ┆ i64 ┆ str │
1740
+ # # ╞═════╪═════╪═════╡
1741
+ # # │ 1 ┆ 6 ┆ a │
1742
+ # # └─────┴─────┴─────┘
1743
+ #
1744
+ # @example Provide multiple filters using `*args` syntax:
1745
+ # lf.filter(
1746
+ # Polars.col("foo") == 1,
1747
+ # Polars.col("ham") == "a"
1748
+ # ).collect
1749
+ # # =>
1750
+ # # shape: (1, 3)
1751
+ # # ┌─────┬─────┬─────┐
1752
+ # # │ foo ┆ bar ┆ ham │
1753
+ # # │ --- ┆ --- ┆ --- │
1754
+ # # │ i64 ┆ i64 ┆ str │
1755
+ # # ╞═════╪═════╪═════╡
1756
+ # # │ 1 ┆ 6 ┆ a │
1757
+ # # └─────┴─────┴─────┘
1758
+ #
1759
+ # @example Provide multiple filters using `**kwargs` syntax:
1760
+ # lf.filter(foo: 1, ham: "a").collect
1761
+ # # =>
1762
+ # # shape: (1, 3)
1763
+ # # ┌─────┬─────┬─────┐
1764
+ # # │ foo ┆ bar ┆ ham │
1765
+ # # │ --- ┆ --- ┆ --- │
1766
+ # # │ i64 ┆ i64 ┆ str │
1767
+ # # ╞═════╪═════╪═════╡
1768
+ # # │ 1 ┆ 6 ┆ a │
1769
+ # # └─────┴─────┴─────┘
1770
+ #
1771
+ # @example Filter on an OR condition:
1772
+ # lf.filter(
1773
+ # (Polars.col("foo") == 1) | (Polars.col("ham") == "c")
1774
+ # ).collect
1434
1775
  # # =>
1435
1776
  # # shape: (2, 3)
1436
1777
  # # ┌─────┬─────┬─────┐
@@ -1439,11 +1780,13 @@ module Polars
1439
1780
  # # │ i64 ┆ i64 ┆ str │
1440
1781
  # # ╞═════╪═════╪═════╡
1441
1782
  # # │ 1 ┆ 6 ┆ a │
1442
- # # │ 27b
1783
+ # # │ 38c
1443
1784
  # # └─────┴─────┴─────┘
1444
1785
  #
1445
- # @example Filter on multiple conditions:
1446
- # lf.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a")).collect
1786
+ # @example Filter by comparing two columns against each other
1787
+ # lf.filter(
1788
+ # Polars.col("foo") == Polars.col("bar")
1789
+ # ).collect
1447
1790
  # # =>
1448
1791
  # # shape: (1, 3)
1449
1792
  # # ┌─────┬─────┬─────┐
@@ -1451,13 +1794,39 @@ module Polars
1451
1794
  # # │ --- ┆ --- ┆ --- │
1452
1795
  # # │ i64 ┆ i64 ┆ str │
1453
1796
  # # ╞═════╪═════╪═════╡
1797
+ # # │ 0 ┆ 0 ┆ f │
1798
+ # # └─────┴─────┴─────┘
1799
+ #
1800
+ # @example
1801
+ # lf.filter(
1802
+ # Polars.col("foo") != Polars.col("bar")
1803
+ # ).collect
1804
+ # # =>
1805
+ # # shape: (3, 3)
1806
+ # # ┌─────┬─────┬─────┐
1807
+ # # │ foo ┆ bar ┆ ham │
1808
+ # # │ --- ┆ --- ┆ --- │
1809
+ # # │ i64 ┆ i64 ┆ str │
1810
+ # # ╞═════╪═════╪═════╡
1454
1811
  # # │ 1 ┆ 6 ┆ a │
1812
+ # # │ 2 ┆ 7 ┆ b │
1813
+ # # │ 3 ┆ 8 ┆ c │
1455
1814
  # # └─────┴─────┴─────┘
1456
- def filter(predicate)
1457
- _from_rbldf(
1458
- _ldf.filter(
1459
- Utils.parse_into_expression(predicate, str_as_lit: false)
1460
- )
1815
+ def filter(*predicates, **constraints)
1816
+ if constraints.empty?
1817
+ # early-exit conditions (exclude/include all rows)
1818
+ if predicates.empty? || (predicates.length == 1 && predicates[0].is_a?(TrueClass))
1819
+ return dup
1820
+ end
1821
+ if predicates.length == 1 && predicates[0].is_a?(FalseClass)
1822
+ return clear
1823
+ end
1824
+ end
1825
+
1826
+ _filter(
1827
+ predicates: predicates,
1828
+ constraints: constraints,
1829
+ invert: false
1461
1830
  )
1462
1831
  end
1463
1832
 
@@ -1752,13 +2121,9 @@ module Polars
1752
2121
  lgb = _ldf.group_by(exprs, maintain_order)
1753
2122
  LazyGroupBy.new(lgb)
1754
2123
  end
1755
- alias_method :groupby, :group_by
1756
- alias_method :group, :group_by
1757
2124
 
1758
2125
  # Create rolling groups based on a time column.
1759
2126
  #
1760
- # Also works for index values of type `:i32` or `:i64`.
1761
- #
1762
2127
  # Different from a `dynamic_group_by` the windows are now determined by the
1763
2128
  # individual values and are not of constant intervals. For constant intervals
1764
2129
  # use *group_by_dynamic*.
@@ -1793,15 +2158,15 @@ module Polars
1793
2158
  # make sense.
1794
2159
  #
1795
2160
  # In case of a rolling group by on indices, dtype needs to be one of
1796
- # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
1797
- # performance matters use an `:i64` column.
2161
+ # \\\\{UInt32, UInt64, Int32, Int64}. Note that the first three get temporarily
2162
+ # cast to Int64, so if performance matters use an Int64 column.
1798
2163
  # @param period [Object]
1799
2164
  # Length of the window.
1800
2165
  # @param offset [Object]
1801
2166
  # Offset of the window. Default is -period.
1802
2167
  # @param closed ["right", "left", "both", "none"]
1803
2168
  # Define whether the temporal window interval is closed or not.
1804
- # @param by [Object]
2169
+ # @param group_by [Object]
1805
2170
  # Also group by this column/these columns.
1806
2171
  #
1807
2172
  # @return [LazyFrame]
@@ -1815,7 +2180,7 @@ module Polars
1815
2180
  # "2020-01-03 19:45:32",
1816
2181
  # "2020-01-08 23:16:43"
1817
2182
  # ]
1818
- # df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
2183
+ # df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_columns(
1819
2184
  # Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
1820
2185
  # )
1821
2186
  # df.rolling(index_column: "dt", period: "2d").agg(
@@ -1844,7 +2209,7 @@ module Polars
1844
2209
  period:,
1845
2210
  offset: nil,
1846
2211
  closed: "right",
1847
- by: nil
2212
+ group_by: nil
1848
2213
  )
1849
2214
  index_column = Utils.parse_into_expression(index_column)
1850
2215
  if offset.nil?
@@ -1852,7 +2217,7 @@ module Polars
1852
2217
  end
1853
2218
 
1854
2219
  rbexprs_by = (
1855
- !by.nil? ? Utils.parse_into_list_of_expressions(by) : []
2220
+ !group_by.nil? ? Utils.parse_into_list_of_expressions(group_by) : []
1856
2221
  )
1857
2222
  period = Utils.parse_as_duration_string(period)
1858
2223
  offset = Utils.parse_as_duration_string(offset)
@@ -1860,10 +2225,8 @@ module Polars
1860
2225
  lgb = _ldf.rolling(index_column, period, offset, closed, rbexprs_by)
1861
2226
  LazyGroupBy.new(lgb)
1862
2227
  end
1863
- alias_method :group_by_rolling, :rolling
1864
- alias_method :groupby_rolling, :rolling
1865
2228
 
1866
- # Group based on a time value (or index value of type `:i32`, `:i64`).
2229
+ # Group based on a time value (or index value of type Int32, Int64).
1867
2230
  #
1868
2231
  # Time windows are calculated and rows are assigned to windows. Different from a
1869
2232
  # normal group by is that a row can be member of multiple groups. The time/index
@@ -1906,8 +2269,8 @@ module Polars
1906
2269
  # make sense.
1907
2270
  #
1908
2271
  # In case of a dynamic group by on indices, dtype needs to be one of
1909
- # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
1910
- # performance matters use an `:i64` column.
2272
+ # \\\\{Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if
2273
+ # performance matters use an Int64 column.
1911
2274
  # @param every [Object]
1912
2275
  # Interval of the window.
1913
2276
  # @param period [Object]
@@ -1915,8 +2278,6 @@ module Polars
1915
2278
  # @param offset [Object]
1916
2279
  # Offset of the window if nil and period is nil it will be equal to negative
1917
2280
  # `every`.
1918
- # @param truncate [Boolean]
1919
- # Truncate the time value to the window lower bound.
1920
2281
  # @param include_boundaries [Boolean]
1921
2282
  # Add the lower and upper bound of the window to the "_lower_bound" and
1922
2283
  # "_upper_bound" columns. This will impact performance because it's harder to
@@ -1931,7 +2292,7 @@ module Polars
1931
2292
  # - 'datapoint': the first value of the index column in the given window.
1932
2293
  # If you don't need the label to be at one of the boundaries, choose this
1933
2294
  # option for maximum performance
1934
- # @param by [Object]
2295
+ # @param group_by [Object]
1935
2296
  # Also group by this column/these columns
1936
2297
  # @param start_by ['window', 'datapoint', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
1937
2298
  # The strategy to determine the start of the first window by.
@@ -2073,7 +2434,7 @@ module Polars
2073
2434
  # "time",
2074
2435
  # every: "1h",
2075
2436
  # closed: "both",
2076
- # by: "groups",
2437
+ # group_by: "groups",
2077
2438
  # include_boundaries: true
2078
2439
  # ).agg([Polars.col("time").count.alias("time_count")])
2079
2440
  # # =>
@@ -2123,17 +2484,12 @@ module Polars
2123
2484
  every:,
2124
2485
  period: nil,
2125
2486
  offset: nil,
2126
- truncate: nil,
2127
2487
  include_boundaries: false,
2128
2488
  closed: "left",
2129
2489
  label: "left",
2130
- by: nil,
2490
+ group_by: nil,
2131
2491
  start_by: "window"
2132
2492
  )
2133
- if !truncate.nil?
2134
- label = truncate ? "left" : "datapoint"
2135
- end
2136
-
2137
2493
  index_column = Utils.parse_into_expression(index_column, str_as_lit: false)
2138
2494
  if offset.nil?
2139
2495
  offset = period.nil? ? "-#{every}" : "0ns"
@@ -2147,7 +2503,7 @@ module Polars
2147
2503
  offset = Utils.parse_as_duration_string(offset)
2148
2504
  every = Utils.parse_as_duration_string(every)
2149
2505
 
2150
- rbexprs_by = by.nil? ? [] : Utils.parse_into_list_of_expressions(by)
2506
+ rbexprs_by = group_by.nil? ? [] : Utils.parse_into_list_of_expressions(group_by)
2151
2507
  lgb = _ldf.group_by_dynamic(
2152
2508
  index_column,
2153
2509
  every,
@@ -2161,7 +2517,6 @@ module Polars
2161
2517
  )
2162
2518
  LazyGroupBy.new(lgb)
2163
2519
  end
2164
- alias_method :groupby_dynamic, :group_by_dynamic
2165
2520
 
2166
2521
  # Perform an asof join.
2167
2522
  #
@@ -2521,7 +2876,7 @@ module Polars
2521
2876
  # * *one_to_one* - “1:1”: check if join keys are unique in both left and right datasets
2522
2877
  # * *one_to_many* - “1:m”: check if join keys are unique in left dataset
2523
2878
  # * *many_to_one* - “m:1”: check if join keys are unique in right dataset
2524
- # @param join_nulls [Boolean]
2879
+ # @param nulls_equal [Boolean]
2525
2880
  # Join on null values. By default null values will never produce matches.
2526
2881
  # @param allow_parallel [Boolean]
2527
2882
  # Allow the physical plan to optionally evaluate the computation of both
@@ -2643,7 +2998,7 @@ module Polars
2643
2998
  how: "inner",
2644
2999
  suffix: "_right",
2645
3000
  validate: "m:m",
2646
- join_nulls: false,
3001
+ nulls_equal: false,
2647
3002
  allow_parallel: true,
2648
3003
  force_parallel: false,
2649
3004
  coalesce: nil,
@@ -2666,7 +3021,7 @@ module Polars
2666
3021
  [],
2667
3022
  [],
2668
3023
  allow_parallel,
2669
- join_nulls,
3024
+ nulls_equal,
2670
3025
  force_parallel,
2671
3026
  how,
2672
3027
  suffix,
@@ -2695,7 +3050,7 @@ module Polars
2695
3050
  rbexprs_right,
2696
3051
  allow_parallel,
2697
3052
  force_parallel,
2698
- join_nulls,
3053
+ nulls_equal,
2699
3054
  how,
2700
3055
  suffix,
2701
3056
  validate,
@@ -2875,87 +3230,6 @@ module Polars
2875
3230
  _from_rbldf(_ldf.with_columns_seq(rbexprs))
2876
3231
  end
2877
3232
 
2878
- # Add an external context to the computation graph.
2879
- #
2880
- # This allows expressions to also access columns from DataFrames
2881
- # that are not part of this one.
2882
- #
2883
- # @param other [Object]
2884
- # Lazy DataFrame to join with.
2885
- #
2886
- # @return [LazyFrame]
2887
- #
2888
- # @example
2889
- # df_a = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => ["a", "c", nil]}).lazy
2890
- # df_other = Polars::DataFrame.new({"c" => ["foo", "ham"]})
2891
- # (
2892
- # df_a.with_context(df_other.lazy).select(
2893
- # [Polars.col("b") + Polars.col("c").first]
2894
- # )
2895
- # ).collect
2896
- # # =>
2897
- # # shape: (3, 1)
2898
- # # ┌──────┐
2899
- # # │ b │
2900
- # # │ --- │
2901
- # # │ str │
2902
- # # ╞══════╡
2903
- # # │ afoo │
2904
- # # │ cfoo │
2905
- # # │ null │
2906
- # # └──────┘
2907
- def with_context(other)
2908
- if !other.is_a?(::Array)
2909
- other = [other]
2910
- end
2911
-
2912
- _from_rbldf(_ldf.with_context(other.map(&:_ldf)))
2913
- end
2914
-
2915
- # Add or overwrite column in a DataFrame.
2916
- #
2917
- # @param column [Object]
2918
- # Expression that evaluates to column or a Series to use.
2919
- #
2920
- # @return [LazyFrame]
2921
- #
2922
- # @example
2923
- # df = Polars::DataFrame.new(
2924
- # {
2925
- # "a" => [1, 3, 5],
2926
- # "b" => [2, 4, 6]
2927
- # }
2928
- # ).lazy
2929
- # df.with_column((Polars.col("b") ** 2).alias("b_squared")).collect
2930
- # # =>
2931
- # # shape: (3, 3)
2932
- # # ┌─────┬─────┬───────────┐
2933
- # # │ a ┆ b ┆ b_squared │
2934
- # # │ --- ┆ --- ┆ --- │
2935
- # # │ i64 ┆ i64 ┆ i64 │
2936
- # # ╞═════╪═════╪═══════════╡
2937
- # # │ 1 ┆ 2 ┆ 4 │
2938
- # # │ 3 ┆ 4 ┆ 16 │
2939
- # # │ 5 ┆ 6 ┆ 36 │
2940
- # # └─────┴─────┴───────────┘
2941
- #
2942
- # @example
2943
- # df.with_column(Polars.col("a") ** 2).collect
2944
- # # =>
2945
- # # shape: (3, 2)
2946
- # # ┌─────┬─────┐
2947
- # # │ a ┆ b │
2948
- # # │ --- ┆ --- │
2949
- # # │ i64 ┆ i64 │
2950
- # # ╞═════╪═════╡
2951
- # # │ 1 ┆ 2 │
2952
- # # │ 9 ┆ 4 │
2953
- # # │ 25 ┆ 6 │
2954
- # # └─────┴─────┘
2955
- def with_column(column)
2956
- with_columns([column])
2957
- end
2958
-
2959
3233
  # Remove one or multiple columns from a DataFrame.
2960
3234
  #
2961
3235
  # @param columns [Object]
@@ -3060,20 +3334,6 @@ module Polars
3060
3334
  # # │ 2 ┆ 7 ┆ b │
3061
3335
  # # │ 3 ┆ 8 ┆ c │
3062
3336
  # # └───────┴─────┴─────┘
3063
- #
3064
- # @example
3065
- # lf.rename(->(column_name) { "c" + column_name[1..] }).collect
3066
- # # =>
3067
- # # shape: (3, 3)
3068
- # # ┌─────┬─────┬─────┐
3069
- # # │ coo ┆ car ┆ cam │
3070
- # # │ --- ┆ --- ┆ --- │
3071
- # # │ i64 ┆ i64 ┆ str │
3072
- # # ╞═════╪═════╪═════╡
3073
- # # │ 1 ┆ 6 ┆ a │
3074
- # # │ 2 ┆ 7 ┆ b │
3075
- # # │ 3 ┆ 8 ┆ c │
3076
- # # └─────┴─────┴─────┘
3077
3337
  def rename(mapping, strict: true)
3078
3338
  if mapping.respond_to?(:call)
3079
3339
  select(F.all.name.map(&mapping))
@@ -3153,7 +3413,7 @@ module Polars
3153
3413
  # # │ 5 ┆ 6 │
3154
3414
  # # │ null ┆ null │
3155
3415
  # # └──────┴──────┘
3156
- def shift(n, fill_value: nil)
3416
+ def shift(n = 1, fill_value: nil)
3157
3417
  if !fill_value.nil?
3158
3418
  fill_value = Utils.parse_into_expression(fill_value, str_as_lit: true)
3159
3419
  end
@@ -3161,52 +3421,6 @@ module Polars
3161
3421
  _from_rbldf(_ldf.shift(n, fill_value))
3162
3422
  end
3163
3423
 
3164
- # Shift the values by a given period and fill the resulting null values.
3165
- #
3166
- # @param periods [Integer]
3167
- # Number of places to shift (may be negative).
3168
- # @param fill_value [Object]
3169
- # Fill `nil` values with the result of this expression.
3170
- #
3171
- # @return [LazyFrame]
3172
- #
3173
- # @example
3174
- # df = Polars::DataFrame.new(
3175
- # {
3176
- # "a" => [1, 3, 5],
3177
- # "b" => [2, 4, 6]
3178
- # }
3179
- # ).lazy
3180
- # df.shift_and_fill(1, 0).collect
3181
- # # =>
3182
- # # shape: (3, 2)
3183
- # # ┌─────┬─────┐
3184
- # # │ a ┆ b │
3185
- # # │ --- ┆ --- │
3186
- # # │ i64 ┆ i64 │
3187
- # # ╞═════╪═════╡
3188
- # # │ 0 ┆ 0 │
3189
- # # │ 1 ┆ 2 │
3190
- # # │ 3 ┆ 4 │
3191
- # # └─────┴─────┘
3192
- #
3193
- # @example
3194
- # df.shift_and_fill(-1, 0).collect
3195
- # # =>
3196
- # # shape: (3, 2)
3197
- # # ┌─────┬─────┐
3198
- # # │ a ┆ b │
3199
- # # │ --- ┆ --- │
3200
- # # │ i64 ┆ i64 │
3201
- # # ╞═════╪═════╡
3202
- # # │ 3 ┆ 4 │
3203
- # # │ 5 ┆ 6 │
3204
- # # │ 0 ┆ 0 │
3205
- # # └─────┴─────┘
3206
- def shift_and_fill(periods, fill_value)
3207
- shift(periods, fill_value: fill_value)
3208
- end
3209
-
3210
3424
  # Get a slice of this DataFrame.
3211
3425
  #
3212
3426
  # @param offset [Integer]
@@ -3252,11 +3466,6 @@ module Polars
3252
3466
  #
3253
3467
  # @return [LazyFrame]
3254
3468
  #
3255
- # @note
3256
- # Consider using the {#fetch} operation if you only want to test your
3257
- # query. The {#fetch} operation will load the first `n` rows at the scan
3258
- # level, whereas the {#head}/{#limit} are applied at the end.
3259
- #
3260
3469
  # @example
3261
3470
  # lf = Polars::LazyFrame.new(
3262
3471
  # {
@@ -3302,11 +3511,6 @@ module Polars
3302
3511
  #
3303
3512
  # @return [LazyFrame]
3304
3513
  #
3305
- # @note
3306
- # Consider using the {#fetch} operation if you only want to test your
3307
- # query. The {#fetch} operation will load the first `n` rows at the scan
3308
- # level, whereas the {#head}/{#limit} are applied at the end.
3309
- #
3310
3514
  # @example
3311
3515
  # lf = Polars::LazyFrame.new(
3312
3516
  # {
@@ -3475,10 +3679,14 @@ module Polars
3475
3679
  def with_row_index(name: "index", offset: 0)
3476
3680
  _from_rbldf(_ldf.with_row_index(name, offset))
3477
3681
  end
3478
- alias_method :with_row_count, :with_row_index
3479
3682
 
3480
3683
  # Take every nth row in the LazyFrame and return as a new LazyFrame.
3481
3684
  #
3685
+ # @param n [Integer]
3686
+ # Gather every *n*-th row.
3687
+ # @param offset [Integer]
3688
+ # Starting index.
3689
+ #
3482
3690
  # @return [LazyFrame]
3483
3691
  #
3484
3692
  # @example
@@ -3494,10 +3702,9 @@ module Polars
3494
3702
  # # │ 1 ┆ 5 │
3495
3703
  # # │ 3 ┆ 7 │
3496
3704
  # # └─────┴─────┘
3497
- def gather_every(n)
3498
- select(F.col("*").gather_every(n))
3705
+ def gather_every(n, offset: 0)
3706
+ select(F.col("*").gather_every(n, offset))
3499
3707
  end
3500
- alias_method :take_every, :gather_every
3501
3708
 
3502
3709
  # Fill null values using the specified value or strategy.
3503
3710
  #
@@ -3568,13 +3775,53 @@ module Polars
3568
3775
  # # │ 0 ┆ 0.0 │
3569
3776
  # # │ 4 ┆ 13.0 │
3570
3777
  # # └─────┴──────┘
3571
- def fill_null(value = nil, strategy: nil, limit: nil, matches_supertype: nil)
3778
+ def fill_null(value = nil, strategy: nil, limit: nil, matches_supertype: true)
3779
+ if !value.nil?
3780
+ if value.is_a?(Expr)
3781
+ dtypes = nil
3782
+ elsif value.is_a?(TrueClass) || value.is_a?(FalseClass)
3783
+ dtypes = [Boolean]
3784
+ elsif matches_supertype && (value.is_a?(Integer) || value.is_a?(Float))
3785
+ dtypes = [
3786
+ Int8,
3787
+ Int16,
3788
+ Int32,
3789
+ Int64,
3790
+ Int128,
3791
+ UInt8,
3792
+ UInt16,
3793
+ UInt32,
3794
+ UInt64,
3795
+ Float32,
3796
+ Float64,
3797
+ Decimal.new
3798
+ ]
3799
+ elsif value.is_a?(Integer)
3800
+ dtypes = [Int64]
3801
+ elsif value.is_a?(Float)
3802
+ dtypes = [Float64]
3803
+ elsif value.is_a?(::Date)
3804
+ dtypes = [Date]
3805
+ elsif value.is_a?(::String)
3806
+ dtypes = [String, Categorical]
3807
+ else
3808
+ # fallback; anything not explicitly handled above
3809
+ dtypes = nil
3810
+ end
3811
+
3812
+ if dtypes
3813
+ return with_columns(
3814
+ F.col(dtypes).fill_null(value, strategy: strategy, limit: limit)
3815
+ )
3816
+ end
3817
+ end
3818
+
3572
3819
  select(Polars.all.fill_null(value, strategy: strategy, limit: limit))
3573
3820
  end
3574
3821
 
3575
3822
  # Fill floating point NaN values.
3576
3823
  #
3577
- # @param fill_value [Object]
3824
+ # @param value [Object]
3578
3825
  # Value to fill the NaN values with.
3579
3826
  #
3580
3827
  # @return [LazyFrame]
@@ -3603,11 +3850,11 @@ module Polars
3603
3850
  # # │ 99.0 ┆ 99.0 │
3604
3851
  # # │ 4.0 ┆ 13.0 │
3605
3852
  # # └──────┴──────┘
3606
- def fill_nan(fill_value)
3607
- if !fill_value.is_a?(Expr)
3608
- fill_value = F.lit(fill_value)
3853
+ def fill_nan(value)
3854
+ if !value.is_a?(Expr)
3855
+ value = F.lit(value)
3609
3856
  end
3610
- _from_rbldf(_ldf.fill_nan(fill_value._rbexpr))
3857
+ _from_rbldf(_ldf.fill_nan(value._rbexpr))
3611
3858
  end
3612
3859
 
3613
3860
  # Aggregate the columns in the DataFrame to their standard deviation value.
@@ -3922,7 +4169,7 @@ module Polars
3922
4169
  # # │ 3 ┆ a ┆ b │
3923
4170
  # # │ 1 ┆ a ┆ b │
3924
4171
  # # └─────┴─────┴─────┘
3925
- def unique(maintain_order: true, subset: nil, keep: "first")
4172
+ def unique(maintain_order: false, subset: nil, keep: "any")
3926
4173
  selector_subset = nil
3927
4174
  if !subset.nil?
3928
4175
  selector_subset = Utils.parse_list_into_selector(subset)._rbselector
@@ -4078,7 +4325,7 @@ module Polars
4078
4325
  # # │ z ┆ c ┆ 6 │
4079
4326
  # # └─────┴──────────┴───────┘
4080
4327
  def unpivot(
4081
- on,
4328
+ on = nil,
4082
4329
  index: nil,
4083
4330
  variable_name: nil,
4084
4331
  value_name: nil,
@@ -4100,7 +4347,6 @@ module Polars
4100
4347
  )
4101
4348
  )
4102
4349
  end
4103
- alias_method :melt, :unpivot
4104
4350
 
4105
4351
  # def map
4106
4352
  # end
@@ -4166,7 +4412,7 @@ module Polars
4166
4412
  # ["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"]
4167
4413
  # )
4168
4414
  # )
4169
- # df.fetch
4415
+ # df.collect
4170
4416
  # # =>
4171
4417
  # # shape: (2, 3)
4172
4418
  # # ┌────────┬─────────────────────┬───────┐
@@ -4179,7 +4425,7 @@ module Polars
4179
4425
  # # └────────┴─────────────────────┴───────┘
4180
4426
  #
4181
4427
  # @example
4182
- # df.unnest("t_struct").fetch
4428
+ # df.unnest("t_struct").collect
4183
4429
  # # =>
4184
4430
  # # shape: (2, 6)
4185
4431
  # # ┌────────┬─────┬─────┬──────┬───────────┬───────┐
@@ -4248,19 +4494,41 @@ module Polars
4248
4494
  #
4249
4495
  # @param column [Object]
4250
4496
  # Column that is sorted.
4497
+ # @param more_columns [Array]
4498
+ # Columns that are sorted over after `column`.
4251
4499
  # @param descending [Boolean]
4252
4500
  # Whether the column is sorted in descending order.
4501
+ # @param nulls_last [Boolean]
4502
+ # Whether the nulls are at the end.
4253
4503
  #
4254
4504
  # @return [LazyFrame]
4255
4505
  def set_sorted(
4256
4506
  column,
4257
- descending: false
4507
+ *more_columns,
4508
+ descending: false,
4509
+ nulls_last: false
4258
4510
  )
4259
4511
  if !Utils.strlike?(column)
4260
4512
  msg = "expected a 'str' for argument 'column' in 'set_sorted'"
4261
4513
  raise TypeError, msg
4262
4514
  end
4263
- with_columns(F.col(column).set_sorted(descending: descending))
4515
+
4516
+ if Utils.bool?(descending)
4517
+ ds = [descending]
4518
+ else
4519
+ ds = descending
4520
+ end
4521
+ if Utils.bool?(nulls_last)
4522
+ nl = [nulls_last]
4523
+ else
4524
+ nl = nulls_last
4525
+ end
4526
+
4527
+ _from_rbldf(
4528
+ _ldf.hint_sorted(
4529
+ [column] + more_columns, ds, nl
4530
+ )
4531
+ )
4264
4532
  end
4265
4533
 
4266
4534
  # Update the values in this `LazyFrame` with the values in `other`.
@@ -4297,7 +4565,7 @@ module Polars
4297
4565
  # @note
4298
4566
  # This is syntactic sugar for a left/inner join that preserves the order
4299
4567
  # of the left `DataFrame` by default, with an optional coalesce when
4300
- # `include_nulls: False`.
4568
+ # `include_nulls: false`.
4301
4569
  #
4302
4570
  # @example Update `df` values with the non-null values in `new_df`, by row index:
4303
4571
  # lf = Polars::LazyFrame.new(
@@ -4451,7 +4719,7 @@ module Polars
4451
4719
  # only use non-idx right columns present in left frame
4452
4720
  right_other = Set.new(right_schema.to_h.keys).intersection(left_schema.to_h.keys) - Set.new(right_on)
4453
4721
 
4454
- # When include_nulls is True, we need to distinguish records after the join that
4722
+ # When include_nulls is true, we need to distinguish records after the join that
4455
4723
  # were originally null in the right frame, as opposed to records that were null
4456
4724
  # because the key was missing from the right frame.
4457
4725
  # Add a validity column to track whether row was matched or not.
@@ -4574,11 +4842,29 @@ module Polars
4574
4842
  end
4575
4843
 
4576
4844
  # if multiple predicates, combine as 'horizontal' expression
4577
- combined_predicate = all_predicates ? (all_predicates.length > 1 ? F.all_horizontal(*all_predicates) : all_predicates[0]) : nil
4845
+ combined_predicate =
4846
+ if all_predicates.any?
4847
+ if all_predicates.length > 1
4848
+ F.all_horizontal(*all_predicates)
4849
+ else
4850
+ all_predicates[0]
4851
+ end
4852
+ else
4853
+ nil
4854
+ end
4578
4855
 
4579
4856
  # apply reduced boolean mask first, if applicable, then predicates
4580
4857
  if boolean_masks.any?
4581
- raise Todo
4858
+ if boolean_masks.length > 1
4859
+ raise Todo
4860
+ end
4861
+ mask_expr = F.lit(boolean_masks[0])
4862
+ combined_predicate =
4863
+ if combined_predicate.nil?
4864
+ mask_expr
4865
+ else
4866
+ mask_expr & combined_predicate
4867
+ end
4582
4868
  end
4583
4869
 
4584
4870
  if combined_predicate.nil?
@@ -4588,5 +4874,10 @@ module Polars
4588
4874
  filter_method = invert ? _ldf.method(:remove) : _ldf.method(:filter)
4589
4875
  _from_rbldf(filter_method.(combined_predicate._rbexpr))
4590
4876
  end
4877
+
4878
+ def _select_engine(engine, path = nil)
4879
+ engine = Plr.get_engine_affinity if engine == "auto"
4880
+ engine == "auto" && !path.is_a?(::String) && !path.nil? ? "in-memory" : engine
4881
+ end
4591
4882
  end
4592
4883
  end