polars-df 0.23.0 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +127 -1
- data/Cargo.lock +72 -58
- data/README.md +31 -27
- data/ext/polars/Cargo.toml +15 -6
- data/ext/polars/src/batched_csv.rs +35 -39
- data/ext/polars/src/c_api/allocator.rs +7 -0
- data/ext/polars/src/c_api/mod.rs +1 -0
- data/ext/polars/src/catalog/unity.rs +123 -101
- data/ext/polars/src/conversion/any_value.rs +13 -17
- data/ext/polars/src/conversion/chunked_array.rs +5 -5
- data/ext/polars/src/conversion/datetime.rs +3 -2
- data/ext/polars/src/conversion/mod.rs +50 -45
- data/ext/polars/src/dataframe/export.rs +13 -13
- data/ext/polars/src/dataframe/general.rs +223 -223
- data/ext/polars/src/dataframe/io.rs +27 -141
- data/ext/polars/src/dataframe/mod.rs +13 -5
- data/ext/polars/src/dataframe/serde.rs +1 -1
- data/ext/polars/src/error.rs +44 -7
- data/ext/polars/src/exceptions.rs +45 -12
- data/ext/polars/src/expr/array.rs +12 -0
- data/ext/polars/src/expr/datatype.rs +2 -2
- data/ext/polars/src/expr/datetime.rs +4 -5
- data/ext/polars/src/expr/general.rs +49 -13
- data/ext/polars/src/expr/list.rs +4 -0
- data/ext/polars/src/expr/meta.rs +8 -3
- data/ext/polars/src/expr/mod.rs +22 -6
- data/ext/polars/src/expr/name.rs +19 -8
- data/ext/polars/src/expr/rolling.rs +50 -1
- data/ext/polars/src/expr/string.rs +0 -1
- data/ext/polars/src/expr/struct.rs +7 -2
- data/ext/polars/src/file.rs +136 -103
- data/ext/polars/src/functions/aggregation.rs +9 -8
- data/ext/polars/src/functions/io.rs +81 -10
- data/ext/polars/src/functions/lazy.rs +95 -21
- data/ext/polars/src/functions/mod.rs +2 -0
- data/ext/polars/src/functions/range.rs +19 -3
- data/ext/polars/src/functions/strings.rs +6 -0
- data/ext/polars/src/functions/utils.rs +6 -0
- data/ext/polars/src/interop/arrow/mod.rs +50 -1
- data/ext/polars/src/interop/arrow/{to_ruby.rs → to_rb.rs} +30 -0
- data/ext/polars/src/interop/arrow/to_rust.rs +43 -0
- data/ext/polars/src/interop/numo/to_numo_df.rs +1 -1
- data/ext/polars/src/interop/numo/to_numo_series.rs +1 -1
- data/ext/polars/src/lazyframe/exitable.rs +39 -0
- data/ext/polars/src/lazyframe/general.rs +340 -236
- data/ext/polars/src/lazyframe/mod.rs +46 -10
- data/ext/polars/src/lazyframe/optflags.rs +5 -4
- data/ext/polars/src/lazyframe/serde.rs +11 -3
- data/ext/polars/src/lazyframe/sink.rs +10 -5
- data/ext/polars/src/lazygroupby.rs +6 -7
- data/ext/polars/src/lib.rs +141 -76
- data/ext/polars/src/map/dataframe.rs +12 -12
- data/ext/polars/src/map/lazy.rs +7 -5
- data/ext/polars/src/map/mod.rs +15 -8
- data/ext/polars/src/map/series.rs +3 -3
- data/ext/polars/src/on_startup.rs +16 -8
- data/ext/polars/src/prelude.rs +1 -0
- data/ext/polars/src/rb_modules.rs +19 -49
- data/ext/polars/src/series/aggregation.rs +79 -140
- data/ext/polars/src/series/arithmetic.rs +16 -22
- data/ext/polars/src/series/comparison.rs +101 -222
- data/ext/polars/src/series/construction.rs +17 -18
- data/ext/polars/src/series/export.rs +1 -1
- data/ext/polars/src/series/general.rs +254 -289
- data/ext/polars/src/series/import.rs +17 -0
- data/ext/polars/src/series/map.rs +178 -160
- data/ext/polars/src/series/mod.rs +28 -12
- data/ext/polars/src/series/scatter.rs +12 -9
- data/ext/polars/src/sql.rs +16 -9
- data/ext/polars/src/testing/frame.rs +31 -0
- data/ext/polars/src/testing/mod.rs +5 -0
- data/ext/polars/src/testing/series.rs +31 -0
- data/ext/polars/src/timeout.rs +105 -0
- data/ext/polars/src/utils.rs +159 -1
- data/lib/polars/array_expr.rb +81 -12
- data/lib/polars/array_name_space.rb +74 -7
- data/lib/polars/batched_csv_reader.rb +21 -21
- data/lib/polars/binary_name_space.rb +1 -1
- data/lib/polars/cat_expr.rb +7 -7
- data/lib/polars/config.rb +1 -1
- data/lib/polars/convert.rb +189 -34
- data/lib/polars/data_frame.rb +1066 -831
- data/lib/polars/data_frame_plot.rb +173 -0
- data/lib/polars/data_type_group.rb +1 -0
- data/lib/polars/data_types.rb +31 -12
- data/lib/polars/date_time_expr.rb +51 -69
- data/lib/polars/date_time_name_space.rb +80 -112
- data/lib/polars/dynamic_group_by.rb +7 -7
- data/lib/polars/exceptions.rb +50 -10
- data/lib/polars/expr.rb +470 -517
- data/lib/polars/functions/aggregation/horizontal.rb +0 -1
- data/lib/polars/functions/aggregation/vertical.rb +2 -3
- data/lib/polars/functions/as_datatype.rb +290 -8
- data/lib/polars/functions/eager.rb +204 -10
- data/lib/polars/functions/escape_regex.rb +21 -0
- data/lib/polars/functions/lazy.rb +409 -169
- data/lib/polars/functions/lit.rb +17 -1
- data/lib/polars/functions/range/int_range.rb +74 -2
- data/lib/polars/functions/range/linear_space.rb +77 -0
- data/lib/polars/functions/range/time_range.rb +1 -1
- data/lib/polars/functions/repeat.rb +3 -12
- data/lib/polars/functions/whenthen.rb +2 -2
- data/lib/polars/group_by.rb +72 -20
- data/lib/polars/iceberg_dataset.rb +1 -6
- data/lib/polars/in_process_query.rb +37 -0
- data/lib/polars/io/cloud.rb +18 -0
- data/lib/polars/io/csv.rb +265 -126
- data/lib/polars/io/database.rb +0 -1
- data/lib/polars/io/delta.rb +15 -7
- data/lib/polars/io/ipc.rb +24 -17
- data/lib/polars/io/ndjson.rb +161 -24
- data/lib/polars/io/parquet.rb +101 -38
- data/lib/polars/lazy_frame.rb +849 -558
- data/lib/polars/lazy_group_by.rb +327 -2
- data/lib/polars/list_expr.rb +94 -16
- data/lib/polars/list_name_space.rb +88 -24
- data/lib/polars/meta_expr.rb +42 -1
- data/lib/polars/name_expr.rb +41 -4
- data/lib/polars/query_opt_flags.rb +198 -2
- data/lib/polars/rolling_group_by.rb +3 -3
- data/lib/polars/schema.rb +21 -3
- data/lib/polars/selector.rb +37 -2
- data/lib/polars/selectors.rb +45 -9
- data/lib/polars/series.rb +1156 -728
- data/lib/polars/series_plot.rb +72 -0
- data/lib/polars/slice.rb +1 -1
- data/lib/polars/sql_context.rb +11 -4
- data/lib/polars/string_expr.rb +59 -68
- data/lib/polars/string_name_space.rb +51 -87
- data/lib/polars/struct_expr.rb +36 -18
- data/lib/polars/testing.rb +24 -273
- data/lib/polars/utils/constants.rb +2 -0
- data/lib/polars/utils/construction/data_frame.rb +410 -0
- data/lib/polars/utils/construction/series.rb +364 -0
- data/lib/polars/utils/construction/utils.rb +9 -0
- data/lib/polars/utils/deprecation.rb +11 -0
- data/lib/polars/utils/serde.rb +8 -3
- data/lib/polars/utils/unstable.rb +19 -0
- data/lib/polars/utils/various.rb +59 -0
- data/lib/polars/utils.rb +46 -47
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +47 -1
- metadata +25 -6
- data/ext/polars/src/allocator.rs +0 -13
- data/lib/polars/plot.rb +0 -109
data/lib/polars/data_frame.rb
CHANGED
|
@@ -1,8 +1,6 @@
|
|
|
1
1
|
module Polars
|
|
2
2
|
# Two-dimensional data structure representing data as a table with rows and columns.
|
|
3
3
|
class DataFrame
|
|
4
|
-
include Plot
|
|
5
|
-
|
|
6
4
|
# @private
|
|
7
5
|
attr_accessor :_df
|
|
8
6
|
|
|
@@ -43,24 +41,24 @@ module Polars
|
|
|
43
41
|
# @param infer_schema_length [Integer]
|
|
44
42
|
# The maximum number of rows to scan for schema inference. If set to `nil`, the
|
|
45
43
|
# full data may be scanned *(this can be slow)*. This parameter only applies if
|
|
46
|
-
# the input data is
|
|
44
|
+
# the input data is an array or generator of rows; other input is read as-is.
|
|
47
45
|
# @param nan_to_null [Boolean]
|
|
48
46
|
# If the data comes from one or more Numo arrays, can optionally convert input
|
|
49
47
|
# data NaN values to null instead. This is a no-op for all other input data.
|
|
50
|
-
def initialize(data = nil, schema: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length:
|
|
48
|
+
def initialize(data = nil, schema: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: N_INFER_DEFAULT, nan_to_null: false)
|
|
51
49
|
if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
|
|
52
50
|
raise ArgumentError, "Use read_database instead"
|
|
53
51
|
end
|
|
54
52
|
|
|
55
53
|
if data.nil?
|
|
56
|
-
self._df =
|
|
54
|
+
self._df = Utils.hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
|
|
57
55
|
elsif data.is_a?(Hash)
|
|
58
56
|
data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
|
|
59
|
-
self._df =
|
|
57
|
+
self._df = Utils.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null)
|
|
60
58
|
elsif data.is_a?(::Array)
|
|
61
|
-
self._df =
|
|
59
|
+
self._df = Utils.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, orient: orient, infer_schema_length: infer_schema_length)
|
|
62
60
|
elsif data.is_a?(Series)
|
|
63
|
-
self._df =
|
|
61
|
+
self._df = Utils.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict)
|
|
64
62
|
elsif data.respond_to?(:arrow_c_stream)
|
|
65
63
|
# This uses the fact that RbSeries.from_arrow_c_stream will create a
|
|
66
64
|
# struct-typed Series. Then we unpack that to a DataFrame.
|
|
@@ -116,6 +114,45 @@ module Polars
|
|
|
116
114
|
df
|
|
117
115
|
end
|
|
118
116
|
|
|
117
|
+
# Plot data.
|
|
118
|
+
#
|
|
119
|
+
# @return [Object]
|
|
120
|
+
def plot(x = nil, y = nil, type: nil, group: nil, stacked: nil)
|
|
121
|
+
plot = DataFramePlot.new(self)
|
|
122
|
+
return plot if x.nil? && y.nil?
|
|
123
|
+
|
|
124
|
+
raise ArgumentError, "Must specify columns" if x.nil? || y.nil?
|
|
125
|
+
type ||= begin
|
|
126
|
+
if self[x].dtype.numeric? && self[y].dtype.numeric?
|
|
127
|
+
"scatter"
|
|
128
|
+
elsif self[x].dtype == String && self[y].dtype.numeric?
|
|
129
|
+
"column"
|
|
130
|
+
elsif (self[x].dtype == Date || self[x].dtype == Datetime) && self[y].dtype.numeric?
|
|
131
|
+
"line"
|
|
132
|
+
else
|
|
133
|
+
raise "Cannot determine type. Use the type option."
|
|
134
|
+
end
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
case type
|
|
138
|
+
when "line"
|
|
139
|
+
plot.line(x, y, color: group)
|
|
140
|
+
when "area"
|
|
141
|
+
plot.area(x, y, color: group)
|
|
142
|
+
when "pie"
|
|
143
|
+
raise ArgumentError, "Cannot use group option with pie chart" unless group.nil?
|
|
144
|
+
plot.pie(x, y)
|
|
145
|
+
when "column"
|
|
146
|
+
plot.column(x, y, color: group, stacked: stacked)
|
|
147
|
+
when "bar"
|
|
148
|
+
plot.bar(x, y, color: group, stacked: stacked)
|
|
149
|
+
when "scatter"
|
|
150
|
+
plot.scatter(x, y, color: group)
|
|
151
|
+
else
|
|
152
|
+
raise ArgumentError, "Invalid type: #{type}"
|
|
153
|
+
end
|
|
154
|
+
end
|
|
155
|
+
|
|
119
156
|
# Get the shape of the DataFrame.
|
|
120
157
|
#
|
|
121
158
|
# @return [Array]
|
|
@@ -244,9 +281,9 @@ module Polars
|
|
|
244
281
|
# }
|
|
245
282
|
# )
|
|
246
283
|
# df.schema
|
|
247
|
-
# # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String}
|
|
284
|
+
# # => Polars::Schema({"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String})
|
|
248
285
|
def schema
|
|
249
|
-
columns.zip(dtypes).to_h
|
|
286
|
+
Schema.new(columns.zip(dtypes).to_h)
|
|
250
287
|
end
|
|
251
288
|
|
|
252
289
|
# Equal.
|
|
@@ -383,142 +420,243 @@ module Polars
|
|
|
383
420
|
# Returns subset of the DataFrame.
|
|
384
421
|
#
|
|
385
422
|
# @return [Object]
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
423
|
+
#
|
|
424
|
+
# @example
|
|
425
|
+
# df = Polars::DataFrame.new(
|
|
426
|
+
# {"a" => [1, 2, 3], "d" => [4, 5, 6], "c" => [1, 3, 2], "b" => [7, 8, 9]}
|
|
427
|
+
# )
|
|
428
|
+
# df[0]
|
|
429
|
+
# # =>
|
|
430
|
+
# # shape: (1, 4)
|
|
431
|
+
# # ┌─────┬─────┬─────┬─────┐
|
|
432
|
+
# # │ a ┆ d ┆ c ┆ b │
|
|
433
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
|
434
|
+
# # │ i64 ┆ i64 ┆ i64 ┆ i64 │
|
|
435
|
+
# # ╞═════╪═════╪═════╪═════╡
|
|
436
|
+
# # │ 1 ┆ 4 ┆ 1 ┆ 7 │
|
|
437
|
+
# # └─────┴─────┴─────┴─────┘
|
|
438
|
+
#
|
|
439
|
+
# @example
|
|
440
|
+
# df[0, "a"]
|
|
441
|
+
# # => 1
|
|
442
|
+
#
|
|
443
|
+
# @example
|
|
444
|
+
# df["a"]
|
|
445
|
+
# # =>
|
|
446
|
+
# # shape: (3,)
|
|
447
|
+
# # Series: 'a' [i64]
|
|
448
|
+
# # [
|
|
449
|
+
# # 1
|
|
450
|
+
# # 2
|
|
451
|
+
# # 3
|
|
452
|
+
# # ]
|
|
453
|
+
#
|
|
454
|
+
# @example
|
|
455
|
+
# df[0..1]
|
|
456
|
+
# # =>
|
|
457
|
+
# # shape: (2, 4)
|
|
458
|
+
# # ┌─────┬─────┬─────┬─────┐
|
|
459
|
+
# # │ a ┆ d ┆ c ┆ b │
|
|
460
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
|
461
|
+
# # │ i64 ┆ i64 ┆ i64 ┆ i64 │
|
|
462
|
+
# # ╞═════╪═════╪═════╪═════╡
|
|
463
|
+
# # │ 1 ┆ 4 ┆ 1 ┆ 7 │
|
|
464
|
+
# # │ 2 ┆ 5 ┆ 3 ┆ 8 │
|
|
465
|
+
# # └─────┴─────┴─────┴─────┘
|
|
466
|
+
#
|
|
467
|
+
# @example
|
|
468
|
+
# df[0..1, "a"]
|
|
469
|
+
# # =>
|
|
470
|
+
# # shape: (2,)
|
|
471
|
+
# # Series: 'a' [i64]
|
|
472
|
+
# # [
|
|
473
|
+
# # 1
|
|
474
|
+
# # 2
|
|
475
|
+
# # ]
|
|
476
|
+
#
|
|
477
|
+
# @example
|
|
478
|
+
# df[0..1, 0]
|
|
479
|
+
# # =>
|
|
480
|
+
# # shape: (2,)
|
|
481
|
+
# # Series: 'a' [i64]
|
|
482
|
+
# # [
|
|
483
|
+
# # 1
|
|
484
|
+
# # 2
|
|
485
|
+
# # ]
|
|
486
|
+
#
|
|
487
|
+
# @example
|
|
488
|
+
# df[[0, 1], [0, 1, 2]]
|
|
489
|
+
# # =>
|
|
490
|
+
# # shape: (2, 3)
|
|
491
|
+
# # ┌─────┬─────┬─────┐
|
|
492
|
+
# # │ a ┆ d ┆ c │
|
|
493
|
+
# # │ --- ┆ --- ┆ --- │
|
|
494
|
+
# # │ i64 ┆ i64 ┆ i64 │
|
|
495
|
+
# # ╞═════╪═════╪═════╡
|
|
496
|
+
# # │ 1 ┆ 4 ┆ 1 │
|
|
497
|
+
# # │ 2 ┆ 5 ┆ 3 │
|
|
498
|
+
# # └─────┴─────┴─────┘
|
|
499
|
+
#
|
|
500
|
+
# @example
|
|
501
|
+
# df[0..1, ["a", "c"]]
|
|
502
|
+
# # =>
|
|
503
|
+
# # shape: (2, 2)
|
|
504
|
+
# # ┌─────┬─────┐
|
|
505
|
+
# # │ a ┆ c │
|
|
506
|
+
# # │ --- ┆ --- │
|
|
507
|
+
# # │ i64 ┆ i64 │
|
|
508
|
+
# # ╞═════╪═════╡
|
|
509
|
+
# # │ 1 ┆ 1 │
|
|
510
|
+
# # │ 2 ┆ 3 │
|
|
511
|
+
# # └─────┴─────┘
|
|
512
|
+
#
|
|
513
|
+
# @example
|
|
514
|
+
# df[0.., 0..1]
|
|
515
|
+
# # =>
|
|
516
|
+
# # shape: (3, 2)
|
|
517
|
+
# # ┌─────┬─────┐
|
|
518
|
+
# # │ a ┆ d │
|
|
519
|
+
# # │ --- ┆ --- │
|
|
520
|
+
# # │ i64 ┆ i64 │
|
|
521
|
+
# # ╞═════╪═════╡
|
|
522
|
+
# # │ 1 ┆ 4 │
|
|
523
|
+
# # │ 2 ┆ 5 │
|
|
524
|
+
# # │ 3 ┆ 6 │
|
|
525
|
+
# # └─────┴─────┘
|
|
526
|
+
#
|
|
527
|
+
# @example
|
|
528
|
+
# df[0.., "a".."c"]
|
|
529
|
+
# # =>
|
|
530
|
+
# # shape: (3, 3)
|
|
531
|
+
# # ┌─────┬─────┬─────┐
|
|
532
|
+
# # │ a ┆ d ┆ c │
|
|
533
|
+
# # │ --- ┆ --- ┆ --- │
|
|
534
|
+
# # │ i64 ┆ i64 ┆ i64 │
|
|
535
|
+
# # ╞═════╪═════╪═════╡
|
|
536
|
+
# # │ 1 ┆ 4 ┆ 1 │
|
|
537
|
+
# # │ 2 ┆ 5 ┆ 3 │
|
|
538
|
+
# # │ 3 ┆ 6 ┆ 2 │
|
|
539
|
+
# # └─────┴─────┴─────┘
|
|
540
|
+
def [](*key)
|
|
541
|
+
get_df_item_by_key(self, key)
|
|
483
542
|
end
|
|
484
543
|
|
|
485
544
|
# Set item.
|
|
486
545
|
#
|
|
487
546
|
# @return [Object]
|
|
547
|
+
#
|
|
548
|
+
# @example `df[["a", "b"]] = value`:
|
|
549
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [4, 5, 6]})
|
|
550
|
+
# df[["a", "b"]] = [[10, 40], [20, 50], [30, 60]]
|
|
551
|
+
# df
|
|
552
|
+
# # =>
|
|
553
|
+
# # shape: (3, 2)
|
|
554
|
+
# # ┌─────┬─────┐
|
|
555
|
+
# # │ a ┆ b │
|
|
556
|
+
# # │ --- ┆ --- │
|
|
557
|
+
# # │ i64 ┆ i64 │
|
|
558
|
+
# # ╞═════╪═════╡
|
|
559
|
+
# # │ 10 ┆ 40 │
|
|
560
|
+
# # │ 20 ┆ 50 │
|
|
561
|
+
# # │ 30 ┆ 60 │
|
|
562
|
+
# # └─────┴─────┘
|
|
563
|
+
#
|
|
564
|
+
# @example `df[row_idx, "a"] = value`:
|
|
565
|
+
# df[1, "a"] = 100
|
|
566
|
+
# df
|
|
567
|
+
# # =>
|
|
568
|
+
# # shape: (3, 2)
|
|
569
|
+
# # ┌─────┬─────┐
|
|
570
|
+
# # │ a ┆ b │
|
|
571
|
+
# # │ --- ┆ --- │
|
|
572
|
+
# # │ i64 ┆ i64 │
|
|
573
|
+
# # ╞═════╪═════╡
|
|
574
|
+
# # │ 10 ┆ 40 │
|
|
575
|
+
# # │ 100 ┆ 50 │
|
|
576
|
+
# # │ 30 ┆ 60 │
|
|
577
|
+
# # └─────┴─────┘
|
|
578
|
+
#
|
|
579
|
+
# @example `df[row_idx, col_idx] = value`:
|
|
580
|
+
# df[0, 1] = 30
|
|
581
|
+
# df
|
|
582
|
+
# # =>
|
|
583
|
+
# # shape: (3, 2)
|
|
584
|
+
# # ┌─────┬─────┐
|
|
585
|
+
# # │ a ┆ b │
|
|
586
|
+
# # │ --- ┆ --- │
|
|
587
|
+
# # │ i64 ┆ i64 │
|
|
588
|
+
# # ╞═════╪═════╡
|
|
589
|
+
# # │ 10 ┆ 30 │
|
|
590
|
+
# # │ 100 ┆ 50 │
|
|
591
|
+
# # │ 30 ┆ 60 │
|
|
592
|
+
# # └─────┴─────┘
|
|
488
593
|
def []=(*key, value)
|
|
489
|
-
if key.length
|
|
490
|
-
key = key.first
|
|
491
|
-
elsif key.length != 2
|
|
594
|
+
if key.empty? || key.length > 2
|
|
492
595
|
raise ArgumentError, "wrong number of arguments (given #{key.length + 1}, expected 2..3)"
|
|
493
596
|
end
|
|
494
597
|
|
|
495
|
-
if Utils.strlike?(key)
|
|
598
|
+
if key.length == 1 && Utils.strlike?(key[0])
|
|
599
|
+
key = key[0]
|
|
600
|
+
|
|
496
601
|
if value.is_a?(::Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
|
|
497
602
|
value = Series.new(value)
|
|
498
603
|
elsif !value.is_a?(Series)
|
|
499
604
|
value = Polars.lit(value)
|
|
500
605
|
end
|
|
501
|
-
self._df =
|
|
502
|
-
|
|
606
|
+
self._df = with_columns(value.alias(key.to_s))._df
|
|
607
|
+
|
|
608
|
+
# df[["C", "D"]]
|
|
609
|
+
elsif key.length == 1 && key[0].is_a?(::Array)
|
|
610
|
+
key = key[0]
|
|
611
|
+
|
|
612
|
+
if !value.is_a?(::Array) || !value.all? { |v| v.is_a?(::Array) }
|
|
613
|
+
msg = "can only set multiple columns with 2D matrix"
|
|
614
|
+
raise ArgumentError, msg
|
|
615
|
+
end
|
|
616
|
+
if value.any? { |v| v.size != key.length }
|
|
617
|
+
msg = "matrix columns should be equal to list used to determine column names"
|
|
618
|
+
raise ArgumentError, msg
|
|
619
|
+
end
|
|
620
|
+
|
|
621
|
+
columns = []
|
|
622
|
+
key.each_with_index do |name, i|
|
|
623
|
+
columns << Series.new(name, value.map { |v| v[i] })
|
|
624
|
+
end
|
|
625
|
+
self._df = with_columns(columns)._df
|
|
626
|
+
|
|
627
|
+
# df[a, b]
|
|
628
|
+
else
|
|
503
629
|
row_selection, col_selection = key
|
|
504
630
|
|
|
631
|
+
if (row_selection.is_a?(Series) && row_selection.dtype == Boolean) || Utils.is_bool_sequence(row_selection)
|
|
632
|
+
msg = (
|
|
633
|
+
"not allowed to set DataFrame by boolean mask in the row position" +
|
|
634
|
+
"\n\nConsider using `DataFrame.with_columns`."
|
|
635
|
+
)
|
|
636
|
+
raise TypeError, msg
|
|
637
|
+
end
|
|
638
|
+
|
|
639
|
+
# get series column selection
|
|
505
640
|
if Utils.strlike?(col_selection)
|
|
506
641
|
s = self[col_selection]
|
|
507
642
|
elsif col_selection.is_a?(Integer)
|
|
508
|
-
|
|
643
|
+
s = self[0.., col_selection]
|
|
509
644
|
else
|
|
510
|
-
|
|
645
|
+
msg = "unexpected column selection #{col_selection.inspect}"
|
|
646
|
+
raise TypeError, msg
|
|
511
647
|
end
|
|
512
648
|
|
|
649
|
+
# dispatch to []= of Series to do modification
|
|
513
650
|
s[row_selection] = value
|
|
514
651
|
|
|
652
|
+
# now find the location to place series
|
|
653
|
+
# df[idx]
|
|
515
654
|
if col_selection.is_a?(Integer)
|
|
516
655
|
replace_column(col_selection, s)
|
|
656
|
+
# df["foo"]
|
|
517
657
|
elsif Utils.strlike?(col_selection)
|
|
518
|
-
|
|
658
|
+
_replace(col_selection.to_s, s)
|
|
519
659
|
end
|
|
520
|
-
else
|
|
521
|
-
raise Todo
|
|
522
660
|
end
|
|
523
661
|
end
|
|
524
662
|
|
|
@@ -566,22 +704,55 @@ module Polars
|
|
|
566
704
|
Schema.new(columns.zip(dtypes), check_dtypes: false)
|
|
567
705
|
end
|
|
568
706
|
|
|
569
|
-
# Return the
|
|
707
|
+
# Return the DataFrame as a scalar, or return the element at the given row/column.
|
|
570
708
|
#
|
|
571
|
-
#
|
|
709
|
+
# @param row [Integer]
|
|
710
|
+
# Optional row index.
|
|
711
|
+
# @param column [Integer, String]
|
|
712
|
+
# Optional column index or name.
|
|
572
713
|
#
|
|
573
714
|
# @return [Object]
|
|
574
715
|
#
|
|
716
|
+
# @note
|
|
717
|
+
# If row/col not provided, this is equivalent to `df[0,0]`, with a check that
|
|
718
|
+
# the shape is (1,1). With row/col, this is equivalent to `df[row,col]`.
|
|
719
|
+
#
|
|
575
720
|
# @example
|
|
576
721
|
# df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [4, 5, 6]})
|
|
577
|
-
#
|
|
578
|
-
# result.item
|
|
722
|
+
# df.select((Polars.col("a") * Polars.col("b")).sum).item
|
|
579
723
|
# # => 32
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
724
|
+
#
|
|
725
|
+
# @example
|
|
726
|
+
# df.item(1, 1)
|
|
727
|
+
# # => 5
|
|
728
|
+
#
|
|
729
|
+
# @example
|
|
730
|
+
# df.item(2, "b")
|
|
731
|
+
# # => 6
|
|
732
|
+
def item(row = nil, column = nil)
|
|
733
|
+
if row.nil? && column.nil?
|
|
734
|
+
if shape != [1, 1]
|
|
735
|
+
msg = (
|
|
736
|
+
"can only call `.item()` if the dataframe is of shape (1, 1)," +
|
|
737
|
+
" or if explicit row/col values are provided;" +
|
|
738
|
+
" frame has shape #{shape.inspect}"
|
|
739
|
+
)
|
|
740
|
+
raise ArgumentError, msg
|
|
741
|
+
end
|
|
742
|
+
return _df.to_series(0).get_index(0)
|
|
743
|
+
|
|
744
|
+
elsif row.nil? || column.nil?
|
|
745
|
+
msg = "cannot call `.item()` with only one of `row` or `column`"
|
|
746
|
+
raise ArgumentError, msg
|
|
583
747
|
end
|
|
584
|
-
|
|
748
|
+
|
|
749
|
+
s =
|
|
750
|
+
if column.is_a?(Integer)
|
|
751
|
+
_df.to_series(column)
|
|
752
|
+
else
|
|
753
|
+
_df.get_column(column)
|
|
754
|
+
end
|
|
755
|
+
s.get_index_signed(row)
|
|
585
756
|
end
|
|
586
757
|
|
|
587
758
|
# no to_arrow
|
|
@@ -661,7 +832,7 @@ module Polars
|
|
|
661
832
|
if index < 0
|
|
662
833
|
index = columns.length + index
|
|
663
834
|
end
|
|
664
|
-
Utils.wrap_s(_df.
|
|
835
|
+
Utils.wrap_s(_df.to_series(index))
|
|
665
836
|
end
|
|
666
837
|
|
|
667
838
|
# Serialize this DataFrame to a file or string.
|
|
@@ -758,25 +929,26 @@ module Polars
|
|
|
758
929
|
# df.write_ndjson
|
|
759
930
|
# # => "{\"foo\":1,\"bar\":6}\n{\"foo\":2,\"bar\":7}\n{\"foo\":3,\"bar\":8}\n"
|
|
760
931
|
def write_ndjson(file = nil)
|
|
761
|
-
|
|
762
|
-
|
|
932
|
+
should_return_buffer = false
|
|
933
|
+
target = nil
|
|
934
|
+
if file.nil?
|
|
935
|
+
target = StringIO.new
|
|
936
|
+
target.set_encoding(Encoding::BINARY)
|
|
937
|
+
should_return_buffer = true
|
|
938
|
+
elsif Utils.pathlike?(file)
|
|
939
|
+
target = Utils.normalize_filepath(file)
|
|
940
|
+
else
|
|
941
|
+
target = file
|
|
763
942
|
end
|
|
764
|
-
to_string_io = !file.nil? && file.is_a?(StringIO)
|
|
765
|
-
if file.nil? || to_string_io
|
|
766
|
-
buf = StringIO.new
|
|
767
|
-
buf.set_encoding(Encoding::BINARY)
|
|
768
|
-
_df.write_ndjson(buf)
|
|
769
|
-
json_bytes = buf.string
|
|
770
943
|
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
else
|
|
778
|
-
_df.write_ndjson(file)
|
|
944
|
+
lazy.sink_ndjson(
|
|
945
|
+
target
|
|
946
|
+
)
|
|
947
|
+
|
|
948
|
+
if should_return_buffer
|
|
949
|
+
return target.string.force_encoding(Encoding::UTF_8)
|
|
779
950
|
end
|
|
951
|
+
|
|
780
952
|
nil
|
|
781
953
|
end
|
|
782
954
|
|
|
@@ -787,9 +959,9 @@ module Polars
|
|
|
787
959
|
# (default), the output is returned as a string instead.
|
|
788
960
|
# @param include_header [Boolean]
|
|
789
961
|
# Whether to include header in the CSV output.
|
|
790
|
-
# @param
|
|
962
|
+
# @param separator [String]
|
|
791
963
|
# Separate CSV fields with this symbol.
|
|
792
|
-
# @param
|
|
964
|
+
# @param quote_char [String]
|
|
793
965
|
# Byte to use as quoting character.
|
|
794
966
|
# @param batch_size [Integer]
|
|
795
967
|
# Number of rows that will be processed per thread.
|
|
@@ -808,8 +980,8 @@ module Polars
|
|
|
808
980
|
# [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
|
|
809
981
|
# Rust crate.
|
|
810
982
|
# @param float_precision [Integer, nil]
|
|
811
|
-
# Number of decimal places to write, applied to both
|
|
812
|
-
#
|
|
983
|
+
# Number of decimal places to write, applied to both `Float32` and
|
|
984
|
+
# `Float64` datatypes.
|
|
813
985
|
# @param null_value [String, nil]
|
|
814
986
|
# A string representing null values (defaulting to the empty string).
|
|
815
987
|
#
|
|
@@ -826,38 +998,52 @@ module Polars
|
|
|
826
998
|
# df.write_csv("file.csv")
|
|
827
999
|
def write_csv(
|
|
828
1000
|
file = nil,
|
|
1001
|
+
include_bom: false,
|
|
829
1002
|
include_header: true,
|
|
830
|
-
|
|
831
|
-
|
|
1003
|
+
separator: ",",
|
|
1004
|
+
line_terminator: "\n",
|
|
1005
|
+
quote_char: '"',
|
|
832
1006
|
batch_size: 1024,
|
|
833
1007
|
datetime_format: nil,
|
|
834
1008
|
date_format: nil,
|
|
835
1009
|
time_format: nil,
|
|
1010
|
+
float_scientific: nil,
|
|
836
1011
|
float_precision: nil,
|
|
837
|
-
|
|
1012
|
+
decimal_comma: false,
|
|
1013
|
+
null_value: nil,
|
|
1014
|
+
quote_style: nil,
|
|
1015
|
+
storage_options: nil,
|
|
1016
|
+
credential_provider: "auto",
|
|
1017
|
+
retries: 2
|
|
838
1018
|
)
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
raise ArgumentError, "only single byte quote char is allowed"
|
|
843
|
-
elsif null_value == ""
|
|
1019
|
+
Utils._check_arg_is_1byte("separator", separator, false)
|
|
1020
|
+
Utils._check_arg_is_1byte("quote_char", quote_char, true)
|
|
1021
|
+
if null_value == ""
|
|
844
1022
|
null_value = nil
|
|
845
1023
|
end
|
|
846
1024
|
|
|
847
1025
|
if file.nil?
|
|
848
1026
|
buffer = StringIO.new
|
|
849
1027
|
buffer.set_encoding(Encoding::BINARY)
|
|
850
|
-
|
|
1028
|
+
lazy.sink_csv(
|
|
851
1029
|
buffer,
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
1030
|
+
include_bom: include_bom,
|
|
1031
|
+
include_header: include_header,
|
|
1032
|
+
separator: separator,
|
|
1033
|
+
line_terminator: line_terminator,
|
|
1034
|
+
quote_char: quote_char,
|
|
1035
|
+
batch_size: batch_size,
|
|
1036
|
+
datetime_format: datetime_format,
|
|
1037
|
+
date_format: date_format,
|
|
1038
|
+
time_format: time_format,
|
|
1039
|
+
float_scientific: float_scientific,
|
|
1040
|
+
float_precision: float_precision,
|
|
1041
|
+
decimal_comma: decimal_comma,
|
|
1042
|
+
null_value: null_value,
|
|
1043
|
+
quote_style: quote_style,
|
|
1044
|
+
storage_options: storage_options,
|
|
1045
|
+
credential_provider: credential_provider,
|
|
1046
|
+
retries: retries
|
|
861
1047
|
)
|
|
862
1048
|
return buffer.string.force_encoding(Encoding::UTF_8)
|
|
863
1049
|
end
|
|
@@ -866,17 +1052,25 @@ module Polars
|
|
|
866
1052
|
file = Utils.normalize_filepath(file)
|
|
867
1053
|
end
|
|
868
1054
|
|
|
869
|
-
|
|
1055
|
+
lazy.sink_csv(
|
|
870
1056
|
file,
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
1057
|
+
include_bom: include_bom,
|
|
1058
|
+
include_header: include_header,
|
|
1059
|
+
separator: separator,
|
|
1060
|
+
line_terminator: line_terminator,
|
|
1061
|
+
quote_char: quote_char,
|
|
1062
|
+
batch_size: batch_size,
|
|
1063
|
+
datetime_format: datetime_format,
|
|
1064
|
+
date_format: date_format,
|
|
1065
|
+
time_format: time_format,
|
|
1066
|
+
float_scientific: float_scientific,
|
|
1067
|
+
float_precision: float_precision,
|
|
1068
|
+
decimal_comma: decimal_comma,
|
|
1069
|
+
null_value: null_value,
|
|
1070
|
+
quote_style: quote_style,
|
|
1071
|
+
storage_options: storage_options,
|
|
1072
|
+
credential_provider: credential_provider,
|
|
1073
|
+
retries: retries
|
|
880
1074
|
)
|
|
881
1075
|
nil
|
|
882
1076
|
end
|
|
@@ -934,6 +1128,10 @@ module Polars
|
|
|
934
1128
|
#
|
|
935
1129
|
# If `storage_options` is not provided, Polars will try to infer the
|
|
936
1130
|
# information from environment variables.
|
|
1131
|
+
# @param credential_provider [Object]
|
|
1132
|
+
# Provide a function that can be called to provide cloud storage
|
|
1133
|
+
# credentials. The function is expected to return a hash of
|
|
1134
|
+
# credential keys along with an optional credential expiry time.
|
|
937
1135
|
# @param retries [Integer]
|
|
938
1136
|
# Number of retries if accessing a cloud instance fails.
|
|
939
1137
|
#
|
|
@@ -943,33 +1141,27 @@ module Polars
|
|
|
943
1141
|
compression: "uncompressed",
|
|
944
1142
|
compat_level: nil,
|
|
945
1143
|
storage_options: nil,
|
|
1144
|
+
credential_provider: "auto",
|
|
946
1145
|
retries: 2
|
|
947
1146
|
)
|
|
948
1147
|
return_bytes = file.nil?
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
if Utils.pathlike?(file)
|
|
954
|
-
file = Utils.normalize_filepath(file)
|
|
955
|
-
end
|
|
956
|
-
|
|
957
|
-
if compat_level.nil?
|
|
958
|
-
compat_level = true
|
|
959
|
-
end
|
|
960
|
-
|
|
961
|
-
if compression.nil?
|
|
962
|
-
compression = "uncompressed"
|
|
963
|
-
end
|
|
964
|
-
|
|
965
|
-
if storage_options&.any?
|
|
966
|
-
storage_options = storage_options.to_a
|
|
1148
|
+
target = nil
|
|
1149
|
+
if file.nil?
|
|
1150
|
+
target = StringIO.new
|
|
1151
|
+
target.set_encoding(Encoding::BINARY)
|
|
967
1152
|
else
|
|
968
|
-
|
|
1153
|
+
target = file
|
|
969
1154
|
end
|
|
970
1155
|
|
|
971
|
-
|
|
972
|
-
|
|
1156
|
+
lazy.sink_ipc(
|
|
1157
|
+
target,
|
|
1158
|
+
compression: compression,
|
|
1159
|
+
compat_level: compat_level,
|
|
1160
|
+
storage_options: storage_options,
|
|
1161
|
+
credential_provider: credential_provider,
|
|
1162
|
+
retries: retries
|
|
1163
|
+
)
|
|
1164
|
+
return_bytes ? target.string : nil
|
|
973
1165
|
end
|
|
974
1166
|
|
|
975
1167
|
# Write to Arrow IPC record batch stream.
|
|
@@ -1049,9 +1241,16 @@ module Polars
|
|
|
1049
1241
|
file,
|
|
1050
1242
|
compression: "zstd",
|
|
1051
1243
|
compression_level: nil,
|
|
1052
|
-
statistics:
|
|
1244
|
+
statistics: true,
|
|
1053
1245
|
row_group_size: nil,
|
|
1054
|
-
data_page_size: nil
|
|
1246
|
+
data_page_size: nil,
|
|
1247
|
+
partition_by: nil,
|
|
1248
|
+
partition_chunk_size_bytes: 4_294_967_296,
|
|
1249
|
+
storage_options: nil,
|
|
1250
|
+
credential_provider: "auto",
|
|
1251
|
+
retries: 2,
|
|
1252
|
+
metadata: nil,
|
|
1253
|
+
mkdir: false
|
|
1055
1254
|
)
|
|
1056
1255
|
if compression.nil?
|
|
1057
1256
|
compression = "uncompressed"
|
|
@@ -1060,26 +1259,23 @@ module Polars
|
|
|
1060
1259
|
file = Utils.normalize_filepath(file)
|
|
1061
1260
|
end
|
|
1062
1261
|
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
max: true,
|
|
1067
|
-
distinct_count: false,
|
|
1068
|
-
null_count: true
|
|
1069
|
-
}
|
|
1070
|
-
elsif statistics == false
|
|
1071
|
-
statistics = {}
|
|
1072
|
-
elsif statistics == "full"
|
|
1073
|
-
statistics = {
|
|
1074
|
-
min: true,
|
|
1075
|
-
max: true,
|
|
1076
|
-
distinct_count: true,
|
|
1077
|
-
null_count: true
|
|
1078
|
-
}
|
|
1262
|
+
target = file
|
|
1263
|
+
if !partition_by.nil?
|
|
1264
|
+
raise Todo
|
|
1079
1265
|
end
|
|
1080
1266
|
|
|
1081
|
-
|
|
1082
|
-
|
|
1267
|
+
lazy.sink_parquet(
|
|
1268
|
+
target,
|
|
1269
|
+
compression: compression,
|
|
1270
|
+
compression_level: compression_level,
|
|
1271
|
+
statistics: statistics,
|
|
1272
|
+
row_group_size: row_group_size,
|
|
1273
|
+
data_page_size: data_page_size,
|
|
1274
|
+
storage_options: storage_options,
|
|
1275
|
+
credential_provider: credential_provider,
|
|
1276
|
+
retries: retries,
|
|
1277
|
+
metadata: metadata,
|
|
1278
|
+
mkdir: mkdir
|
|
1083
1279
|
)
|
|
1084
1280
|
end
|
|
1085
1281
|
|
|
@@ -1332,7 +1528,7 @@ module Polars
|
|
|
1332
1528
|
# "y" => 1_000_000.times.map { |v| v / 1000.0 },
|
|
1333
1529
|
# "z" => 1_000_000.times.map(&:to_s)
|
|
1334
1530
|
# },
|
|
1335
|
-
# schema: {"x" =>
|
|
1531
|
+
# schema: {"x" => Polars::UInt32, "y" => Polars::Float64, "z" => Polars::String}
|
|
1336
1532
|
# )
|
|
1337
1533
|
# df.estimated_size
|
|
1338
1534
|
# # => 25888898
|
|
@@ -1464,14 +1660,14 @@ module Polars
|
|
|
1464
1660
|
# # │ 3 ┆ 8 ┆ c │
|
|
1465
1661
|
# # └───────┴─────┴─────┘
|
|
1466
1662
|
def rename(mapping, strict: true)
|
|
1467
|
-
lazy.rename(mapping, strict: strict).collect(
|
|
1663
|
+
lazy.rename(mapping, strict: strict).collect(optimizations: QueryOptFlags._eager)
|
|
1468
1664
|
end
|
|
1469
1665
|
|
|
1470
1666
|
# Insert a Series at a certain column index. This operation is in place.
|
|
1471
1667
|
#
|
|
1472
1668
|
# @param index [Integer]
|
|
1473
1669
|
# Column to insert the new `Series` column.
|
|
1474
|
-
# @param
|
|
1670
|
+
# @param column [Series]
|
|
1475
1671
|
# `Series` to insert.
|
|
1476
1672
|
#
|
|
1477
1673
|
# @return [DataFrame]
|
|
@@ -1514,19 +1710,22 @@ module Polars
|
|
|
1514
1710
|
# # │ 3 ┆ 10.0 ┆ false ┆ 20.5 │
|
|
1515
1711
|
# # │ 4 ┆ 13.0 ┆ true ┆ 0.0 │
|
|
1516
1712
|
# # └─────┴──────┴───────┴──────┘
|
|
1517
|
-
def insert_column(index,
|
|
1713
|
+
def insert_column(index, column)
|
|
1518
1714
|
if index < 0
|
|
1519
|
-
index =
|
|
1715
|
+
index = width + index
|
|
1520
1716
|
end
|
|
1521
|
-
_df.insert_column(index,
|
|
1717
|
+
_df.insert_column(index, column._s)
|
|
1522
1718
|
self
|
|
1523
1719
|
end
|
|
1524
|
-
alias_method :insert_at_idx, :insert_column
|
|
1525
1720
|
|
|
1526
1721
|
# Filter the rows in the DataFrame based on a predicate expression.
|
|
1527
1722
|
#
|
|
1528
|
-
# @param
|
|
1529
|
-
# Expression that
|
|
1723
|
+
# @param predicates [Array]
|
|
1724
|
+
# Expression(s) that evaluate to a boolean Series.
|
|
1725
|
+
# @param constraints [Hash]
|
|
1726
|
+
# Column filters; use `name = value` to filter columns by the supplied value.
|
|
1727
|
+
# Each constraint will behave the same as `Polars.col(name).eq(value)`, and
|
|
1728
|
+
# be implicitly joined with the other filter conditions using `&`.
|
|
1530
1729
|
#
|
|
1531
1730
|
# @return [DataFrame]
|
|
1532
1731
|
#
|
|
@@ -1561,15 +1760,15 @@ module Polars
|
|
|
1561
1760
|
# # ╞═════╪═════╪═════╡
|
|
1562
1761
|
# # │ 1 ┆ 6 ┆ a │
|
|
1563
1762
|
# # └─────┴─────┴─────┘
|
|
1564
|
-
def filter(
|
|
1565
|
-
lazy.filter(
|
|
1763
|
+
def filter(*predicates, **constraints)
|
|
1764
|
+
lazy.filter(*predicates, **constraints).collect(optimizations: QueryOptFlags._eager)
|
|
1566
1765
|
end
|
|
1567
1766
|
|
|
1568
1767
|
# Remove rows, dropping those that match the given predicate expression(s).
|
|
1569
1768
|
#
|
|
1570
1769
|
# The original order of the remaining rows is preserved.
|
|
1571
1770
|
#
|
|
1572
|
-
# Rows where the filter predicate does not evaluate to
|
|
1771
|
+
# Rows where the filter predicate does not evaluate to true are retained
|
|
1573
1772
|
# (this includes rows where the predicate evaluates as `null`).
|
|
1574
1773
|
#
|
|
1575
1774
|
# @param predicates [Array]
|
|
@@ -1682,77 +1881,178 @@ module Polars
|
|
|
1682
1881
|
)
|
|
1683
1882
|
lazy
|
|
1684
1883
|
.remove(*predicates, **constraints)
|
|
1685
|
-
.collect(
|
|
1884
|
+
.collect(optimizations: QueryOptFlags._eager)
|
|
1686
1885
|
end
|
|
1687
1886
|
|
|
1688
|
-
#
|
|
1887
|
+
# Return a dense preview of the DataFrame.
|
|
1689
1888
|
#
|
|
1690
|
-
#
|
|
1889
|
+
# The formatting shows one line per column so that wide dataframes display
|
|
1890
|
+
# cleanly. Each line shows the column name, the data type, and the first
|
|
1891
|
+
# few values.
|
|
1691
1892
|
#
|
|
1692
|
-
# @
|
|
1893
|
+
# @param max_items_per_column [Integer]
|
|
1894
|
+
# Maximum number of items to show per column.
|
|
1895
|
+
# @param max_colname_length [Integer]
|
|
1896
|
+
# Maximum length of the displayed column names; values that exceed
|
|
1897
|
+
# this value are truncated with a trailing ellipsis.
|
|
1898
|
+
# @param return_type [nil, 'self', 'frame', 'string']
|
|
1899
|
+
# Modify the return format:
|
|
1900
|
+
#
|
|
1901
|
+
# - `nil` (default): Print the glimpse output to stdout, returning `nil`.
|
|
1902
|
+
# - `"self"`: Print the glimpse output to stdout, returning the *original* frame.
|
|
1903
|
+
# - `"frame"`: Return the glimpse output as a new DataFrame.
|
|
1904
|
+
# - `"string"`: Return the glimpse output as a string.
|
|
1905
|
+
#
|
|
1906
|
+
# @return [Object]
|
|
1907
|
+
#
|
|
1908
|
+
# @example Return the glimpse output as a DataFrame:
|
|
1693
1909
|
# df = Polars::DataFrame.new(
|
|
1694
1910
|
# {
|
|
1695
1911
|
# "a" => [1.0, 2.8, 3.0],
|
|
1696
1912
|
# "b" => [4, 5, nil],
|
|
1697
1913
|
# "c" => [true, false, true],
|
|
1698
1914
|
# "d" => [nil, "b", "c"],
|
|
1699
|
-
# "e" => ["usd", "eur", nil]
|
|
1915
|
+
# "e" => ["usd", "eur", nil],
|
|
1916
|
+
# "f" => [Date.new(2020, 1, 1), Date.new(2021, 1, 2), Date.new(2022, 1, 1)]
|
|
1700
1917
|
# }
|
|
1701
1918
|
# )
|
|
1702
|
-
# df.
|
|
1919
|
+
# df.glimpse(return_type: "frame")
|
|
1703
1920
|
# # =>
|
|
1704
|
-
# # shape: (
|
|
1705
|
-
# #
|
|
1706
|
-
# # │
|
|
1707
|
-
# # │ ---
|
|
1708
|
-
# # │ str
|
|
1709
|
-
# #
|
|
1710
|
-
# # │
|
|
1711
|
-
# # │
|
|
1712
|
-
# # │
|
|
1713
|
-
# # │
|
|
1714
|
-
# # │
|
|
1715
|
-
# # │
|
|
1716
|
-
# #
|
|
1717
|
-
|
|
1718
|
-
|
|
1719
|
-
|
|
1720
|
-
|
|
1721
|
-
|
|
1722
|
-
|
|
1723
|
-
|
|
1724
|
-
|
|
1725
|
-
|
|
1726
|
-
|
|
1727
|
-
|
|
1728
|
-
|
|
1921
|
+
# # shape: (6, 3)
|
|
1922
|
+
# # ┌────────┬───────┬─────────────────────────────────┐
|
|
1923
|
+
# # │ column ┆ dtype ┆ values │
|
|
1924
|
+
# # │ --- ┆ --- ┆ --- │
|
|
1925
|
+
# # │ str ┆ str ┆ list[str] │
|
|
1926
|
+
# # ╞════════╪═══════╪═════════════════════════════════╡
|
|
1927
|
+
# # │ a ┆ f64 ┆ ["1.0", "2.8", "3.0"] │
|
|
1928
|
+
# # │ b ┆ i64 ┆ ["4", "5", null] │
|
|
1929
|
+
# # │ c ┆ bool ┆ ["true", "false", "true"] │
|
|
1930
|
+
# # │ d ┆ str ┆ [null, ""b"", ""c""] │
|
|
1931
|
+
# # │ e ┆ str ┆ [""usd"", ""eur"", null] │
|
|
1932
|
+
# # │ f ┆ date ┆ ["2020-01-01", "2021-01-02", "… │
|
|
1933
|
+
# # └────────┴───────┴─────────────────────────────────┘
|
|
1934
|
+
def glimpse(
|
|
1935
|
+
max_items_per_column: 10,
|
|
1936
|
+
max_colname_length: 50,
|
|
1937
|
+
return_type: nil
|
|
1938
|
+
)
|
|
1939
|
+
if return_type.nil?
|
|
1940
|
+
return_frame = false
|
|
1941
|
+
else
|
|
1942
|
+
return_frame = return_type == "frame"
|
|
1943
|
+
if !return_frame && !["self", "string"].include?(return_type)
|
|
1944
|
+
msg = "invalid `return_type`; found #{return_type.inspect}, expected one of 'string', 'frame', 'self', or nil"
|
|
1945
|
+
raise ArgumentError, msg
|
|
1729
1946
|
end
|
|
1730
|
-
self.class.new(columns)
|
|
1731
1947
|
end
|
|
1732
1948
|
|
|
1733
|
-
|
|
1734
|
-
|
|
1735
|
-
|
|
1736
|
-
|
|
1737
|
-
|
|
1738
|
-
|
|
1739
|
-
|
|
1740
|
-
|
|
1741
|
-
|
|
1742
|
-
|
|
1743
|
-
|
|
1744
|
-
|
|
1745
|
-
|
|
1746
|
-
|
|
1747
|
-
|
|
1748
|
-
|
|
1749
|
-
|
|
1750
|
-
|
|
1751
|
-
|
|
1752
|
-
|
|
1949
|
+
# always print at most this number of values (mainly ensures that
|
|
1950
|
+
# we do not cast long arrays to strings, which would be slow)
|
|
1951
|
+
max_n_values = [max_items_per_column, height].min
|
|
1952
|
+
schema = self.schema
|
|
1953
|
+
|
|
1954
|
+
_column_to_row_output = lambda do |col_name, dtype|
|
|
1955
|
+
fn = schema[col_name] == String ? :inspect : :to_s
|
|
1956
|
+
values = self[0...max_n_values, col_name].to_a
|
|
1957
|
+
if col_name.length > max_colname_length
|
|
1958
|
+
col_name = col_name[0...(max_colname_length - 1)] + "…"
|
|
1959
|
+
end
|
|
1960
|
+
dtype_str = Plr.dtype_str_repr(dtype)
|
|
1961
|
+
if !return_frame
|
|
1962
|
+
dtype_str = "<#{dtype_str}>"
|
|
1963
|
+
end
|
|
1964
|
+
[col_name, dtype_str, values.map { |v| !v.nil? ? v.send(fn) : nil }]
|
|
1965
|
+
end
|
|
1966
|
+
|
|
1967
|
+
data = self.schema.map { |s, dtype| _column_to_row_output.(s, dtype) }
|
|
1968
|
+
|
|
1969
|
+
# output one row per column
|
|
1970
|
+
if return_frame
|
|
1971
|
+
DataFrame.new(
|
|
1972
|
+
data,
|
|
1973
|
+
orient: "row",
|
|
1974
|
+
schema: {"column" => String, "dtype" => String, "values" => List.new(String)}
|
|
1753
1975
|
)
|
|
1976
|
+
else
|
|
1977
|
+
raise Todo
|
|
1978
|
+
end
|
|
1979
|
+
end
|
|
1980
|
+
|
|
1981
|
+
# Summary statistics for a DataFrame.
|
|
1982
|
+
#
|
|
1983
|
+
# @param percentiles [Array]
|
|
1984
|
+
# One or more percentiles to include in the summary statistics.
|
|
1985
|
+
# All values must be in the range `[0, 1]`.
|
|
1986
|
+
# @param interpolation ['nearest', 'higher', 'lower', 'midpoint', 'linear', 'equiprobable']
|
|
1987
|
+
# Interpolation method used when calculating percentiles.
|
|
1988
|
+
#
|
|
1989
|
+
# @return [DataFrame]
|
|
1990
|
+
#
|
|
1991
|
+
# @example Show default frame statistics:
|
|
1992
|
+
# df = Polars::DataFrame.new(
|
|
1993
|
+
# {
|
|
1994
|
+
# "float" => [1.0, 2.8, 3.0],
|
|
1995
|
+
# "int" => [40, 50, nil],
|
|
1996
|
+
# "bool" => [true, false, true],
|
|
1997
|
+
# "str" => ["zz", "xx", "yy"],
|
|
1998
|
+
# "date" => [Date.new(2020, 1, 1), Date.new(2021, 7, 5), Date.new(2022, 12, 31)]
|
|
1999
|
+
# }
|
|
2000
|
+
# )
|
|
2001
|
+
# df.describe
|
|
2002
|
+
# # =>
|
|
2003
|
+
# # shape: (9, 6)
|
|
2004
|
+
# # ┌────────────┬──────────┬──────────┬──────────┬──────┬─────────────────────────┐
|
|
2005
|
+
# # │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date │
|
|
2006
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
|
2007
|
+
# # │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str │
|
|
2008
|
+
# # ╞════════════╪══════════╪══════════╪══════════╪══════╪═════════════════════════╡
|
|
2009
|
+
# # │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 │
|
|
2010
|
+
# # │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 │
|
|
2011
|
+
# # │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 16:00:00 UTC │
|
|
2012
|
+
# # │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null │
|
|
2013
|
+
# # │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 │
|
|
2014
|
+
# # │ 25% ┆ 2.8 ┆ 40.0 ┆ null ┆ null ┆ 2021-07-05 │
|
|
2015
|
+
# # │ 50% ┆ 2.8 ┆ 50.0 ┆ null ┆ null ┆ 2021-07-05 │
|
|
2016
|
+
# # │ 75% ┆ 3.0 ┆ 50.0 ┆ null ┆ null ┆ 2022-12-31 │
|
|
2017
|
+
# # │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 │
|
|
2018
|
+
# # └────────────┴──────────┴──────────┴──────────┴──────┴─────────────────────────┘
|
|
2019
|
+
#
|
|
2020
|
+
# @example Customize which percentiles are displayed, applying linear interpolation:
|
|
2021
|
+
# df.describe(
|
|
2022
|
+
# percentiles: [0.1, 0.3, 0.5, 0.7, 0.9],
|
|
2023
|
+
# interpolation: "linear"
|
|
2024
|
+
# )
|
|
2025
|
+
# # =>
|
|
2026
|
+
# # shape: (11, 6)
|
|
2027
|
+
# # ┌────────────┬──────────┬──────────┬──────────┬──────┬─────────────────────────┐
|
|
2028
|
+
# # │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date │
|
|
2029
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
|
2030
|
+
# # │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str │
|
|
2031
|
+
# # ╞════════════╪══════════╪══════════╪══════════╪══════╪═════════════════════════╡
|
|
2032
|
+
# # │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 │
|
|
2033
|
+
# # │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 │
|
|
2034
|
+
# # │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 16:00:00 UTC │
|
|
2035
|
+
# # │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null │
|
|
2036
|
+
# # │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 │
|
|
2037
|
+
# # │ … ┆ … ┆ … ┆ … ┆ … ┆ … │
|
|
2038
|
+
# # │ 30% ┆ 2.08 ┆ 43.0 ┆ null ┆ null ┆ 2020-11-26 │
|
|
2039
|
+
# # │ 50% ┆ 2.8 ┆ 45.0 ┆ null ┆ null ┆ 2021-07-05 │
|
|
2040
|
+
# # │ 70% ┆ 2.88 ┆ 47.0 ┆ null ┆ null ┆ 2022-02-07 │
|
|
2041
|
+
# # │ 90% ┆ 2.96 ┆ 49.0 ┆ null ┆ null ┆ 2022-09-13 │
|
|
2042
|
+
# # │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 │
|
|
2043
|
+
# # └────────────┴──────────┴──────────┴──────────┴──────┴─────────────────────────┘
|
|
2044
|
+
def describe(
|
|
2045
|
+
percentiles: [0.25, 0.5, 0.75],
|
|
2046
|
+
interpolation: "nearest"
|
|
2047
|
+
)
|
|
2048
|
+
if columns.empty?
|
|
2049
|
+
msg = "cannot describe a DataFrame that has no columns"
|
|
2050
|
+
raise TypeError, msg
|
|
2051
|
+
end
|
|
2052
|
+
|
|
2053
|
+
lazy.describe(
|
|
2054
|
+
percentiles: percentiles, interpolation: interpolation
|
|
1754
2055
|
)
|
|
1755
|
-
summary
|
|
1756
2056
|
end
|
|
1757
2057
|
|
|
1758
2058
|
# Find the index of a column by name.
|
|
@@ -1771,13 +2071,12 @@ module Polars
|
|
|
1771
2071
|
def get_column_index(name)
|
|
1772
2072
|
_df.get_column_index(name)
|
|
1773
2073
|
end
|
|
1774
|
-
alias_method :find_idx_by_name, :get_column_index
|
|
1775
2074
|
|
|
1776
2075
|
# Replace a column at an index location.
|
|
1777
2076
|
#
|
|
1778
2077
|
# @param index [Integer]
|
|
1779
2078
|
# Column index.
|
|
1780
|
-
# @param
|
|
2079
|
+
# @param column [Series]
|
|
1781
2080
|
# Series that will replace the column.
|
|
1782
2081
|
#
|
|
1783
2082
|
# @return [DataFrame]
|
|
@@ -1803,23 +2102,31 @@ module Polars
|
|
|
1803
2102
|
# # │ 20 ┆ 7 ┆ b │
|
|
1804
2103
|
# # │ 30 ┆ 8 ┆ c │
|
|
1805
2104
|
# # └───────┴─────┴─────┘
|
|
1806
|
-
def replace_column(index,
|
|
2105
|
+
def replace_column(index, column)
|
|
1807
2106
|
if index < 0
|
|
1808
|
-
index =
|
|
2107
|
+
index = width + index
|
|
1809
2108
|
end
|
|
1810
|
-
_df.replace_column(index,
|
|
2109
|
+
_df.replace_column(index, column._s)
|
|
1811
2110
|
self
|
|
1812
2111
|
end
|
|
1813
|
-
alias_method :replace_at_idx, :replace_column
|
|
1814
2112
|
|
|
1815
|
-
# Sort the
|
|
2113
|
+
# Sort the dataframe by the given columns.
|
|
1816
2114
|
#
|
|
1817
|
-
# @param by [
|
|
1818
|
-
#
|
|
1819
|
-
#
|
|
1820
|
-
#
|
|
2115
|
+
# @param by [Object]
|
|
2116
|
+
# Column(s) to sort by. Accepts expression input, including selectors. Strings
|
|
2117
|
+
# are parsed as column names.
|
|
2118
|
+
# @param more_by [Array]
|
|
2119
|
+
# Additional columns to sort by, specified as positional arguments.
|
|
2120
|
+
# @param descending [Boolean]
|
|
2121
|
+
# Sort in descending order. When sorting by multiple columns, can be specified
|
|
2122
|
+
# per column by passing an array of booleans.
|
|
1821
2123
|
# @param nulls_last [Boolean]
|
|
1822
|
-
# Place null values last
|
|
2124
|
+
# Place null values last; can specify a single boolean applying to all columns
|
|
2125
|
+
# or an array of booleans for per-column control.
|
|
2126
|
+
# @param multithreaded [Boolean]
|
|
2127
|
+
# Sort using multiple threads.
|
|
2128
|
+
# @param maintain_order [Boolean]
|
|
2129
|
+
# Whether the order should be maintained if elements are equal.
|
|
1823
2130
|
#
|
|
1824
2131
|
# @return [DataFrame]
|
|
1825
2132
|
#
|
|
@@ -1831,7 +2138,7 @@ module Polars
|
|
|
1831
2138
|
# "ham" => ["a", "b", "c"]
|
|
1832
2139
|
# }
|
|
1833
2140
|
# )
|
|
1834
|
-
# df.sort("foo",
|
|
2141
|
+
# df.sort("foo", descending: true)
|
|
1835
2142
|
# # =>
|
|
1836
2143
|
# # shape: (3, 3)
|
|
1837
2144
|
# # ┌─────┬─────┬─────┐
|
|
@@ -1847,7 +2154,7 @@ module Polars
|
|
|
1847
2154
|
# @example Sort by multiple columns.
|
|
1848
2155
|
# df.sort(
|
|
1849
2156
|
# [Polars.col("foo"), Polars.col("bar")**2],
|
|
1850
|
-
#
|
|
2157
|
+
# descending: [true, false]
|
|
1851
2158
|
# )
|
|
1852
2159
|
# # =>
|
|
1853
2160
|
# # shape: (3, 3)
|
|
@@ -1860,24 +2167,38 @@ module Polars
|
|
|
1860
2167
|
# # │ 2 ┆ 7.0 ┆ b │
|
|
1861
2168
|
# # │ 1 ┆ 6.0 ┆ a │
|
|
1862
2169
|
# # └─────┴─────┴─────┘
|
|
1863
|
-
def sort(
|
|
2170
|
+
def sort(
|
|
2171
|
+
by,
|
|
2172
|
+
*more_by,
|
|
2173
|
+
descending: false,
|
|
2174
|
+
nulls_last: false,
|
|
2175
|
+
multithreaded: true,
|
|
2176
|
+
maintain_order: false
|
|
2177
|
+
)
|
|
1864
2178
|
lazy
|
|
1865
|
-
.sort(
|
|
1866
|
-
|
|
2179
|
+
.sort(
|
|
2180
|
+
by,
|
|
2181
|
+
*more_by,
|
|
2182
|
+
descending: descending,
|
|
2183
|
+
nulls_last: nulls_last,
|
|
2184
|
+
multithreaded: multithreaded,
|
|
2185
|
+
maintain_order: maintain_order
|
|
2186
|
+
)
|
|
2187
|
+
.collect(optimizations: QueryOptFlags._eager)
|
|
1867
2188
|
end
|
|
1868
2189
|
|
|
1869
2190
|
# Sort the DataFrame by column in-place.
|
|
1870
2191
|
#
|
|
1871
2192
|
# @param by [String]
|
|
1872
2193
|
# By which column to sort.
|
|
1873
|
-
# @param
|
|
2194
|
+
# @param descending [Boolean]
|
|
1874
2195
|
# Reverse/descending sort.
|
|
1875
2196
|
# @param nulls_last [Boolean]
|
|
1876
2197
|
# Place null values last. Can only be used if sorted by a single column.
|
|
1877
2198
|
#
|
|
1878
2199
|
# @return [DataFrame]
|
|
1879
|
-
def sort!(by,
|
|
1880
|
-
self._df = sort(by,
|
|
2200
|
+
def sort!(by, descending: false, nulls_last: false)
|
|
2201
|
+
self._df = sort(by, descending: descending, nulls_last: nulls_last)._df
|
|
1881
2202
|
end
|
|
1882
2203
|
|
|
1883
2204
|
# Execute a SQL query against the DataFrame.
|
|
@@ -1949,7 +2270,7 @@ module Polars
|
|
|
1949
2270
|
# # │ 3 ┆ false ┆ xx:xx ┆ 2077 ┆ 0.0 │
|
|
1950
2271
|
# # └─────┴───────────┴───────┴──────┴──────┘
|
|
1951
2272
|
def sql(query, table_name: "self")
|
|
1952
|
-
ctx = SQLContext.new(
|
|
2273
|
+
ctx = SQLContext.new(eager: true)
|
|
1953
2274
|
name = table_name || "self"
|
|
1954
2275
|
ctx.register(name, self)
|
|
1955
2276
|
ctx.execute(query)
|
|
@@ -1969,7 +2290,7 @@ module Polars
|
|
|
1969
2290
|
# Accepts expression input. Strings are parsed as column names.
|
|
1970
2291
|
# @param reverse [Object]
|
|
1971
2292
|
# Consider the `k` smallest elements of the `by` column(s) (instead of the `k`
|
|
1972
|
-
# largest). This can be specified per column by passing
|
|
2293
|
+
# largest). This can be specified per column by passing an array of
|
|
1973
2294
|
# booleans.
|
|
1974
2295
|
#
|
|
1975
2296
|
# @return [DataFrame]
|
|
@@ -2017,12 +2338,12 @@ module Polars
|
|
|
2017
2338
|
lazy
|
|
2018
2339
|
.top_k(k, by: by, reverse: reverse)
|
|
2019
2340
|
.collect(
|
|
2020
|
-
|
|
2021
|
-
|
|
2022
|
-
|
|
2023
|
-
|
|
2024
|
-
|
|
2025
|
-
|
|
2341
|
+
optimizations: QueryOptFlags.new(
|
|
2342
|
+
projection_pushdown: false,
|
|
2343
|
+
predicate_pushdown: false,
|
|
2344
|
+
comm_subplan_elim: false,
|
|
2345
|
+
slice_pushdown: true
|
|
2346
|
+
)
|
|
2026
2347
|
)
|
|
2027
2348
|
end
|
|
2028
2349
|
|
|
@@ -2040,7 +2361,7 @@ module Polars
|
|
|
2040
2361
|
# Accepts expression input. Strings are parsed as column names.
|
|
2041
2362
|
# @param reverse [Object]
|
|
2042
2363
|
# Consider the `k` largest elements of the `by` column(s) (instead of the `k`
|
|
2043
|
-
# smallest). This can be specified per column by passing
|
|
2364
|
+
# smallest). This can be specified per column by passing an array of
|
|
2044
2365
|
# booleans.
|
|
2045
2366
|
#
|
|
2046
2367
|
# @return [DataFrame]
|
|
@@ -2088,12 +2409,12 @@ module Polars
|
|
|
2088
2409
|
lazy
|
|
2089
2410
|
.bottom_k(k, by: by, reverse: reverse)
|
|
2090
2411
|
.collect(
|
|
2091
|
-
|
|
2092
|
-
|
|
2093
|
-
|
|
2094
|
-
|
|
2095
|
-
|
|
2096
|
-
|
|
2412
|
+
optimizations: QueryOptFlags.new(
|
|
2413
|
+
projection_pushdown: false,
|
|
2414
|
+
predicate_pushdown: false,
|
|
2415
|
+
comm_subplan_elim: false,
|
|
2416
|
+
slice_pushdown: true
|
|
2417
|
+
)
|
|
2097
2418
|
)
|
|
2098
2419
|
end
|
|
2099
2420
|
|
|
@@ -2128,36 +2449,6 @@ module Polars
|
|
|
2128
2449
|
def equals(other, null_equal: true)
|
|
2129
2450
|
_df.equals(other._df, null_equal)
|
|
2130
2451
|
end
|
|
2131
|
-
alias_method :frame_equal, :equals
|
|
2132
|
-
|
|
2133
|
-
# Replace a column by a new Series.
|
|
2134
|
-
#
|
|
2135
|
-
# @param column [String]
|
|
2136
|
-
# Column to replace.
|
|
2137
|
-
# @param new_col [Series]
|
|
2138
|
-
# New column to insert.
|
|
2139
|
-
#
|
|
2140
|
-
# @return [DataFrame]
|
|
2141
|
-
#
|
|
2142
|
-
# @example
|
|
2143
|
-
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
|
|
2144
|
-
# s = Polars::Series.new([10, 20, 30])
|
|
2145
|
-
# df.replace("foo", s)
|
|
2146
|
-
# # =>
|
|
2147
|
-
# # shape: (3, 2)
|
|
2148
|
-
# # ┌─────┬─────┐
|
|
2149
|
-
# # │ foo ┆ bar │
|
|
2150
|
-
# # │ --- ┆ --- │
|
|
2151
|
-
# # │ i64 ┆ i64 │
|
|
2152
|
-
# # ╞═════╪═════╡
|
|
2153
|
-
# # │ 10 ┆ 4 │
|
|
2154
|
-
# # │ 20 ┆ 5 │
|
|
2155
|
-
# # │ 30 ┆ 6 │
|
|
2156
|
-
# # └─────┴─────┘
|
|
2157
|
-
def replace(column, new_col)
|
|
2158
|
-
_df.replace(column.to_s, new_col._s)
|
|
2159
|
-
self
|
|
2160
|
-
end
|
|
2161
2452
|
|
|
2162
2453
|
# Get a slice of this DataFrame.
|
|
2163
2454
|
#
|
|
@@ -2330,7 +2621,7 @@ module Polars
|
|
|
2330
2621
|
# # │ 80.0 ┆ 25.5 ┆ null │
|
|
2331
2622
|
# # └──────┴───────┴──────┘
|
|
2332
2623
|
def drop_nans(subset: nil)
|
|
2333
|
-
lazy.drop_nans(subset: subset).collect(
|
|
2624
|
+
lazy.drop_nans(subset: subset).collect(optimizations: QueryOptFlags._eager)
|
|
2334
2625
|
end
|
|
2335
2626
|
|
|
2336
2627
|
# Drop all rows that contain one or more null values.
|
|
@@ -2375,12 +2666,12 @@ module Polars
|
|
|
2375
2666
|
# # │ 3 ┆ 8 ┆ null │
|
|
2376
2667
|
# # └─────┴─────┴──────┘
|
|
2377
2668
|
def drop_nulls(subset: nil)
|
|
2378
|
-
lazy.drop_nulls(subset: subset).collect(
|
|
2669
|
+
lazy.drop_nulls(subset: subset).collect(optimizations: QueryOptFlags._eager)
|
|
2379
2670
|
end
|
|
2380
2671
|
|
|
2381
2672
|
# Offers a structured way to apply a sequence of user-defined functions (UDFs).
|
|
2382
2673
|
#
|
|
2383
|
-
# @param
|
|
2674
|
+
# @param function [Object]
|
|
2384
2675
|
# Callable; will receive the frame as the first parameter,
|
|
2385
2676
|
# followed by any given args/kwargs.
|
|
2386
2677
|
# @param args [Object]
|
|
@@ -2397,7 +2688,7 @@ module Polars
|
|
|
2397
2688
|
#
|
|
2398
2689
|
# @example
|
|
2399
2690
|
# cast_str_to_int = lambda do |data, col_name:|
|
|
2400
|
-
# data.
|
|
2691
|
+
# data.with_columns(Polars.col(col_name).cast(Polars::Int64))
|
|
2401
2692
|
# end
|
|
2402
2693
|
#
|
|
2403
2694
|
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]})
|
|
@@ -2414,8 +2705,8 @@ module Polars
|
|
|
2414
2705
|
# # │ 3 ┆ 30 │
|
|
2415
2706
|
# # │ 4 ┆ 40 │
|
|
2416
2707
|
# # └─────┴─────┘
|
|
2417
|
-
def pipe(
|
|
2418
|
-
|
|
2708
|
+
def pipe(function, *args, **kwargs, &block)
|
|
2709
|
+
function.(self, *args, **kwargs, &block)
|
|
2419
2710
|
end
|
|
2420
2711
|
|
|
2421
2712
|
# Add a column at index 0 that counts the rows.
|
|
@@ -2449,7 +2740,6 @@ module Polars
|
|
|
2449
2740
|
def with_row_index(name: "index", offset: 0)
|
|
2450
2741
|
_from_rbdf(_df.with_row_index(name, offset))
|
|
2451
2742
|
end
|
|
2452
|
-
alias_method :with_row_count, :with_row_index
|
|
2453
2743
|
|
|
2454
2744
|
# Start a group by operation.
|
|
2455
2745
|
#
|
|
@@ -2459,6 +2749,9 @@ module Polars
|
|
|
2459
2749
|
# Make sure that the order of the groups remain consistent. This is more
|
|
2460
2750
|
# expensive than a default group by. Note that this only works in expression
|
|
2461
2751
|
# aggregations.
|
|
2752
|
+
# @param named_by [Hash]
|
|
2753
|
+
# Additional columns to group by, specified as keyword arguments.
|
|
2754
|
+
# The columns will be renamed to the keyword used.
|
|
2462
2755
|
#
|
|
2463
2756
|
# @return [GroupBy]
|
|
2464
2757
|
#
|
|
@@ -2482,23 +2775,23 @@ module Polars
|
|
|
2482
2775
|
# # │ b ┆ 11 │
|
|
2483
2776
|
# # │ c ┆ 6 │
|
|
2484
2777
|
# # └─────┴─────┘
|
|
2485
|
-
def group_by(by, maintain_order: false)
|
|
2486
|
-
|
|
2487
|
-
|
|
2778
|
+
def group_by(by, maintain_order: false, **named_by)
|
|
2779
|
+
named_by.each do |_, value|
|
|
2780
|
+
if !(value.is_a?(::String) || value.is_a?(Expr) || value.is_a?(Series))
|
|
2781
|
+
msg = "Expected Polars expression or object convertible to one, got #{value.class.name}."
|
|
2782
|
+
raise TypeError, msg
|
|
2783
|
+
end
|
|
2488
2784
|
end
|
|
2489
2785
|
GroupBy.new(
|
|
2490
2786
|
self,
|
|
2491
2787
|
by,
|
|
2788
|
+
**named_by,
|
|
2492
2789
|
maintain_order: maintain_order
|
|
2493
2790
|
)
|
|
2494
2791
|
end
|
|
2495
|
-
alias_method :groupby, :group_by
|
|
2496
|
-
alias_method :group, :group_by
|
|
2497
2792
|
|
|
2498
2793
|
# Create rolling groups based on a time column.
|
|
2499
2794
|
#
|
|
2500
|
-
# Also works for index values of type `:i32` or `:i64`.
|
|
2501
|
-
#
|
|
2502
2795
|
# Different from a `dynamic_group_by` the windows are now determined by the
|
|
2503
2796
|
# individual values and are not of constant intervals. For constant intervals use
|
|
2504
2797
|
# *group_by_dynamic*
|
|
@@ -2532,16 +2825,16 @@ module Polars
|
|
|
2532
2825
|
# This column must be sorted in ascending order. If not the output will not
|
|
2533
2826
|
# make sense.
|
|
2534
2827
|
#
|
|
2535
|
-
# In case of a rolling
|
|
2536
|
-
#
|
|
2537
|
-
# performance matters use an
|
|
2828
|
+
# In case of a rolling operation on indices, dtype needs to be one of
|
|
2829
|
+
# \\\\{UInt32, UInt64, Int32, Int64}. Note that the first three get temporarily
|
|
2830
|
+
# cast to Int64, so if performance matters use an Int64 column.
|
|
2538
2831
|
# @param period [Object]
|
|
2539
2832
|
# Length of the window.
|
|
2540
2833
|
# @param offset [Object]
|
|
2541
2834
|
# Offset of the window. Default is -period.
|
|
2542
2835
|
# @param closed ["right", "left", "both", "none"]
|
|
2543
2836
|
# Define whether the temporal window interval is closed or not.
|
|
2544
|
-
# @param
|
|
2837
|
+
# @param group_by [Object]
|
|
2545
2838
|
# Also group by this column/these columns.
|
|
2546
2839
|
#
|
|
2547
2840
|
# @return [RollingGroupBy]
|
|
@@ -2555,7 +2848,7 @@ module Polars
|
|
|
2555
2848
|
# "2020-01-03 19:45:32",
|
|
2556
2849
|
# "2020-01-08 23:16:43"
|
|
2557
2850
|
# ]
|
|
2558
|
-
# df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).
|
|
2851
|
+
# df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_columns(
|
|
2559
2852
|
# Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
|
|
2560
2853
|
# )
|
|
2561
2854
|
# df.rolling(index_column: "dt", period: "2d").agg(
|
|
@@ -2584,14 +2877,12 @@ module Polars
|
|
|
2584
2877
|
period:,
|
|
2585
2878
|
offset: nil,
|
|
2586
2879
|
closed: "right",
|
|
2587
|
-
|
|
2880
|
+
group_by: nil
|
|
2588
2881
|
)
|
|
2589
|
-
RollingGroupBy.new(self, index_column, period, offset, closed,
|
|
2882
|
+
RollingGroupBy.new(self, index_column, period, offset, closed, group_by)
|
|
2590
2883
|
end
|
|
2591
|
-
alias_method :groupby_rolling, :rolling
|
|
2592
|
-
alias_method :group_by_rolling, :rolling
|
|
2593
2884
|
|
|
2594
|
-
# Group based on a time value (or index value of type
|
|
2885
|
+
# Group based on a time value (or index value of type Int32, Int64).
|
|
2595
2886
|
#
|
|
2596
2887
|
# Time windows are calculated and rows are assigned to windows. Different from a
|
|
2597
2888
|
# normal group by is that a row can be member of multiple groups. The time/index
|
|
@@ -2634,8 +2925,8 @@ module Polars
|
|
|
2634
2925
|
# make sense.
|
|
2635
2926
|
#
|
|
2636
2927
|
# In case of a dynamic group by on indices, dtype needs to be one of
|
|
2637
|
-
#
|
|
2638
|
-
# performance matters use an
|
|
2928
|
+
# \\\\{Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if
|
|
2929
|
+
# performance matters use an Int64 column.
|
|
2639
2930
|
# @param every
|
|
2640
2931
|
# Interval of the window.
|
|
2641
2932
|
# @param period
|
|
@@ -2643,15 +2934,21 @@ module Polars
|
|
|
2643
2934
|
# @param offset
|
|
2644
2935
|
# Offset of the window if nil and period is nil it will be equal to negative
|
|
2645
2936
|
# `every`.
|
|
2646
|
-
# @param truncate
|
|
2647
|
-
# Truncate the time value to the window lower bound.
|
|
2648
2937
|
# @param include_boundaries
|
|
2649
2938
|
# Add the lower and upper bound of the window to the "_lower_bound" and
|
|
2650
2939
|
# "_upper_bound" columns. This will impact performance because it's harder to
|
|
2651
2940
|
# parallelize
|
|
2652
2941
|
# @param closed ["right", "left", "both", "none"]
|
|
2653
2942
|
# Define whether the temporal window interval is closed or not.
|
|
2654
|
-
# @param
|
|
2943
|
+
# @param label ['left', 'right', 'datapoint']
|
|
2944
|
+
# Define which label to use for the window:
|
|
2945
|
+
#
|
|
2946
|
+
# - 'left': lower boundary of the window
|
|
2947
|
+
# - 'right': upper boundary of the window
|
|
2948
|
+
# - 'datapoint': the first value of the index column in the given window.
|
|
2949
|
+
# If you don't need the label to be at one of the boundaries, choose this
|
|
2950
|
+
# option for maximum performance
|
|
2951
|
+
# @param group_by
|
|
2655
2952
|
# Also group by this column/these columns
|
|
2656
2953
|
# @param start_by ['window', 'datapoint', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
|
|
2657
2954
|
# The strategy to determine the start of the first window by.
|
|
@@ -2793,7 +3090,7 @@ module Polars
|
|
|
2793
3090
|
# "time",
|
|
2794
3091
|
# every: "1h",
|
|
2795
3092
|
# closed: "both",
|
|
2796
|
-
#
|
|
3093
|
+
# group_by: "groups",
|
|
2797
3094
|
# include_boundaries: true
|
|
2798
3095
|
# ).agg([Polars.col("time").count.alias("time_count")])
|
|
2799
3096
|
# # =>
|
|
@@ -2843,10 +3140,10 @@ module Polars
|
|
|
2843
3140
|
every:,
|
|
2844
3141
|
period: nil,
|
|
2845
3142
|
offset: nil,
|
|
2846
|
-
truncate: true,
|
|
2847
3143
|
include_boundaries: false,
|
|
2848
3144
|
closed: "left",
|
|
2849
|
-
|
|
3145
|
+
label: "left",
|
|
3146
|
+
group_by: nil,
|
|
2850
3147
|
start_by: "window"
|
|
2851
3148
|
)
|
|
2852
3149
|
DynamicGroupBy.new(
|
|
@@ -2855,14 +3152,13 @@ module Polars
|
|
|
2855
3152
|
every,
|
|
2856
3153
|
period,
|
|
2857
3154
|
offset,
|
|
2858
|
-
truncate,
|
|
2859
3155
|
include_boundaries,
|
|
2860
3156
|
closed,
|
|
2861
|
-
|
|
3157
|
+
label,
|
|
3158
|
+
group_by,
|
|
2862
3159
|
start_by
|
|
2863
3160
|
)
|
|
2864
3161
|
end
|
|
2865
|
-
alias_method :groupby_dynamic, :group_by_dynamic
|
|
2866
3162
|
|
|
2867
3163
|
# Upsample a DataFrame at a regular frequency.
|
|
2868
3164
|
#
|
|
@@ -2871,7 +3167,7 @@ module Polars
|
|
|
2871
3167
|
# Note that this column has to be sorted for the output to make sense.
|
|
2872
3168
|
# @param every [String]
|
|
2873
3169
|
# interval will start 'every' duration
|
|
2874
|
-
# @param
|
|
3170
|
+
# @param group_by [Object]
|
|
2875
3171
|
# First group by these columns and then upsample for every group
|
|
2876
3172
|
# @param maintain_order [Boolean]
|
|
2877
3173
|
# Keep the ordering predictable. This is slower.
|
|
@@ -2910,7 +3206,7 @@ module Polars
|
|
|
2910
3206
|
# }
|
|
2911
3207
|
# ).set_sorted("time")
|
|
2912
3208
|
# df.upsample(
|
|
2913
|
-
# time_column: "time", every: "1mo",
|
|
3209
|
+
# time_column: "time", every: "1mo", group_by: "groups", maintain_order: true
|
|
2914
3210
|
# ).select(Polars.all.forward_fill)
|
|
2915
3211
|
# # =>
|
|
2916
3212
|
# # shape: (7, 3)
|
|
@@ -2930,20 +3226,20 @@ module Polars
|
|
|
2930
3226
|
def upsample(
|
|
2931
3227
|
time_column:,
|
|
2932
3228
|
every:,
|
|
2933
|
-
|
|
3229
|
+
group_by: nil,
|
|
2934
3230
|
maintain_order: false
|
|
2935
3231
|
)
|
|
2936
|
-
if
|
|
2937
|
-
|
|
3232
|
+
if group_by.nil?
|
|
3233
|
+
group_by = []
|
|
2938
3234
|
end
|
|
2939
|
-
if
|
|
2940
|
-
|
|
3235
|
+
if group_by.is_a?(::String)
|
|
3236
|
+
group_by = [group_by]
|
|
2941
3237
|
end
|
|
2942
3238
|
|
|
2943
3239
|
every = Utils.parse_as_duration_string(every)
|
|
2944
3240
|
|
|
2945
3241
|
_from_rbdf(
|
|
2946
|
-
_df.upsample(
|
|
3242
|
+
_df.upsample(group_by, time_column, every, maintain_order)
|
|
2947
3243
|
)
|
|
2948
3244
|
end
|
|
2949
3245
|
|
|
@@ -3096,7 +3392,7 @@ module Polars
|
|
|
3096
3392
|
allow_exact_matches: allow_exact_matches,
|
|
3097
3393
|
check_sortedness: check_sortedness
|
|
3098
3394
|
)
|
|
3099
|
-
.collect(
|
|
3395
|
+
.collect(optimizations: QueryOptFlags._eager)
|
|
3100
3396
|
end
|
|
3101
3397
|
|
|
3102
3398
|
# Join in SQL-like fashion.
|
|
@@ -3119,7 +3415,7 @@ module Polars
|
|
|
3119
3415
|
# * *one_to_one* - “1:1”: check if join keys are unique in both left and right datasets
|
|
3120
3416
|
# * *one_to_many* - “1:m”: check if join keys are unique in left dataset
|
|
3121
3417
|
# * *many_to_one* - “m:1”: check if join keys are unique in right dataset
|
|
3122
|
-
# @param
|
|
3418
|
+
# @param nulls_equal [Boolean]
|
|
3123
3419
|
# Join on null values. By default null values will never produce matches.
|
|
3124
3420
|
# @param coalesce [Boolean]
|
|
3125
3421
|
# Coalescing behavior (merging of join columns).
|
|
@@ -3235,7 +3531,7 @@ module Polars
|
|
|
3235
3531
|
how: "inner",
|
|
3236
3532
|
suffix: "_right",
|
|
3237
3533
|
validate: "m:m",
|
|
3238
|
-
|
|
3534
|
+
nulls_equal: false,
|
|
3239
3535
|
coalesce: nil,
|
|
3240
3536
|
maintain_order: nil
|
|
3241
3537
|
)
|
|
@@ -3248,11 +3544,11 @@ module Polars
|
|
|
3248
3544
|
how: how,
|
|
3249
3545
|
suffix: suffix,
|
|
3250
3546
|
validate: validate,
|
|
3251
|
-
|
|
3547
|
+
nulls_equal: nulls_equal,
|
|
3252
3548
|
coalesce: coalesce,
|
|
3253
3549
|
maintain_order: maintain_order
|
|
3254
3550
|
)
|
|
3255
|
-
.collect(
|
|
3551
|
+
.collect(optimizations: QueryOptFlags._eager)
|
|
3256
3552
|
end
|
|
3257
3553
|
|
|
3258
3554
|
# Perform a join based on one or multiple (in)equality predicates.
|
|
@@ -3347,7 +3643,7 @@ module Polars
|
|
|
3347
3643
|
*predicates,
|
|
3348
3644
|
suffix: suffix
|
|
3349
3645
|
)
|
|
3350
|
-
.collect(
|
|
3646
|
+
.collect(optimizations: QueryOptFlags._eager)
|
|
3351
3647
|
end
|
|
3352
3648
|
|
|
3353
3649
|
# Apply a custom/user-defined function (UDF) over the rows of the DataFrame.
|
|
@@ -3410,61 +3706,14 @@ module Polars
|
|
|
3410
3706
|
# # │ 9 │
|
|
3411
3707
|
# # │ 14 │
|
|
3412
3708
|
# # └─────┘
|
|
3413
|
-
def map_rows(return_dtype: nil, inference_size: 256, &
|
|
3414
|
-
out, is_df = _df.map_rows(
|
|
3709
|
+
def map_rows(return_dtype: nil, inference_size: 256, &function)
|
|
3710
|
+
out, is_df = _df.map_rows(function, return_dtype, inference_size)
|
|
3415
3711
|
if is_df
|
|
3416
3712
|
_from_rbdf(out)
|
|
3417
3713
|
else
|
|
3418
3714
|
_from_rbdf(Utils.wrap_s(out).to_frame._df)
|
|
3419
3715
|
end
|
|
3420
3716
|
end
|
|
3421
|
-
alias_method :apply, :map_rows
|
|
3422
|
-
|
|
3423
|
-
# Return a new DataFrame with the column added or replaced.
|
|
3424
|
-
#
|
|
3425
|
-
# @param column [Object]
|
|
3426
|
-
# Series, where the name of the Series refers to the column in the DataFrame.
|
|
3427
|
-
#
|
|
3428
|
-
# @return [DataFrame]
|
|
3429
|
-
#
|
|
3430
|
-
# @example Added
|
|
3431
|
-
# df = Polars::DataFrame.new(
|
|
3432
|
-
# {
|
|
3433
|
-
# "a" => [1, 3, 5],
|
|
3434
|
-
# "b" => [2, 4, 6]
|
|
3435
|
-
# }
|
|
3436
|
-
# )
|
|
3437
|
-
# df.with_column((Polars.col("b") ** 2).alias("b_squared"))
|
|
3438
|
-
# # =>
|
|
3439
|
-
# # shape: (3, 3)
|
|
3440
|
-
# # ┌─────┬─────┬───────────┐
|
|
3441
|
-
# # │ a ┆ b ┆ b_squared │
|
|
3442
|
-
# # │ --- ┆ --- ┆ --- │
|
|
3443
|
-
# # │ i64 ┆ i64 ┆ i64 │
|
|
3444
|
-
# # ╞═════╪═════╪═══════════╡
|
|
3445
|
-
# # │ 1 ┆ 2 ┆ 4 │
|
|
3446
|
-
# # │ 3 ┆ 4 ┆ 16 │
|
|
3447
|
-
# # │ 5 ┆ 6 ┆ 36 │
|
|
3448
|
-
# # └─────┴─────┴───────────┘
|
|
3449
|
-
#
|
|
3450
|
-
# @example Replaced
|
|
3451
|
-
# df.with_column(Polars.col("a") ** 2)
|
|
3452
|
-
# # =>
|
|
3453
|
-
# # shape: (3, 2)
|
|
3454
|
-
# # ┌─────┬─────┐
|
|
3455
|
-
# # │ a ┆ b │
|
|
3456
|
-
# # │ --- ┆ --- │
|
|
3457
|
-
# # │ i64 ┆ i64 │
|
|
3458
|
-
# # ╞═════╪═════╡
|
|
3459
|
-
# # │ 1 ┆ 2 │
|
|
3460
|
-
# # │ 9 ┆ 4 │
|
|
3461
|
-
# # │ 25 ┆ 6 │
|
|
3462
|
-
# # └─────┴─────┘
|
|
3463
|
-
def with_column(column)
|
|
3464
|
-
lazy
|
|
3465
|
-
.with_column(column)
|
|
3466
|
-
.collect(no_optimization: true, string_cache: false)
|
|
3467
|
-
end
|
|
3468
3717
|
|
|
3469
3718
|
# Return a new DataFrame grown horizontally by stacking multiple Series to it.
|
|
3470
3719
|
#
|
|
@@ -3510,7 +3759,7 @@ module Polars
|
|
|
3510
3759
|
|
|
3511
3760
|
# Grow this DataFrame vertically by stacking a DataFrame to it.
|
|
3512
3761
|
#
|
|
3513
|
-
# @param
|
|
3762
|
+
# @param other [DataFrame]
|
|
3514
3763
|
# DataFrame to stack.
|
|
3515
3764
|
# @param in_place [Boolean]
|
|
3516
3765
|
# Modify in place
|
|
@@ -3545,12 +3794,12 @@ module Polars
|
|
|
3545
3794
|
# # │ 3 ┆ 8 ┆ c │
|
|
3546
3795
|
# # │ 4 ┆ 9 ┆ d │
|
|
3547
3796
|
# # └─────┴─────┴─────┘
|
|
3548
|
-
def vstack(
|
|
3797
|
+
def vstack(other, in_place: false)
|
|
3549
3798
|
if in_place
|
|
3550
|
-
_df.vstack_mut(
|
|
3799
|
+
_df.vstack_mut(other._df)
|
|
3551
3800
|
self
|
|
3552
3801
|
else
|
|
3553
|
-
_from_rbdf(_df.vstack(
|
|
3802
|
+
_from_rbdf(_df.vstack(other._df))
|
|
3554
3803
|
end
|
|
3555
3804
|
end
|
|
3556
3805
|
|
|
@@ -3603,6 +3852,9 @@ module Polars
|
|
|
3603
3852
|
#
|
|
3604
3853
|
# @param columns [Object]
|
|
3605
3854
|
# Column(s) to drop.
|
|
3855
|
+
# @param strict [Boolean]
|
|
3856
|
+
# Validate that all column names exist in the current schema,
|
|
3857
|
+
# and throw an exception if any do not.
|
|
3606
3858
|
#
|
|
3607
3859
|
# @return [DataFrame]
|
|
3608
3860
|
#
|
|
@@ -3654,8 +3906,8 @@ module Polars
|
|
|
3654
3906
|
# # │ 7.0 │
|
|
3655
3907
|
# # │ 8.0 │
|
|
3656
3908
|
# # └─────┘
|
|
3657
|
-
def drop(*columns)
|
|
3658
|
-
lazy.drop(*columns).collect(
|
|
3909
|
+
def drop(*columns, strict: true)
|
|
3910
|
+
lazy.drop(*columns, strict: strict).collect(optimizations: QueryOptFlags._eager)
|
|
3659
3911
|
end
|
|
3660
3912
|
|
|
3661
3913
|
# Drop in place.
|
|
@@ -3768,7 +4020,7 @@ module Polars
|
|
|
3768
4020
|
# df.cast(Polars::String).to_h(as_series: false)
|
|
3769
4021
|
# # => {"foo"=>["1", "2", "3"], "bar"=>["6.0", "7.0", "8.0"], "ham"=>["2020-01-02", "2021-03-04", "2022-05-06"]}
|
|
3770
4022
|
def cast(dtypes, strict: true)
|
|
3771
|
-
lazy.cast(dtypes, strict: strict).collect(
|
|
4023
|
+
lazy.cast(dtypes, strict: strict).collect(optimizations: QueryOptFlags._eager)
|
|
3772
4024
|
end
|
|
3773
4025
|
|
|
3774
4026
|
# Create an empty copy of the current DataFrame.
|
|
@@ -3818,7 +4070,6 @@ module Polars
|
|
|
3818
4070
|
clone
|
|
3819
4071
|
end
|
|
3820
4072
|
end
|
|
3821
|
-
alias_method :cleared, :clear
|
|
3822
4073
|
|
|
3823
4074
|
# clone handled by initialize_copy
|
|
3824
4075
|
|
|
@@ -3880,10 +4131,13 @@ module Polars
|
|
|
3880
4131
|
_df.get_columns.map { |s| Utils.wrap_s(s) }
|
|
3881
4132
|
end
|
|
3882
4133
|
|
|
3883
|
-
# Get a single column
|
|
4134
|
+
# Get a single column by name.
|
|
3884
4135
|
#
|
|
3885
4136
|
# @param name [String]
|
|
3886
4137
|
# Name of the column to retrieve.
|
|
4138
|
+
# @param default [Object]
|
|
4139
|
+
# Value to return if the column does not exist; if not explicitly set and
|
|
4140
|
+
# the column is not present a `ColumnNotFoundError` exception is raised.
|
|
3887
4141
|
#
|
|
3888
4142
|
# @return [Series]
|
|
3889
4143
|
#
|
|
@@ -3898,8 +4152,22 @@ module Polars
|
|
|
3898
4152
|
# # 2
|
|
3899
4153
|
# # 3
|
|
3900
4154
|
# # ]
|
|
3901
|
-
|
|
3902
|
-
|
|
4155
|
+
#
|
|
4156
|
+
# @example
|
|
4157
|
+
# df.get_column("baz", default: Polars::Series.new("baz", ["?", "?", "?"]))
|
|
4158
|
+
# # =>
|
|
4159
|
+
# # shape: (3,)
|
|
4160
|
+
# # Series: 'baz' [str]
|
|
4161
|
+
# # [
|
|
4162
|
+
# # "?"
|
|
4163
|
+
# # "?"
|
|
4164
|
+
# # "?"
|
|
4165
|
+
# # ]
|
|
4166
|
+
def get_column(name, default: NO_DEFAULT)
|
|
4167
|
+
Utils.wrap_s(_df.get_column(name.to_s))
|
|
4168
|
+
rescue ColumnNotFoundError
|
|
4169
|
+
raise if default.eql?(NO_DEFAULT)
|
|
4170
|
+
default
|
|
3903
4171
|
end
|
|
3904
4172
|
|
|
3905
4173
|
# Fill null values using the specified value or strategy.
|
|
@@ -3985,14 +4253,14 @@ module Polars
|
|
|
3985
4253
|
_from_rbdf(
|
|
3986
4254
|
lazy
|
|
3987
4255
|
.fill_null(value, strategy: strategy, limit: limit, matches_supertype: matches_supertype)
|
|
3988
|
-
.collect(
|
|
4256
|
+
.collect(optimizations: QueryOptFlags._eager)
|
|
3989
4257
|
._df
|
|
3990
4258
|
)
|
|
3991
4259
|
end
|
|
3992
4260
|
|
|
3993
4261
|
# Fill floating point NaN values by an Expression evaluation.
|
|
3994
4262
|
#
|
|
3995
|
-
# @param
|
|
4263
|
+
# @param value [Object]
|
|
3996
4264
|
# Value to fill NaN with.
|
|
3997
4265
|
#
|
|
3998
4266
|
# @return [DataFrame]
|
|
@@ -4021,14 +4289,16 @@ module Polars
|
|
|
4021
4289
|
# # │ 99.0 ┆ 99.0 │
|
|
4022
4290
|
# # │ 4.0 ┆ 13.0 │
|
|
4023
4291
|
# # └──────┴──────┘
|
|
4024
|
-
def fill_nan(
|
|
4025
|
-
lazy.fill_nan(
|
|
4292
|
+
def fill_nan(value)
|
|
4293
|
+
lazy.fill_nan(value).collect(optimizations: QueryOptFlags._eager)
|
|
4026
4294
|
end
|
|
4027
4295
|
|
|
4028
4296
|
# Explode `DataFrame` to long format by exploding a column with Lists.
|
|
4029
4297
|
#
|
|
4030
4298
|
# @param columns [Object]
|
|
4031
4299
|
# Column of LargeList type.
|
|
4300
|
+
# @param more_columns [Array]
|
|
4301
|
+
# Additional names of columns to explode, specified as positional arguments.
|
|
4032
4302
|
#
|
|
4033
4303
|
# @return [DataFrame]
|
|
4034
4304
|
#
|
|
@@ -4056,8 +4326,8 @@ module Polars
|
|
|
4056
4326
|
# # │ c ┆ 7 │
|
|
4057
4327
|
# # │ c ┆ 8 │
|
|
4058
4328
|
# # └─────────┴─────────┘
|
|
4059
|
-
def explode(columns)
|
|
4060
|
-
lazy.explode(columns).collect(
|
|
4329
|
+
def explode(columns, *more_columns)
|
|
4330
|
+
lazy.explode(columns, *more_columns).collect(optimizations: QueryOptFlags._eager)
|
|
4061
4331
|
end
|
|
4062
4332
|
|
|
4063
4333
|
# Create a spreadsheet-style pivot table as a DataFrame.
|
|
@@ -4202,13 +4472,12 @@ module Polars
|
|
|
4202
4472
|
# # │ y ┆ c ┆ 4 │
|
|
4203
4473
|
# # │ z ┆ c ┆ 6 │
|
|
4204
4474
|
# # └─────┴──────────┴───────┘
|
|
4205
|
-
def unpivot(on, index: nil, variable_name: nil, value_name: nil)
|
|
4475
|
+
def unpivot(on = nil, index: nil, variable_name: nil, value_name: nil)
|
|
4206
4476
|
on = on.nil? ? [] : Utils._expand_selectors(self, on)
|
|
4207
4477
|
index = index.nil? ? [] : Utils._expand_selectors(self, index)
|
|
4208
4478
|
|
|
4209
4479
|
_from_rbdf(_df.unpivot(on, index, value_name, variable_name))
|
|
4210
4480
|
end
|
|
4211
|
-
alias_method :melt, :unpivot
|
|
4212
4481
|
|
|
4213
4482
|
# Unstack a long table to a wide form without doing an aggregation.
|
|
4214
4483
|
#
|
|
@@ -4313,7 +4582,7 @@ module Polars
|
|
|
4313
4582
|
|
|
4314
4583
|
if how == "horizontal"
|
|
4315
4584
|
df = (
|
|
4316
|
-
df.
|
|
4585
|
+
df.with_columns(
|
|
4317
4586
|
(Polars.arange(0, n_cols * n_rows, eager: true) % n_cols).alias(
|
|
4318
4587
|
"__sort_order"
|
|
4319
4588
|
)
|
|
@@ -4336,8 +4605,10 @@ module Polars
|
|
|
4336
4605
|
|
|
4337
4606
|
# Split into multiple DataFrames partitioned by groups.
|
|
4338
4607
|
#
|
|
4339
|
-
# @param
|
|
4608
|
+
# @param by [Object]
|
|
4340
4609
|
# Groups to partition by.
|
|
4610
|
+
# @param more_by [Array]
|
|
4611
|
+
# Additional names of columns to group by, specified as positional arguments.
|
|
4341
4612
|
# @param maintain_order [Boolean]
|
|
4342
4613
|
# Keep predictable output order. This is slower as it requires an extra sort
|
|
4343
4614
|
# operation.
|
|
@@ -4387,7 +4658,7 @@ module Polars
|
|
|
4387
4658
|
# @example
|
|
4388
4659
|
# df.partition_by("foo", maintain_order: true, as_dict: true)
|
|
4389
4660
|
# # =>
|
|
4390
|
-
# # {"A"=>shape: (2, 3)
|
|
4661
|
+
# # {["A"]=>shape: (2, 3)
|
|
4391
4662
|
# # ┌─────┬─────┬─────┐
|
|
4392
4663
|
# # │ foo ┆ N ┆ bar │
|
|
4393
4664
|
# # │ --- ┆ --- ┆ --- │
|
|
@@ -4395,7 +4666,7 @@ module Polars
|
|
|
4395
4666
|
# # ╞═════╪═════╪═════╡
|
|
4396
4667
|
# # │ A ┆ 1 ┆ k │
|
|
4397
4668
|
# # │ A ┆ 2 ┆ l │
|
|
4398
|
-
# # └─────┴─────┴─────┘, "B"=>shape: (2, 3)
|
|
4669
|
+
# # └─────┴─────┴─────┘, ["B"]=>shape: (2, 3)
|
|
4399
4670
|
# # ┌─────┬─────┬─────┐
|
|
4400
4671
|
# # │ foo ┆ N ┆ bar │
|
|
4401
4672
|
# # │ --- ┆ --- ┆ --- │
|
|
@@ -4403,7 +4674,7 @@ module Polars
|
|
|
4403
4674
|
# # ╞═════╪═════╪═════╡
|
|
4404
4675
|
# # │ B ┆ 2 ┆ m │
|
|
4405
4676
|
# # │ B ┆ 4 ┆ m │
|
|
4406
|
-
# # └─────┴─────┴─────┘, "C"=>shape: (1, 3)
|
|
4677
|
+
# # └─────┴─────┴─────┘, ["C"]=>shape: (1, 3)
|
|
4407
4678
|
# # ┌─────┬─────┬─────┐
|
|
4408
4679
|
# # │ foo ┆ N ┆ bar │
|
|
4409
4680
|
# # │ --- ┆ --- ┆ --- │
|
|
@@ -4411,30 +4682,26 @@ module Polars
|
|
|
4411
4682
|
# # ╞═════╪═════╪═════╡
|
|
4412
4683
|
# # │ C ┆ 2 ┆ l │
|
|
4413
4684
|
# # └─────┴─────┴─────┘}
|
|
4414
|
-
def partition_by(
|
|
4415
|
-
|
|
4416
|
-
|
|
4417
|
-
|
|
4418
|
-
groups = Array(groups)
|
|
4419
|
-
end
|
|
4685
|
+
def partition_by(by, *more_by, maintain_order: true, include_key: true, as_dict: false)
|
|
4686
|
+
by_parsed = Utils._expand_selectors(self, by, *more_by)
|
|
4687
|
+
|
|
4688
|
+
partitions = _df.partition_by(by_parsed, maintain_order, include_key).map { |df| _from_rbdf(df) }
|
|
4420
4689
|
|
|
4421
4690
|
if as_dict
|
|
4422
|
-
|
|
4423
|
-
|
|
4424
|
-
_df.partition_by(groups, maintain_order, include_key).each do |df|
|
|
4425
|
-
df = _from_rbdf(df)
|
|
4426
|
-
out[df[groups][0, 0]] = df
|
|
4427
|
-
end
|
|
4691
|
+
if include_key
|
|
4692
|
+
names = partitions.map { |p| p.select(by_parsed).row(0) }
|
|
4428
4693
|
else
|
|
4429
|
-
|
|
4430
|
-
|
|
4431
|
-
|
|
4694
|
+
if !maintain_order
|
|
4695
|
+
msg = "cannot use `partition_by` with `maintain_order: false, include_key: false, as_dict: true`"
|
|
4696
|
+
raise ArgumentError, msg
|
|
4432
4697
|
end
|
|
4698
|
+
names = select(by_parsed).unique(maintain_order: true).rows
|
|
4433
4699
|
end
|
|
4434
|
-
|
|
4435
|
-
|
|
4436
|
-
_df.partition_by(groups, maintain_order, include_key).map { |df| _from_rbdf(df) }
|
|
4700
|
+
|
|
4701
|
+
return names.zip(partitions).to_h
|
|
4437
4702
|
end
|
|
4703
|
+
|
|
4704
|
+
partitions
|
|
4438
4705
|
end
|
|
4439
4706
|
|
|
4440
4707
|
# Shift values by the given period.
|
|
@@ -4480,41 +4747,8 @@ module Polars
|
|
|
4480
4747
|
# # │ 3 ┆ 8 ┆ c │
|
|
4481
4748
|
# # │ null ┆ null ┆ null │
|
|
4482
4749
|
# # └──────┴──────┴──────┘
|
|
4483
|
-
def shift(n, fill_value: nil)
|
|
4484
|
-
lazy.shift(n, fill_value: fill_value).collect(
|
|
4485
|
-
end
|
|
4486
|
-
|
|
4487
|
-
# Shift the values by a given period and fill the resulting null values.
|
|
4488
|
-
#
|
|
4489
|
-
# @param periods [Integer]
|
|
4490
|
-
# Number of places to shift (may be negative).
|
|
4491
|
-
# @param fill_value [Object]
|
|
4492
|
-
# fill nil values with this value.
|
|
4493
|
-
#
|
|
4494
|
-
# @return [DataFrame]
|
|
4495
|
-
#
|
|
4496
|
-
# @example
|
|
4497
|
-
# df = Polars::DataFrame.new(
|
|
4498
|
-
# {
|
|
4499
|
-
# "foo" => [1, 2, 3],
|
|
4500
|
-
# "bar" => [6, 7, 8],
|
|
4501
|
-
# "ham" => ["a", "b", "c"]
|
|
4502
|
-
# }
|
|
4503
|
-
# )
|
|
4504
|
-
# df.shift_and_fill(1, 0)
|
|
4505
|
-
# # =>
|
|
4506
|
-
# # shape: (3, 3)
|
|
4507
|
-
# # ┌─────┬─────┬─────┐
|
|
4508
|
-
# # │ foo ┆ bar ┆ ham │
|
|
4509
|
-
# # │ --- ┆ --- ┆ --- │
|
|
4510
|
-
# # │ i64 ┆ i64 ┆ str │
|
|
4511
|
-
# # ╞═════╪═════╪═════╡
|
|
4512
|
-
# # │ 0 ┆ 0 ┆ 0 │
|
|
4513
|
-
# # │ 1 ┆ 6 ┆ a │
|
|
4514
|
-
# # │ 2 ┆ 7 ┆ b │
|
|
4515
|
-
# # └─────┴─────┴─────┘
|
|
4516
|
-
def shift_and_fill(periods, fill_value)
|
|
4517
|
-
shift(periods, fill_value: fill_value)
|
|
4750
|
+
def shift(n = 1, fill_value: nil)
|
|
4751
|
+
lazy.shift(n, fill_value: fill_value).collect(optimizations: QueryOptFlags._eager)
|
|
4518
4752
|
end
|
|
4519
4753
|
|
|
4520
4754
|
# Get a mask of all duplicated rows in this DataFrame.
|
|
@@ -4570,6 +4804,16 @@ module Polars
|
|
|
4570
4804
|
# Start a lazy query from this point.
|
|
4571
4805
|
#
|
|
4572
4806
|
# @return [LazyFrame]
|
|
4807
|
+
#
|
|
4808
|
+
# @example
|
|
4809
|
+
# df = Polars::DataFrame.new(
|
|
4810
|
+
# {
|
|
4811
|
+
# "a" => [nil, 2, 3, 4],
|
|
4812
|
+
# "b" => [0.5, nil, 2.5, 13],
|
|
4813
|
+
# "c" => [true, true, false, nil]
|
|
4814
|
+
# }
|
|
4815
|
+
# )
|
|
4816
|
+
# df.lazy
|
|
4573
4817
|
def lazy
|
|
4574
4818
|
wrap_ldf(_df.lazy)
|
|
4575
4819
|
end
|
|
@@ -4663,7 +4907,7 @@ module Polars
|
|
|
4663
4907
|
# # │ 10 │
|
|
4664
4908
|
# # └─────────┘
|
|
4665
4909
|
def select(*exprs, **named_exprs)
|
|
4666
|
-
lazy.select(*exprs, **named_exprs).collect(
|
|
4910
|
+
lazy.select(*exprs, **named_exprs).collect(optimizations: QueryOptFlags._eager)
|
|
4667
4911
|
end
|
|
4668
4912
|
|
|
4669
4913
|
# Select columns from this DataFrame.
|
|
@@ -4683,7 +4927,7 @@ module Polars
|
|
|
4683
4927
|
def select_seq(*exprs, **named_exprs)
|
|
4684
4928
|
lazy
|
|
4685
4929
|
.select_seq(*exprs, **named_exprs)
|
|
4686
|
-
.collect(
|
|
4930
|
+
.collect(optimizations: QueryOptFlags._eager)
|
|
4687
4931
|
end
|
|
4688
4932
|
|
|
4689
4933
|
# Add columns to this DataFrame.
|
|
@@ -4795,7 +5039,7 @@ module Polars
|
|
|
4795
5039
|
# # │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │
|
|
4796
5040
|
# # └─────┴──────┴───────┴──────┴───────┘
|
|
4797
5041
|
def with_columns(*exprs, **named_exprs)
|
|
4798
|
-
lazy.with_columns(*exprs, **named_exprs).collect(
|
|
5042
|
+
lazy.with_columns(*exprs, **named_exprs).collect(optimizations: QueryOptFlags._eager)
|
|
4799
5043
|
end
|
|
4800
5044
|
|
|
4801
5045
|
# Add columns to this DataFrame.
|
|
@@ -4820,7 +5064,7 @@ module Polars
|
|
|
4820
5064
|
)
|
|
4821
5065
|
lazy
|
|
4822
5066
|
.with_columns_seq(*exprs, **named_exprs)
|
|
4823
|
-
.collect(
|
|
5067
|
+
.collect(optimizations: QueryOptFlags._eager)
|
|
4824
5068
|
end
|
|
4825
5069
|
|
|
4826
5070
|
# Get number of chunks used by the ChunkedArrays of this DataFrame.
|
|
@@ -4876,7 +5120,7 @@ module Polars
|
|
|
4876
5120
|
# # │ 3 ┆ 8 ┆ c │
|
|
4877
5121
|
# # └─────┴─────┴─────┘
|
|
4878
5122
|
def max
|
|
4879
|
-
lazy.max.collect(
|
|
5123
|
+
lazy.max.collect(optimizations: QueryOptFlags._eager)
|
|
4880
5124
|
end
|
|
4881
5125
|
|
|
4882
5126
|
# Get the maximum value horizontally across columns.
|
|
@@ -4926,7 +5170,7 @@ module Polars
|
|
|
4926
5170
|
# # │ 1 ┆ 6 ┆ a │
|
|
4927
5171
|
# # └─────┴─────┴─────┘
|
|
4928
5172
|
def min
|
|
4929
|
-
lazy.min.collect(
|
|
5173
|
+
lazy.min.collect(optimizations: QueryOptFlags._eager)
|
|
4930
5174
|
end
|
|
4931
5175
|
|
|
4932
5176
|
# Get the minimum value horizontally across columns.
|
|
@@ -4976,7 +5220,7 @@ module Polars
|
|
|
4976
5220
|
# # │ 6 ┆ 21 ┆ null │
|
|
4977
5221
|
# # └─────┴─────┴──────┘
|
|
4978
5222
|
def sum
|
|
4979
|
-
lazy.sum.collect(
|
|
5223
|
+
lazy.sum.collect(optimizations: QueryOptFlags._eager)
|
|
4980
5224
|
end
|
|
4981
5225
|
|
|
4982
5226
|
# Sum all values horizontally across columns.
|
|
@@ -5032,7 +5276,7 @@ module Polars
|
|
|
5032
5276
|
# # │ 2.0 ┆ 7.0 ┆ null │
|
|
5033
5277
|
# # └─────┴─────┴──────┘
|
|
5034
5278
|
def mean
|
|
5035
|
-
lazy.mean.collect(
|
|
5279
|
+
lazy.mean.collect(optimizations: QueryOptFlags._eager)
|
|
5036
5280
|
end
|
|
5037
5281
|
|
|
5038
5282
|
# Take the mean of all values horizontally across columns.
|
|
@@ -5103,7 +5347,7 @@ module Polars
|
|
|
5103
5347
|
# # │ 0.816497 ┆ 0.816497 ┆ null │
|
|
5104
5348
|
# # └──────────┴──────────┴──────┘
|
|
5105
5349
|
def std(ddof: 1)
|
|
5106
|
-
lazy.std(ddof: ddof).collect(
|
|
5350
|
+
lazy.std(ddof: ddof).collect(optimizations: QueryOptFlags._eager)
|
|
5107
5351
|
end
|
|
5108
5352
|
|
|
5109
5353
|
# Aggregate the columns of this DataFrame to their variance value.
|
|
@@ -5144,7 +5388,7 @@ module Polars
|
|
|
5144
5388
|
# # │ 0.666667 ┆ 0.666667 ┆ null │
|
|
5145
5389
|
# # └──────────┴──────────┴──────┘
|
|
5146
5390
|
def var(ddof: 1)
|
|
5147
|
-
lazy.var(ddof: ddof).collect(
|
|
5391
|
+
lazy.var(ddof: ddof).collect(optimizations: QueryOptFlags._eager)
|
|
5148
5392
|
end
|
|
5149
5393
|
|
|
5150
5394
|
# Aggregate the columns of this DataFrame to their median value.
|
|
@@ -5170,7 +5414,7 @@ module Polars
|
|
|
5170
5414
|
# # │ 2.0 ┆ 7.0 ┆ null │
|
|
5171
5415
|
# # └─────┴─────┴──────┘
|
|
5172
5416
|
def median
|
|
5173
|
-
lazy.median.collect(
|
|
5417
|
+
lazy.median.collect(optimizations: QueryOptFlags._eager)
|
|
5174
5418
|
end
|
|
5175
5419
|
|
|
5176
5420
|
# Aggregate the columns of this DataFrame to their product values.
|
|
@@ -5227,7 +5471,7 @@ module Polars
|
|
|
5227
5471
|
# # │ 2.0 ┆ 7.0 ┆ null │
|
|
5228
5472
|
# # └─────┴─────┴──────┘
|
|
5229
5473
|
def quantile(quantile, interpolation: "nearest")
|
|
5230
|
-
lazy.quantile(quantile, interpolation: interpolation).collect(
|
|
5474
|
+
lazy.quantile(quantile, interpolation: interpolation).collect(optimizations: QueryOptFlags._eager)
|
|
5231
5475
|
end
|
|
5232
5476
|
|
|
5233
5477
|
# Get one hot encoded dummy variables.
|
|
@@ -5294,7 +5538,7 @@ module Polars
|
|
|
5294
5538
|
# "c" => [true, true, true, false, true, true]
|
|
5295
5539
|
# }
|
|
5296
5540
|
# )
|
|
5297
|
-
# df.unique
|
|
5541
|
+
# df.unique(maintain_order: true)
|
|
5298
5542
|
# # =>
|
|
5299
5543
|
# # shape: (5, 3)
|
|
5300
5544
|
# # ┌─────┬─────┬───────┐
|
|
@@ -5308,11 +5552,11 @@ module Polars
|
|
|
5308
5552
|
# # │ 4 ┆ 3.0 ┆ true │
|
|
5309
5553
|
# # │ 5 ┆ 3.0 ┆ true │
|
|
5310
5554
|
# # └─────┴─────┴───────┘
|
|
5311
|
-
def unique(maintain_order:
|
|
5555
|
+
def unique(maintain_order: false, subset: nil, keep: "any")
|
|
5312
5556
|
self._from_rbdf(
|
|
5313
5557
|
lazy
|
|
5314
5558
|
.unique(maintain_order: maintain_order, subset: subset, keep: keep)
|
|
5315
|
-
.collect(
|
|
5559
|
+
.collect(optimizations: QueryOptFlags._eager)
|
|
5316
5560
|
._df
|
|
5317
5561
|
)
|
|
5318
5562
|
end
|
|
@@ -5405,9 +5649,9 @@ module Polars
|
|
|
5405
5649
|
# Sample from this DataFrame.
|
|
5406
5650
|
#
|
|
5407
5651
|
# @param n [Integer]
|
|
5408
|
-
# Number of items to return. Cannot be used with `
|
|
5409
|
-
# `
|
|
5410
|
-
# @param
|
|
5652
|
+
# Number of items to return. Cannot be used with `fraction`. Defaults to 1 if
|
|
5653
|
+
# `fraction` is nil.
|
|
5654
|
+
# @param fraction [Float]
|
|
5411
5655
|
# Fraction of items to return. Cannot be used with `n`.
|
|
5412
5656
|
# @param with_replacement [Boolean]
|
|
5413
5657
|
# Allow values to be sampled more than once.
|
|
@@ -5440,20 +5684,20 @@ module Polars
|
|
|
5440
5684
|
# # └─────┴─────┴─────┘
|
|
5441
5685
|
def sample(
|
|
5442
5686
|
n: nil,
|
|
5443
|
-
|
|
5687
|
+
fraction: nil,
|
|
5444
5688
|
with_replacement: false,
|
|
5445
5689
|
shuffle: false,
|
|
5446
5690
|
seed: nil
|
|
5447
5691
|
)
|
|
5448
|
-
if !n.nil? && !
|
|
5449
|
-
raise ArgumentError, "cannot specify both `n` and `
|
|
5692
|
+
if !n.nil? && !fraction.nil?
|
|
5693
|
+
raise ArgumentError, "cannot specify both `n` and `fraction`"
|
|
5450
5694
|
end
|
|
5451
5695
|
|
|
5452
|
-
if n.nil? && !
|
|
5453
|
-
|
|
5696
|
+
if n.nil? && !fraction.nil?
|
|
5697
|
+
fraction = Series.new("fraction", [fraction]) unless fraction.is_a?(Series)
|
|
5454
5698
|
|
|
5455
5699
|
return _from_rbdf(
|
|
5456
|
-
_df.sample_frac(
|
|
5700
|
+
_df.sample_frac(fraction._s, with_replacement, shuffle, seed)
|
|
5457
5701
|
)
|
|
5458
5702
|
end
|
|
5459
5703
|
|
|
@@ -5725,7 +5969,7 @@ module Polars
|
|
|
5725
5969
|
if include_key
|
|
5726
5970
|
values = self
|
|
5727
5971
|
else
|
|
5728
|
-
data_cols = schema.
|
|
5972
|
+
data_cols = schema.names - key
|
|
5729
5973
|
values = select(data_cols)
|
|
5730
5974
|
end
|
|
5731
5975
|
|
|
@@ -5768,7 +6012,7 @@ module Polars
|
|
|
5768
6012
|
# @example
|
|
5769
6013
|
# df.iter_rows(named: true).map { |row| row["b"] }
|
|
5770
6014
|
# # => [2, 4, 6]
|
|
5771
|
-
def iter_rows(named: false, buffer_size:
|
|
6015
|
+
def iter_rows(named: false, buffer_size: 512, &block)
|
|
5772
6016
|
return to_enum(:iter_rows, named: named, buffer_size: buffer_size) unless block_given?
|
|
5773
6017
|
|
|
5774
6018
|
# load into the local namespace for a modest performance boost in the hot loops
|
|
@@ -5939,11 +6183,10 @@ module Polars
|
|
|
5939
6183
|
def gather_every(n, offset = 0)
|
|
5940
6184
|
select(F.col("*").gather_every(n, offset))
|
|
5941
6185
|
end
|
|
5942
|
-
alias_method :take_every, :gather_every
|
|
5943
6186
|
|
|
5944
6187
|
# Hash and combine the rows in this DataFrame.
|
|
5945
6188
|
#
|
|
5946
|
-
# The hash value is of type
|
|
6189
|
+
# The hash value is of type `UInt64`.
|
|
5947
6190
|
#
|
|
5948
6191
|
# @param seed [Integer]
|
|
5949
6192
|
# Random seed parameter. Defaults to 0.
|
|
@@ -6050,7 +6293,7 @@ module Polars
|
|
|
6050
6293
|
# # {4,"four"}
|
|
6051
6294
|
# # {5,"five"}
|
|
6052
6295
|
# # ]
|
|
6053
|
-
def to_struct(name)
|
|
6296
|
+
def to_struct(name = "")
|
|
6054
6297
|
Utils.wrap_s(_df.to_struct(name))
|
|
6055
6298
|
end
|
|
6056
6299
|
|
|
@@ -6092,7 +6335,7 @@ module Polars
|
|
|
6092
6335
|
# # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
|
|
6093
6336
|
# # └────────┴─────┴─────┴──────┴───────────┴───────┘
|
|
6094
6337
|
def unnest(columns, *more_columns, separator: nil)
|
|
6095
|
-
lazy.unnest(columns, *more_columns, separator: separator).collect(
|
|
6338
|
+
lazy.unnest(columns, *more_columns, separator: separator).collect(optimizations: QueryOptFlags._eager)
|
|
6096
6339
|
end
|
|
6097
6340
|
|
|
6098
6341
|
# Requires NumPy
|
|
@@ -6138,7 +6381,7 @@ module Polars
|
|
|
6138
6381
|
# # │ elise ┆ 44 │
|
|
6139
6382
|
# # └────────┴─────┘
|
|
6140
6383
|
def merge_sorted(other, key)
|
|
6141
|
-
lazy.merge_sorted(other.lazy, key).collect(
|
|
6384
|
+
lazy.merge_sorted(other.lazy, key).collect(optimizations: QueryOptFlags._eager)
|
|
6142
6385
|
end
|
|
6143
6386
|
|
|
6144
6387
|
# Flag a column as sorted.
|
|
@@ -6160,7 +6403,7 @@ module Polars
|
|
|
6160
6403
|
)
|
|
6161
6404
|
lazy
|
|
6162
6405
|
.set_sorted(column, descending: descending)
|
|
6163
|
-
.collect(
|
|
6406
|
+
.collect(optimizations: QueryOptFlags._eager)
|
|
6164
6407
|
end
|
|
6165
6408
|
|
|
6166
6409
|
# Update the values in this `DataFrame` with the values in `other`.
|
|
@@ -6291,7 +6534,7 @@ module Polars
|
|
|
6291
6534
|
include_nulls: include_nulls,
|
|
6292
6535
|
maintain_order: maintain_order
|
|
6293
6536
|
)
|
|
6294
|
-
.collect(
|
|
6537
|
+
.collect(optimizations: QueryOptFlags._eager)
|
|
6295
6538
|
end
|
|
6296
6539
|
|
|
6297
6540
|
private
|
|
@@ -6357,282 +6600,6 @@ module Polars
|
|
|
6357
6600
|
raise ArgumentError, "Unsupported idxs datatype."
|
|
6358
6601
|
end
|
|
6359
6602
|
|
|
6360
|
-
# @private
|
|
6361
|
-
def self.expand_hash_scalars(data, schema_overrides: nil, strict: true, order: nil, nan_to_null: false)
|
|
6362
|
-
updated_data = {}
|
|
6363
|
-
unless data.empty?
|
|
6364
|
-
dtypes = schema_overrides || {}
|
|
6365
|
-
array_len = data.values.map { |val| Utils.arrlen(val) || 0 }.max
|
|
6366
|
-
if array_len > 0
|
|
6367
|
-
data.each do |name, val|
|
|
6368
|
-
dtype = dtypes[name]
|
|
6369
|
-
if val.is_a?(Hash) && dtype != Struct
|
|
6370
|
-
updated_data[name] = DataFrame.new(val, strict: strict).to_struct(name)
|
|
6371
|
-
elsif !Utils.arrlen(val).nil?
|
|
6372
|
-
updated_data[name] = Series.new(::String.new(name), val, dtype: dtype, strict: strict)
|
|
6373
|
-
elsif val.nil? || [Integer, Float, TrueClass, FalseClass, ::String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
|
|
6374
|
-
dtype = Polars::Float64 if val.nil? && dtype.nil?
|
|
6375
|
-
updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype, strict: strict).extend_constant(val, array_len - 1)
|
|
6376
|
-
else
|
|
6377
|
-
raise Todo
|
|
6378
|
-
end
|
|
6379
|
-
end
|
|
6380
|
-
elsif data.values.all? { |val| Utils.arrlen(val) == 0 }
|
|
6381
|
-
data.each do |name, val|
|
|
6382
|
-
updated_data[name] = Series.new(name, val, dtype: dtypes[name], strict: strict)
|
|
6383
|
-
end
|
|
6384
|
-
elsif data.values.all? { |val| Utils.arrlen(val).nil? }
|
|
6385
|
-
data.each do |name, val|
|
|
6386
|
-
updated_data[name] = Series.new(name, [val], dtype: dtypes[name], strict: strict)
|
|
6387
|
-
end
|
|
6388
|
-
end
|
|
6389
|
-
end
|
|
6390
|
-
updated_data
|
|
6391
|
-
end
|
|
6392
|
-
|
|
6393
|
-
# @private
|
|
6394
|
-
def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, nan_to_null: nil)
|
|
6395
|
-
if schema.is_a?(Hash) && !data.empty?
|
|
6396
|
-
if !data.all? { |col, _| schema[col] }
|
|
6397
|
-
raise ArgumentError, "The given column-schema names do not match the data dictionary"
|
|
6398
|
-
end
|
|
6399
|
-
|
|
6400
|
-
data = schema.to_h { |col| [col, data[col]] }
|
|
6401
|
-
end
|
|
6402
|
-
|
|
6403
|
-
column_names, schema_overrides = _unpack_schema(
|
|
6404
|
-
schema, lookup_names: data.keys, schema_overrides: schema_overrides
|
|
6405
|
-
)
|
|
6406
|
-
if column_names.empty?
|
|
6407
|
-
column_names = data.keys
|
|
6408
|
-
end
|
|
6409
|
-
|
|
6410
|
-
if data.empty? && !schema_overrides.empty?
|
|
6411
|
-
data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], strict: strict, nan_to_null: nan_to_null)._s }
|
|
6412
|
-
else
|
|
6413
|
-
data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null).values.map(&:_s)
|
|
6414
|
-
end
|
|
6415
|
-
|
|
6416
|
-
data_series = _handle_columns_arg(data_series, columns: column_names, from_hash: true)
|
|
6417
|
-
RbDataFrame.new(data_series)
|
|
6418
|
-
end
|
|
6419
|
-
|
|
6420
|
-
# @private
|
|
6421
|
-
def self.include_unknowns(schema, cols)
|
|
6422
|
-
cols.to_h { |col| [col, schema.fetch(col, Unknown)] }
|
|
6423
|
-
end
|
|
6424
|
-
|
|
6425
|
-
# @private
|
|
6426
|
-
def self._unpack_schema(schema, schema_overrides: nil, n_expected: nil, lookup_names: nil, include_overrides_in_columns: false)
|
|
6427
|
-
if schema.is_a?(Hash)
|
|
6428
|
-
schema = schema.to_a
|
|
6429
|
-
end
|
|
6430
|
-
column_names =
|
|
6431
|
-
(schema || []).map.with_index do |col, i|
|
|
6432
|
-
if col.is_a?(::String)
|
|
6433
|
-
col || "column_#{i}"
|
|
6434
|
-
else
|
|
6435
|
-
col[0]
|
|
6436
|
-
end
|
|
6437
|
-
end
|
|
6438
|
-
if column_names.empty? && n_expected
|
|
6439
|
-
column_names = n_expected.times.map { |i| "column_#{i}" }
|
|
6440
|
-
end
|
|
6441
|
-
# TODO zip_longest
|
|
6442
|
-
lookup = column_names.zip(lookup_names || []).to_h
|
|
6443
|
-
|
|
6444
|
-
column_dtypes =
|
|
6445
|
-
(schema || []).select { |col| !col.is_a?(::String) && col[1] }.to_h do |col|
|
|
6446
|
-
[lookup[col[0]] || col[0], col[1]]
|
|
6447
|
-
end
|
|
6448
|
-
|
|
6449
|
-
if schema_overrides && schema_overrides.any?
|
|
6450
|
-
column_dtypes.merge!(schema_overrides)
|
|
6451
|
-
end
|
|
6452
|
-
|
|
6453
|
-
column_dtypes.each do |col, dtype|
|
|
6454
|
-
if !Utils.is_polars_dtype(dtype, include_unknown: true) && !dtype.nil?
|
|
6455
|
-
column_dtypes[col] = Utils.rb_type_to_dtype(dtype)
|
|
6456
|
-
end
|
|
6457
|
-
end
|
|
6458
|
-
|
|
6459
|
-
[column_names, column_dtypes]
|
|
6460
|
-
end
|
|
6461
|
-
|
|
6462
|
-
def self._handle_columns_arg(data, columns: nil, from_hash: false)
|
|
6463
|
-
if columns.nil? || columns.empty?
|
|
6464
|
-
data
|
|
6465
|
-
else
|
|
6466
|
-
if data.empty?
|
|
6467
|
-
columns.map { |c| Series.new(c, nil)._s }
|
|
6468
|
-
elsif data.length == columns.length
|
|
6469
|
-
if from_hash
|
|
6470
|
-
series_map = data.to_h { |s| [s.name, s] }
|
|
6471
|
-
if columns.all? { |col| series_map.key?(col) }
|
|
6472
|
-
return columns.map { |col| series_map[col] }
|
|
6473
|
-
end
|
|
6474
|
-
end
|
|
6475
|
-
|
|
6476
|
-
columns.each_with_index do |c, i|
|
|
6477
|
-
# not in-place?
|
|
6478
|
-
data[i].rename(c)
|
|
6479
|
-
end
|
|
6480
|
-
data
|
|
6481
|
-
else
|
|
6482
|
-
raise ArgumentError, "Dimensions of columns arg must match data dimensions."
|
|
6483
|
-
end
|
|
6484
|
-
end
|
|
6485
|
-
end
|
|
6486
|
-
|
|
6487
|
-
def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil, strict: true)
|
|
6488
|
-
rbdf_columns = rbdf.columns
|
|
6489
|
-
rbdf_dtypes = rbdf.dtypes
|
|
6490
|
-
columns, dtypes = _unpack_schema(
|
|
6491
|
-
(columns || rbdf_columns), schema_overrides: schema_overrides
|
|
6492
|
-
)
|
|
6493
|
-
column_subset = []
|
|
6494
|
-
if columns != rbdf_columns
|
|
6495
|
-
if columns.length < rbdf_columns.length && columns == rbdf_columns.first(columns.length)
|
|
6496
|
-
column_subset = columns
|
|
6497
|
-
else
|
|
6498
|
-
rbdf.set_column_names(columns)
|
|
6499
|
-
end
|
|
6500
|
-
end
|
|
6501
|
-
|
|
6502
|
-
column_casts = []
|
|
6503
|
-
columns.each_with_index do |col, i|
|
|
6504
|
-
if dtypes[col] == Categorical # != rbdf_dtypes[i]
|
|
6505
|
-
column_casts << Polars.col(col).cast(Categorical, strict: strict)._rbexpr
|
|
6506
|
-
elsif structs&.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
|
|
6507
|
-
column_casts << Polars.col(col).cast(structs[col], strict: strict)._rbexpr
|
|
6508
|
-
elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
|
|
6509
|
-
column_casts << Polars.col(col).cast(dtypes[col], strict: strict)._rbexpr
|
|
6510
|
-
end
|
|
6511
|
-
end
|
|
6512
|
-
|
|
6513
|
-
if column_casts.any? || column_subset.any?
|
|
6514
|
-
rbdf = rbdf.lazy
|
|
6515
|
-
if column_casts.any?
|
|
6516
|
-
rbdf = rbdf.with_columns(column_casts)
|
|
6517
|
-
end
|
|
6518
|
-
if column_subset.any?
|
|
6519
|
-
rbdf = rbdf.select(column_subset.map { |col| Polars.col(col)._rbexpr })
|
|
6520
|
-
end
|
|
6521
|
-
rbdf = rbdf.collect
|
|
6522
|
-
end
|
|
6523
|
-
|
|
6524
|
-
rbdf
|
|
6525
|
-
end
|
|
6526
|
-
|
|
6527
|
-
# @private
|
|
6528
|
-
def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: 50)
|
|
6529
|
-
columns = schema
|
|
6530
|
-
|
|
6531
|
-
if data.length == 0
|
|
6532
|
-
return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides, strict: strict)
|
|
6533
|
-
end
|
|
6534
|
-
|
|
6535
|
-
if data[0].is_a?(Series)
|
|
6536
|
-
# series_names = data.map(&:name)
|
|
6537
|
-
# columns, dtypes = _unpack_schema(columns || series_names, n_expected: data.length)
|
|
6538
|
-
data_series = []
|
|
6539
|
-
data.each do |s|
|
|
6540
|
-
data_series << s._s
|
|
6541
|
-
end
|
|
6542
|
-
elsif data[0].is_a?(Hash)
|
|
6543
|
-
column_names, dtypes = _unpack_schema(columns)
|
|
6544
|
-
schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
|
|
6545
|
-
rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, strict, infer_schema_length)
|
|
6546
|
-
if column_names
|
|
6547
|
-
rbdf = _post_apply_columns(rbdf, column_names)
|
|
6548
|
-
end
|
|
6549
|
-
return rbdf
|
|
6550
|
-
elsif data[0].is_a?(::Array)
|
|
6551
|
-
first_element = data[0]
|
|
6552
|
-
if orient.nil? && !columns.nil?
|
|
6553
|
-
row_types = first_element.filter_map { |value| value.class }.uniq
|
|
6554
|
-
if row_types.include?(Integer) && row_types.include?(Float)
|
|
6555
|
-
row_types.delete(Integer)
|
|
6556
|
-
end
|
|
6557
|
-
orient = row_types.length == 1 ? "col" : "row"
|
|
6558
|
-
end
|
|
6559
|
-
|
|
6560
|
-
if orient == "row"
|
|
6561
|
-
column_names, schema_overrides = _unpack_schema(
|
|
6562
|
-
schema, schema_overrides: schema_overrides, n_expected: first_element.length
|
|
6563
|
-
)
|
|
6564
|
-
local_schema_override = (
|
|
6565
|
-
schema_overrides.any? ? _include_unknowns(schema_overrides, column_names) : {}
|
|
6566
|
-
)
|
|
6567
|
-
if column_names.any? && first_element.length > 0 && first_element.length != column_names.length
|
|
6568
|
-
raise ArgumentError, "the row data does not match the number of columns"
|
|
6569
|
-
end
|
|
6570
|
-
|
|
6571
|
-
unpack_nested = false
|
|
6572
|
-
local_schema_override.each do |col, tp|
|
|
6573
|
-
if [Categorical, Enum].include?(tp)
|
|
6574
|
-
local_schema_override[col] = String
|
|
6575
|
-
elsif !unpack_nested && [Unknown, Struct].include?(tp.base_type)
|
|
6576
|
-
raise Todo
|
|
6577
|
-
end
|
|
6578
|
-
end
|
|
6579
|
-
|
|
6580
|
-
if unpack_nested
|
|
6581
|
-
raise Todo
|
|
6582
|
-
else
|
|
6583
|
-
rbdf = RbDataFrame.from_rows(
|
|
6584
|
-
data,
|
|
6585
|
-
infer_schema_length,
|
|
6586
|
-
local_schema_override.any? ? local_schema_override : nil
|
|
6587
|
-
)
|
|
6588
|
-
end
|
|
6589
|
-
if column_names.any? || schema_overrides.any?
|
|
6590
|
-
rbdf = _post_apply_columns(
|
|
6591
|
-
rbdf, column_names, schema_overrides: schema_overrides, strict: strict
|
|
6592
|
-
)
|
|
6593
|
-
end
|
|
6594
|
-
return rbdf
|
|
6595
|
-
elsif orient == "col" || orient.nil?
|
|
6596
|
-
column_names, schema_overrides = _unpack_schema(
|
|
6597
|
-
schema, schema_overrides: schema_overrides, n_expected: data.length
|
|
6598
|
-
)
|
|
6599
|
-
data_series =
|
|
6600
|
-
data.map.with_index do |element, i|
|
|
6601
|
-
Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]], strict: strict)._s
|
|
6602
|
-
end
|
|
6603
|
-
return RbDataFrame.new(data_series)
|
|
6604
|
-
else
|
|
6605
|
-
raise ArgumentError, "orient must be one of {{'col', 'row', nil}}, got #{orient} instead."
|
|
6606
|
-
end
|
|
6607
|
-
end
|
|
6608
|
-
|
|
6609
|
-
data_series = _handle_columns_arg(data_series, columns: columns)
|
|
6610
|
-
RbDataFrame.new(data_series)
|
|
6611
|
-
end
|
|
6612
|
-
|
|
6613
|
-
# @private
|
|
6614
|
-
def self._include_unknowns(schema, cols)
|
|
6615
|
-
cols.to_h { |col| [col, schema[col] || Unknown] }
|
|
6616
|
-
end
|
|
6617
|
-
|
|
6618
|
-
# @private
|
|
6619
|
-
def self.series_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true)
|
|
6620
|
-
data_series = [data._s]
|
|
6621
|
-
series_name = data_series.map(&:name)
|
|
6622
|
-
column_names, schema_overrides = _unpack_schema(
|
|
6623
|
-
schema || series_name, schema_overrides: schema_overrides, n_expected: 1
|
|
6624
|
-
)
|
|
6625
|
-
if schema_overrides.any?
|
|
6626
|
-
new_dtype = schema_overrides.values[0]
|
|
6627
|
-
if new_dtype != data.dtype
|
|
6628
|
-
data_series[0] = data_series[0].cast(new_dtype, strict)
|
|
6629
|
-
end
|
|
6630
|
-
end
|
|
6631
|
-
|
|
6632
|
-
data_series = _handle_columns_arg(data_series, columns: column_names)
|
|
6633
|
-
RbDataFrame.new(data_series)
|
|
6634
|
-
end
|
|
6635
|
-
|
|
6636
6603
|
def wrap_ldf(ldf)
|
|
6637
6604
|
LazyFrame._from_rbldf(ldf)
|
|
6638
6605
|
end
|
|
@@ -6641,6 +6608,11 @@ module Polars
|
|
|
6641
6608
|
self.class._from_rbdf(rb_df)
|
|
6642
6609
|
end
|
|
6643
6610
|
|
|
6611
|
+
def _replace(column, new_column)
|
|
6612
|
+
self._df.replace(column, new_column._s)
|
|
6613
|
+
self
|
|
6614
|
+
end
|
|
6615
|
+
|
|
6644
6616
|
def _comp(other, op)
|
|
6645
6617
|
if other.is_a?(DataFrame)
|
|
6646
6618
|
_compare_to_other_df(other, op)
|
|
@@ -6658,7 +6630,7 @@ module Polars
|
|
|
6658
6630
|
end
|
|
6659
6631
|
|
|
6660
6632
|
suffix = "__POLARS_CMP_OTHER"
|
|
6661
|
-
other_renamed = other.select(Polars.all.suffix(suffix))
|
|
6633
|
+
other_renamed = other.select(Polars.all.name.suffix(suffix))
|
|
6662
6634
|
combined = Polars.concat([self, other_renamed], how: "horizontal")
|
|
6663
6635
|
|
|
6664
6636
|
expr = case op
|
|
@@ -6726,5 +6698,268 @@ module Polars
|
|
|
6726
6698
|
yield
|
|
6727
6699
|
end
|
|
6728
6700
|
end
|
|
6701
|
+
|
|
6702
|
+
def get_series_item_by_key(s, key)
|
|
6703
|
+
if key.is_a?(Integer)
|
|
6704
|
+
return s._s.get_index_signed(key)
|
|
6705
|
+
|
|
6706
|
+
elsif key.is_a?(Range)
|
|
6707
|
+
return _select_elements_by_slice(s, key)
|
|
6708
|
+
|
|
6709
|
+
elsif key.is_a?(::Array)
|
|
6710
|
+
if key.empty?
|
|
6711
|
+
return s.clear
|
|
6712
|
+
end
|
|
6713
|
+
|
|
6714
|
+
first = key[0]
|
|
6715
|
+
if Utils.bool?(first)
|
|
6716
|
+
_raise_on_boolean_mask
|
|
6717
|
+
end
|
|
6718
|
+
|
|
6719
|
+
begin
|
|
6720
|
+
indices = Series.new("", key, dtype: Int64)
|
|
6721
|
+
rescue TypeError
|
|
6722
|
+
msg = "cannot select elements using Sequence with elements of type #{first.class.name.inspect}"
|
|
6723
|
+
raise TypeError, msg
|
|
6724
|
+
end
|
|
6725
|
+
|
|
6726
|
+
indices = _convert_series_to_indices(indices, s.len)
|
|
6727
|
+
return _select_elements_by_index(s, indices)
|
|
6728
|
+
|
|
6729
|
+
elsif key.is_a?(Series)
|
|
6730
|
+
indices = _convert_series_to_indices(key, s.len)
|
|
6731
|
+
return _select_elements_by_index(s, indices)
|
|
6732
|
+
end
|
|
6733
|
+
|
|
6734
|
+
msg = "cannot select elements using key of type #{key.class.name.inspect}: #{key.inspect}"
|
|
6735
|
+
raise TypeError, msg
|
|
6736
|
+
end
|
|
6737
|
+
|
|
6738
|
+
def _select_elements_by_slice(s, key)
|
|
6739
|
+
Slice.new(s).apply(key)
|
|
6740
|
+
end
|
|
6741
|
+
|
|
6742
|
+
def _select_elements_by_index(s, key)
|
|
6743
|
+
s.send(:_from_rbseries, s._s.gather_with_series(key._s))
|
|
6744
|
+
end
|
|
6745
|
+
|
|
6746
|
+
def get_df_item_by_key(df, key)
|
|
6747
|
+
if key.size == 2
|
|
6748
|
+
row_key, col_key = key
|
|
6749
|
+
|
|
6750
|
+
# Support df[True, False] and df["a", "b"] as these are not ambiguous
|
|
6751
|
+
if Utils.bool?(row_key) || Utils.strlike?(row_key)
|
|
6752
|
+
return _select_columns(df, key)
|
|
6753
|
+
end
|
|
6754
|
+
|
|
6755
|
+
selection = _select_columns(df, col_key)
|
|
6756
|
+
|
|
6757
|
+
if selection.is_empty
|
|
6758
|
+
return selection
|
|
6759
|
+
elsif selection.is_a?(Series)
|
|
6760
|
+
return get_series_item_by_key(selection, row_key)
|
|
6761
|
+
else
|
|
6762
|
+
return _select_rows(selection, row_key)
|
|
6763
|
+
end
|
|
6764
|
+
end
|
|
6765
|
+
|
|
6766
|
+
key = key[0] if key.size == 1
|
|
6767
|
+
|
|
6768
|
+
# Single string input, e.g. df["a"]
|
|
6769
|
+
if Utils.strlike?(key)
|
|
6770
|
+
# This case is required because empty strings are otherwise treated
|
|
6771
|
+
# as an empty Sequence in `_select_rows`
|
|
6772
|
+
return df.get_column(key)
|
|
6773
|
+
end
|
|
6774
|
+
|
|
6775
|
+
# Single input - df[1] - or multiple inputs - df["a", "b", "c"]
|
|
6776
|
+
begin
|
|
6777
|
+
_select_rows(df, key)
|
|
6778
|
+
rescue TypeError
|
|
6779
|
+
_select_columns(df, key)
|
|
6780
|
+
end
|
|
6781
|
+
end
|
|
6782
|
+
|
|
6783
|
+
def _select_columns(df, key)
|
|
6784
|
+
if key.is_a?(Integer)
|
|
6785
|
+
return df.to_series(key)
|
|
6786
|
+
|
|
6787
|
+
elsif Utils.strlike?(key)
|
|
6788
|
+
return df.get_column(key)
|
|
6789
|
+
|
|
6790
|
+
elsif key.is_a?(Range)
|
|
6791
|
+
start, stop = key.begin, key.end
|
|
6792
|
+
if start.is_a?(::String)
|
|
6793
|
+
start = df.get_column_index(start)
|
|
6794
|
+
stop = df.get_column_index(stop)
|
|
6795
|
+
rng = Range.new(start, stop, key.exclude_end?)
|
|
6796
|
+
return _select_columns_by_index(df, rng)
|
|
6797
|
+
else
|
|
6798
|
+
return _select_columns_by_index(df, key)
|
|
6799
|
+
end
|
|
6800
|
+
|
|
6801
|
+
elsif key.is_a?(::Array)
|
|
6802
|
+
if key.empty?
|
|
6803
|
+
return df.class.new
|
|
6804
|
+
end
|
|
6805
|
+
first = key[0]
|
|
6806
|
+
if Utils.bool?(first)
|
|
6807
|
+
return _select_columns_by_mask(df, key)
|
|
6808
|
+
elsif first.is_a?(Integer)
|
|
6809
|
+
return _select_columns_by_index(df, key)
|
|
6810
|
+
elsif Utils.strlike?(first)
|
|
6811
|
+
return _select_columns_by_name(df, key)
|
|
6812
|
+
else
|
|
6813
|
+
msg = "cannot select columns using Sequence with elements of type #{first.class.name.inspect}"
|
|
6814
|
+
raise TypeError, msg
|
|
6815
|
+
end
|
|
6816
|
+
|
|
6817
|
+
elsif key.is_a?(Series)
|
|
6818
|
+
if key.is_empty
|
|
6819
|
+
return df.class.new
|
|
6820
|
+
end
|
|
6821
|
+
dtype = key.dtype
|
|
6822
|
+
if dtype == String
|
|
6823
|
+
return _select_columns_by_name(df, key)
|
|
6824
|
+
elsif dtype.integer?
|
|
6825
|
+
return _select_columns_by_index(df, key)
|
|
6826
|
+
elsif dtype == Boolean
|
|
6827
|
+
return _select_columns_by_mask(df, key)
|
|
6828
|
+
else
|
|
6829
|
+
msg = "cannot select columns using Series of type #{dtype}"
|
|
6830
|
+
raise TypeError, msg
|
|
6831
|
+
end
|
|
6832
|
+
end
|
|
6833
|
+
|
|
6834
|
+
msg = (
|
|
6835
|
+
"cannot select columns using key of type #{key.class.name.inspect}: #{key.inspect}"
|
|
6836
|
+
)
|
|
6837
|
+
raise TypeError, msg
|
|
6838
|
+
end
|
|
6839
|
+
|
|
6840
|
+
def _select_columns_by_index(df, key)
|
|
6841
|
+
series = key.map { |i| df.to_series(i) }
|
|
6842
|
+
df.class.new(series)
|
|
6843
|
+
end
|
|
6844
|
+
|
|
6845
|
+
def _select_columns_by_name(df, key)
|
|
6846
|
+
df.send(:_from_rbdf, df._df.select(Array(key)))
|
|
6847
|
+
end
|
|
6848
|
+
|
|
6849
|
+
def _select_columns_by_mask(df, key)
|
|
6850
|
+
if key.length != df.width
|
|
6851
|
+
msg = "expected #{df.width} values when selecting columns by boolean mask, got #{key.length}"
|
|
6852
|
+
raise ArgumentError, msg
|
|
6853
|
+
end
|
|
6854
|
+
|
|
6855
|
+
indices = key.each_with_index.filter_map { |val, i| i if val }
|
|
6856
|
+
_select_columns_by_index(df, indices)
|
|
6857
|
+
end
|
|
6858
|
+
|
|
6859
|
+
def _select_rows(df, key)
|
|
6860
|
+
if key.is_a?(Integer)
|
|
6861
|
+
num_rows = df.height
|
|
6862
|
+
if key >= num_rows || key < -num_rows
|
|
6863
|
+
msg = "index #{key} is out of bounds for DataFrame of height #{num_rows}"
|
|
6864
|
+
raise IndexError, msg
|
|
6865
|
+
end
|
|
6866
|
+
return df.slice(key, 1)
|
|
6867
|
+
end
|
|
6868
|
+
|
|
6869
|
+
if key.is_a?(Range)
|
|
6870
|
+
return _select_rows_by_slice(df, key)
|
|
6871
|
+
|
|
6872
|
+
elsif key.is_a?(::Array)
|
|
6873
|
+
if key.empty?
|
|
6874
|
+
return df.clear
|
|
6875
|
+
end
|
|
6876
|
+
if Utils.bool?(key[0])
|
|
6877
|
+
_raise_on_boolean_mask
|
|
6878
|
+
end
|
|
6879
|
+
s = Series.new("", key, dtype: Int64)
|
|
6880
|
+
indices = _convert_series_to_indices(s, df.height)
|
|
6881
|
+
return _select_rows_by_index(df, indices)
|
|
6882
|
+
|
|
6883
|
+
elsif key.is_a?(Series)
|
|
6884
|
+
indices = _convert_series_to_indices(key, df.height)
|
|
6885
|
+
return _select_rows_by_index(df, indices)
|
|
6886
|
+
|
|
6887
|
+
else
|
|
6888
|
+
msg = "cannot select rows using key of type #{key.class.name.inspect}: #{key.inspect}"
|
|
6889
|
+
raise TypeError, msg
|
|
6890
|
+
end
|
|
6891
|
+
end
|
|
6892
|
+
|
|
6893
|
+
def _select_rows_by_slice(df, key)
|
|
6894
|
+
return Slice.new(df).apply(key)
|
|
6895
|
+
end
|
|
6896
|
+
|
|
6897
|
+
def _select_rows_by_index(df, key)
|
|
6898
|
+
df.send(:_from_rbdf, df._df.gather_with_series(key._s))
|
|
6899
|
+
end
|
|
6900
|
+
|
|
6901
|
+
def _convert_series_to_indices(s, size)
|
|
6902
|
+
idx_type = Plr.get_index_type
|
|
6903
|
+
|
|
6904
|
+
if s.dtype == idx_type
|
|
6905
|
+
return s
|
|
6906
|
+
end
|
|
6907
|
+
|
|
6908
|
+
if !s.dtype.integer?
|
|
6909
|
+
if s.dtype == Boolean
|
|
6910
|
+
_raise_on_boolean_mask
|
|
6911
|
+
else
|
|
6912
|
+
msg = "cannot treat Series of type #{s.dtype} as indices"
|
|
6913
|
+
raise TypeError, msg
|
|
6914
|
+
end
|
|
6915
|
+
end
|
|
6916
|
+
|
|
6917
|
+
if s.len == 0
|
|
6918
|
+
return Series.new(s.name, [], dtype: idx_type)
|
|
6919
|
+
end
|
|
6920
|
+
|
|
6921
|
+
if idx_type == UInt32
|
|
6922
|
+
if [Int64, UInt64].include?(s.dtype) && s.max >= Utils::U32_MAX
|
|
6923
|
+
msg = "index positions should be smaller than 2^32"
|
|
6924
|
+
raise ArgumentError, msg
|
|
6925
|
+
end
|
|
6926
|
+
if s.dtype == Int64 && s.min < -Utils::U32_MAX
|
|
6927
|
+
msg = "index positions should be greater than or equal to -2^32"
|
|
6928
|
+
raise ArgumentError, msg
|
|
6929
|
+
end
|
|
6930
|
+
end
|
|
6931
|
+
|
|
6932
|
+
if s.dtype.signed_integer?
|
|
6933
|
+
if s.min < 0
|
|
6934
|
+
if idx_type == UInt32
|
|
6935
|
+
idxs = [Int8, Int16].include?(s.dtype) ? s.cast(Int32) : s
|
|
6936
|
+
else
|
|
6937
|
+
idxs = [Int8, Int16, Int32].include?(s.dtype) ? s.cast(Int64) : s
|
|
6938
|
+
end
|
|
6939
|
+
|
|
6940
|
+
# Update negative indexes to absolute indexes.
|
|
6941
|
+
return (
|
|
6942
|
+
idxs.to_frame
|
|
6943
|
+
.select(
|
|
6944
|
+
F.when(F.col(idxs.name) < 0)
|
|
6945
|
+
.then(size + F.col(idxs.name))
|
|
6946
|
+
.otherwise(F.col(idxs.name))
|
|
6947
|
+
.cast(idx_type)
|
|
6948
|
+
)
|
|
6949
|
+
.to_series(0)
|
|
6950
|
+
)
|
|
6951
|
+
end
|
|
6952
|
+
end
|
|
6953
|
+
|
|
6954
|
+
s.cast(idx_type)
|
|
6955
|
+
end
|
|
6956
|
+
|
|
6957
|
+
def _raise_on_boolean_mask
|
|
6958
|
+
msg = (
|
|
6959
|
+
"selecting rows by passing a boolean mask to `[]` is not supported" +
|
|
6960
|
+
"\n\nHint: Use the `filter` method instead."
|
|
6961
|
+
)
|
|
6962
|
+
raise TypeError, msg
|
|
6963
|
+
end
|
|
6729
6964
|
end
|
|
6730
6965
|
end
|