polars-df 0.21.0-aarch64-linux → 0.22.0-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/Cargo.lock +55 -48
- data/Cargo.toml +3 -0
- data/LICENSE-THIRD-PARTY.txt +23 -49
- data/README.md +12 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/3.4/polars.so +0 -0
- data/lib/polars/array_expr.rb +382 -3
- data/lib/polars/array_name_space.rb +281 -0
- data/lib/polars/binary_expr.rb +67 -0
- data/lib/polars/binary_name_space.rb +43 -0
- data/lib/polars/cat_expr.rb +224 -0
- data/lib/polars/cat_name_space.rb +138 -0
- data/lib/polars/config.rb +2 -2
- data/lib/polars/convert.rb +6 -6
- data/lib/polars/data_frame.rb +794 -27
- data/lib/polars/data_type_expr.rb +52 -0
- data/lib/polars/data_types.rb +26 -5
- data/lib/polars/date_time_expr.rb +252 -1
- data/lib/polars/date_time_name_space.rb +299 -0
- data/lib/polars/expr.rb +1248 -206
- data/lib/polars/functions/business.rb +95 -0
- data/lib/polars/functions/datatype.rb +21 -0
- data/lib/polars/functions/lazy.rb +14 -1
- data/lib/polars/io/csv.rb +1 -1
- data/lib/polars/io/iceberg.rb +27 -0
- data/lib/polars/io/json.rb +4 -4
- data/lib/polars/io/ndjson.rb +4 -4
- data/lib/polars/io/parquet.rb +32 -7
- data/lib/polars/io/scan_options.rb +4 -1
- data/lib/polars/lazy_frame.rb +1028 -28
- data/lib/polars/list_expr.rb +217 -17
- data/lib/polars/list_name_space.rb +231 -22
- data/lib/polars/meta_expr.rb +89 -0
- data/lib/polars/name_expr.rb +36 -0
- data/lib/polars/query_opt_flags.rb +50 -0
- data/lib/polars/scan_cast_options.rb +20 -1
- data/lib/polars/schema.rb +79 -3
- data/lib/polars/selector.rb +72 -0
- data/lib/polars/selectors.rb +3 -3
- data/lib/polars/series.rb +1053 -54
- data/lib/polars/string_expr.rb +436 -32
- data/lib/polars/string_name_space.rb +736 -50
- data/lib/polars/struct_expr.rb +103 -0
- data/lib/polars/struct_name_space.rb +19 -1
- data/lib/polars/utils/serde.rb +17 -0
- data/lib/polars/utils/various.rb +22 -1
- data/lib/polars/utils.rb +5 -1
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +6 -0
- metadata +8 -2
data/lib/polars/lazy_frame.rb
CHANGED
@@ -27,9 +27,6 @@ module Polars
|
|
27
27
|
ldf
|
28
28
|
end
|
29
29
|
|
30
|
-
# def self.from_json
|
31
|
-
# end
|
32
|
-
|
33
30
|
# Read a logical plan from a JSON file to construct a LazyFrame.
|
34
31
|
#
|
35
32
|
# @param file [String]
|
@@ -41,7 +38,49 @@ module Polars
|
|
41
38
|
file = Utils.normalize_filepath(file)
|
42
39
|
end
|
43
40
|
|
44
|
-
Utils.wrap_ldf(RbLazyFrame.
|
41
|
+
Utils.wrap_ldf(RbLazyFrame.deserialize_json(file))
|
42
|
+
end
|
43
|
+
|
44
|
+
# Read a logical plan from a file to construct a LazyFrame.
|
45
|
+
#
|
46
|
+
# @param source [Object]
|
47
|
+
# Path to a file or a file-like object (by file-like object, we refer to
|
48
|
+
# objects that have a `read` method, such as a file handler or `StringIO`).
|
49
|
+
#
|
50
|
+
# @return [LazyFrame]
|
51
|
+
#
|
52
|
+
# @note
|
53
|
+
# This function uses marshaling if the logical plan contains Ruby UDFs,
|
54
|
+
# and as such inherits the security implications. Deserializing can execute
|
55
|
+
# arbitrary code, so it should only be attempted on trusted data.
|
56
|
+
#
|
57
|
+
# @note
|
58
|
+
# Serialization is not stable across Polars versions: a LazyFrame serialized
|
59
|
+
# in one Polars version may not be deserializable in another Polars version.
|
60
|
+
#
|
61
|
+
# @example
|
62
|
+
# lf = Polars::LazyFrame.new({"a" => [1, 2, 3]}).sum
|
63
|
+
# bytes = lf.serialize
|
64
|
+
# Polars::LazyFrame.deserialize(StringIO.new(bytes)).collect
|
65
|
+
# # =>
|
66
|
+
# # shape: (1, 1)
|
67
|
+
# # ┌─────┐
|
68
|
+
# # │ a │
|
69
|
+
# # │ --- │
|
70
|
+
# # │ i64 │
|
71
|
+
# # ╞═════╡
|
72
|
+
# # │ 6 │
|
73
|
+
# # └─────┘
|
74
|
+
def self.deserialize(source)
|
75
|
+
raise Todo unless RbLazyFrame.respond_to?(:deserialize_binary)
|
76
|
+
|
77
|
+
if Utils.pathlike?(source)
|
78
|
+
source = Utils.normalize_filepath(source)
|
79
|
+
end
|
80
|
+
|
81
|
+
deserializer = RbLazyFrame.method(:deserialize_binary)
|
82
|
+
|
83
|
+
_from_rbldf(deserializer.(source))
|
45
84
|
end
|
46
85
|
|
47
86
|
# Get or set column names.
|
@@ -151,6 +190,38 @@ module Polars
|
|
151
190
|
nil
|
152
191
|
end
|
153
192
|
|
193
|
+
# Serialize the logical plan of this LazyFrame to a file or string.
|
194
|
+
#
|
195
|
+
# @param file [Object]
|
196
|
+
# File path to which the result should be written. If set to `nil`
|
197
|
+
# (default), the output is returned as a string instead.
|
198
|
+
#
|
199
|
+
# @return [Object]
|
200
|
+
#
|
201
|
+
# @note
|
202
|
+
# Serialization is not stable across Polars versions: a LazyFrame serialized
|
203
|
+
# in one Polars version may not be deserializable in another Polars version.
|
204
|
+
#
|
205
|
+
# @example Serialize the logical plan into a binary representation.
|
206
|
+
# lf = Polars::LazyFrame.new({"a" => [1, 2, 3]}).sum
|
207
|
+
# bytes = lf.serialize
|
208
|
+
# Polars::LazyFrame.deserialize(StringIO.new(bytes)).collect
|
209
|
+
# # =>
|
210
|
+
# # shape: (1, 1)
|
211
|
+
# # ┌─────┐
|
212
|
+
# # │ a │
|
213
|
+
# # │ --- │
|
214
|
+
# # │ i64 │
|
215
|
+
# # ╞═════╡
|
216
|
+
# # │ 6 │
|
217
|
+
# # └─────┘
|
218
|
+
def serialize(file = nil)
|
219
|
+
raise Todo unless _ldf.respond_to?(:serialize_binary)
|
220
|
+
|
221
|
+
serializer = _ldf.method(:serialize_binary)
|
222
|
+
Utils.serialize_polars_object(serializer, file)
|
223
|
+
end
|
224
|
+
|
154
225
|
# Offers a structured way to apply a sequence of user-defined functions (UDFs).
|
155
226
|
#
|
156
227
|
# @param func [Object]
|
@@ -288,6 +359,201 @@ module Polars
|
|
288
359
|
)
|
289
360
|
end
|
290
361
|
|
362
|
+
# Execute a SQL query against the LazyFrame.
|
363
|
+
#
|
364
|
+
# @note
|
365
|
+
# This functionality is considered **unstable**, although it is close to
|
366
|
+
# being considered stable. It may be changed at any point without it being
|
367
|
+
# considered a breaking change.
|
368
|
+
#
|
369
|
+
# @param query [String]
|
370
|
+
# SQL query to execute.
|
371
|
+
# @param table_name [String]
|
372
|
+
# Optionally provide an explicit name for the table that represents the
|
373
|
+
# calling frame (defaults to "self").
|
374
|
+
#
|
375
|
+
# @return [LazyFrame]
|
376
|
+
#
|
377
|
+
# @note
|
378
|
+
# * The calling frame is automatically registered as a table in the SQL context
|
379
|
+
# under the name "self". If you want access to the DataFrames and LazyFrames
|
380
|
+
# found in the current globals, use the top-level `Polars.sql`.
|
381
|
+
# * More control over registration and execution behaviour is available by
|
382
|
+
# using the `SQLContext` object.
|
383
|
+
#
|
384
|
+
# @example Query the LazyFrame using SQL:
|
385
|
+
# lf1 = Polars::LazyFrame.new({"a" => [1, 2, 3], "b" => [6, 7, 8], "c" => ["z", "y", "x"]})
|
386
|
+
# lf2 = Polars::LazyFrame.new({"a" => [3, 2, 1], "d" => [125, -654, 888]})
|
387
|
+
# lf1.sql("SELECT c, b FROM self WHERE a > 1").collect
|
388
|
+
# # =>
|
389
|
+
# # shape: (2, 2)
|
390
|
+
# # ┌─────┬─────┐
|
391
|
+
# # │ c ┆ b │
|
392
|
+
# # │ --- ┆ --- │
|
393
|
+
# # │ str ┆ i64 │
|
394
|
+
# # ╞═════╪═════╡
|
395
|
+
# # │ y ┆ 7 │
|
396
|
+
# # │ x ┆ 8 │
|
397
|
+
# # └─────┴─────┘
|
398
|
+
#
|
399
|
+
# @example Apply SQL transforms (aliasing "self" to "frame") then filter natively (you can freely mix SQL and native operations):
|
400
|
+
# lf1.sql(
|
401
|
+
# "
|
402
|
+
# SELECT
|
403
|
+
# a,
|
404
|
+
# (a % 2 == 0) AS a_is_even,
|
405
|
+
# (b::float4 / 2) AS \"b/2\",
|
406
|
+
# CONCAT_WS(':', c, c, c) AS c_c_c
|
407
|
+
# FROM frame
|
408
|
+
# ORDER BY a
|
409
|
+
# ",
|
410
|
+
# table_name: "frame",
|
411
|
+
# ).filter(~Polars.col("c_c_c").str.starts_with("x")).collect
|
412
|
+
# # =>
|
413
|
+
# # shape: (2, 4)
|
414
|
+
# # ┌─────┬───────────┬─────┬───────┐
|
415
|
+
# # │ a ┆ a_is_even ┆ b/2 ┆ c_c_c │
|
416
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
417
|
+
# # │ i64 ┆ bool ┆ f32 ┆ str │
|
418
|
+
# # ╞═════╪═══════════╪═════╪═══════╡
|
419
|
+
# # │ 1 ┆ false ┆ 3.0 ┆ z:z:z │
|
420
|
+
# # │ 2 ┆ true ┆ 3.5 ┆ y:y:y │
|
421
|
+
# # └─────┴───────────┴─────┴───────┘
|
422
|
+
def sql(query, table_name: "self")
|
423
|
+
ctx = Polars::SQLContext.new
|
424
|
+
name = table_name || "self"
|
425
|
+
ctx.register(name, self)
|
426
|
+
ctx.execute(query)
|
427
|
+
end
|
428
|
+
|
429
|
+
# Return the `k` largest rows.
|
430
|
+
#
|
431
|
+
# Non-null elements are always preferred over null elements, regardless of
|
432
|
+
# the value of `reverse`. The output is not guaranteed to be in any
|
433
|
+
# particular order; call `sort` after this function if you wish the
|
434
|
+
# output to be sorted.
|
435
|
+
#
|
436
|
+
# @param k [Integer]
|
437
|
+
# Number of rows to return.
|
438
|
+
# @param by [Object]
|
439
|
+
# Column(s) used to determine the top rows.
|
440
|
+
# Accepts expression input. Strings are parsed as column names.
|
441
|
+
# @param reverse [Object]
|
442
|
+
# Consider the `k` smallest elements of the `by` column(s) (instead of the `k`
|
443
|
+
# largest). This can be specified per column by passing a sequence of
|
444
|
+
# booleans.
|
445
|
+
#
|
446
|
+
# @return [LazyFrame]
|
447
|
+
#
|
448
|
+
# @example Get the rows which contain the 4 largest values in column b.
|
449
|
+
# lf = Polars::LazyFrame.new(
|
450
|
+
# {
|
451
|
+
# "a" => ["a", "b", "a", "b", "b", "c"],
|
452
|
+
# "b" => [2, 1, 1, 3, 2, 1]
|
453
|
+
# }
|
454
|
+
# )
|
455
|
+
# lf.top_k(4, by: "b").collect
|
456
|
+
# # =>
|
457
|
+
# # shape: (4, 2)
|
458
|
+
# # ┌─────┬─────┐
|
459
|
+
# # │ a ┆ b │
|
460
|
+
# # │ --- ┆ --- │
|
461
|
+
# # │ str ┆ i64 │
|
462
|
+
# # ╞═════╪═════╡
|
463
|
+
# # │ b ┆ 3 │
|
464
|
+
# # │ a ┆ 2 │
|
465
|
+
# # │ b ┆ 2 │
|
466
|
+
# # │ b ┆ 1 │
|
467
|
+
# # └─────┴─────┘
|
468
|
+
#
|
469
|
+
# @example Get the rows which contain the 4 largest values when sorting on column b and a.
|
470
|
+
# lf.top_k(4, by: ["b", "a"]).collect
|
471
|
+
# # =>
|
472
|
+
# # shape: (4, 2)
|
473
|
+
# # ┌─────┬─────┐
|
474
|
+
# # │ a ┆ b │
|
475
|
+
# # │ --- ┆ --- │
|
476
|
+
# # │ str ┆ i64 │
|
477
|
+
# # ╞═════╪═════╡
|
478
|
+
# # │ b ┆ 3 │
|
479
|
+
# # │ b ┆ 2 │
|
480
|
+
# # │ a ┆ 2 │
|
481
|
+
# # │ c ┆ 1 │
|
482
|
+
# # └─────┴─────┘
|
483
|
+
def top_k(
|
484
|
+
k,
|
485
|
+
by:,
|
486
|
+
reverse: false
|
487
|
+
)
|
488
|
+
by = Utils.parse_into_list_of_expressions(by)
|
489
|
+
reverse = Utils.extend_bool(reverse, by.length, "reverse", "by")
|
490
|
+
_from_rbldf(_ldf.top_k(k, by, reverse))
|
491
|
+
end
|
492
|
+
|
493
|
+
# Return the `k` smallest rows.
|
494
|
+
#
|
495
|
+
# Non-null elements are always preferred over null elements, regardless of
|
496
|
+
# the value of `reverse`. The output is not guaranteed to be in any
|
497
|
+
# particular order; call `sort` after this function if you wish the
|
498
|
+
# output to be sorted.
|
499
|
+
#
|
500
|
+
# @param k [Integer]
|
501
|
+
# Number of rows to return.
|
502
|
+
# @param by [Object]
|
503
|
+
# Column(s) used to determine the bottom rows.
|
504
|
+
# Accepts expression input. Strings are parsed as column names.
|
505
|
+
# @param reverse [Object]
|
506
|
+
# Consider the `k` largest elements of the `by` column(s) (instead of the `k`
|
507
|
+
# smallest). This can be specified per column by passing a sequence of
|
508
|
+
# booleans.
|
509
|
+
#
|
510
|
+
# @return [LazyFrame]
|
511
|
+
#
|
512
|
+
# @example Get the rows which contain the 4 smallest values in column b.
|
513
|
+
# lf = Polars::LazyFrame.new(
|
514
|
+
# {
|
515
|
+
# "a" => ["a", "b", "a", "b", "b", "c"],
|
516
|
+
# "b" => [2, 1, 1, 3, 2, 1]
|
517
|
+
# }
|
518
|
+
# )
|
519
|
+
# lf.bottom_k(4, by: "b").collect
|
520
|
+
# # =>
|
521
|
+
# # shape: (4, 2)
|
522
|
+
# # ┌─────┬─────┐
|
523
|
+
# # │ a ┆ b │
|
524
|
+
# # │ --- ┆ --- │
|
525
|
+
# # │ str ┆ i64 │
|
526
|
+
# # ╞═════╪═════╡
|
527
|
+
# # │ b ┆ 1 │
|
528
|
+
# # │ a ┆ 1 │
|
529
|
+
# # │ c ┆ 1 │
|
530
|
+
# # │ a ┆ 2 │
|
531
|
+
# # └─────┴─────┘
|
532
|
+
#
|
533
|
+
# @example Get the rows which contain the 4 smallest values when sorting on column a and b.
|
534
|
+
# lf.bottom_k(4, by: ["a", "b"]).collect
|
535
|
+
# # =>
|
536
|
+
# # shape: (4, 2)
|
537
|
+
# # ┌─────┬─────┐
|
538
|
+
# # │ a ┆ b │
|
539
|
+
# # │ --- ┆ --- │
|
540
|
+
# # │ str ┆ i64 │
|
541
|
+
# # ╞═════╪═════╡
|
542
|
+
# # │ a ┆ 1 │
|
543
|
+
# # │ a ┆ 2 │
|
544
|
+
# # │ b ┆ 1 │
|
545
|
+
# # │ b ┆ 2 │
|
546
|
+
# # └─────┴─────┘
|
547
|
+
def bottom_k(
|
548
|
+
k,
|
549
|
+
by:,
|
550
|
+
reverse: false
|
551
|
+
)
|
552
|
+
by = Utils.parse_into_list_of_expressions(by)
|
553
|
+
reverse = Utils.extend_bool(reverse, by.length, "reverse", "by")
|
554
|
+
_from_rbldf(_ldf.bottom_k(k, by, reverse))
|
555
|
+
end
|
556
|
+
|
291
557
|
# def profile
|
292
558
|
# end
|
293
559
|
|
@@ -379,6 +645,41 @@ module Polars
|
|
379
645
|
Utils.wrap_df(ldf.collect)
|
380
646
|
end
|
381
647
|
|
648
|
+
# Resolve the schema of this LazyFrame.
|
649
|
+
#
|
650
|
+
# @return [Schema]
|
651
|
+
#
|
652
|
+
# @example Determine the schema.
|
653
|
+
# lf = Polars::LazyFrame.new(
|
654
|
+
# {
|
655
|
+
# "foo" => [1, 2, 3],
|
656
|
+
# "bar" => [6.0, 7.0, 8.0],
|
657
|
+
# "ham" => ["a", "b", "c"]
|
658
|
+
# }
|
659
|
+
# )
|
660
|
+
# lf.collect_schema
|
661
|
+
# # => Polars::Schema({"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String})
|
662
|
+
#
|
663
|
+
# @example Access various properties of the schema.
|
664
|
+
# schema = lf.collect_schema
|
665
|
+
# schema["bar"]
|
666
|
+
# # => Polars::Float64
|
667
|
+
#
|
668
|
+
# @example
|
669
|
+
# schema.names
|
670
|
+
# # => ["foo", "bar", "ham"]
|
671
|
+
#
|
672
|
+
# @example
|
673
|
+
# schema.dtypes
|
674
|
+
# # => [Polars::Int64, Polars::Float64, Polars::String]
|
675
|
+
#
|
676
|
+
# @example
|
677
|
+
# schema.length
|
678
|
+
# # => 3
|
679
|
+
def collect_schema
|
680
|
+
Schema.new(_ldf.collect_schema, check_dtypes: false)
|
681
|
+
end
|
682
|
+
|
382
683
|
# Persists a LazyFrame at the provided path.
|
383
684
|
#
|
384
685
|
# This allows streaming results that are larger than RAM to be written to disk.
|
@@ -544,6 +845,21 @@ module Polars
|
|
544
845
|
# @param maintain_order [Boolean]
|
545
846
|
# Maintain the order in which data is processed.
|
546
847
|
# Setting this to `false` will be slightly faster.
|
848
|
+
# @param storage_options [Hash]
|
849
|
+
# Options that indicate how to connect to a cloud provider.
|
850
|
+
#
|
851
|
+
# The cloud providers currently supported are AWS, GCP, and Azure.
|
852
|
+
# See supported keys here:
|
853
|
+
#
|
854
|
+
# * [aws](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html)
|
855
|
+
# * [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html)
|
856
|
+
# * [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html)
|
857
|
+
# * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
|
858
|
+
#
|
859
|
+
# If `storage_options` is not provided, Polars will try to infer the
|
860
|
+
# information from environment variables.
|
861
|
+
# @param retries [Integer]
|
862
|
+
# Number of retries if accessing a cloud instance fails.
|
547
863
|
# @param type_coercion [Boolean]
|
548
864
|
# Do type coercion optimization.
|
549
865
|
# @param predicate_pushdown [Boolean]
|
@@ -576,6 +892,8 @@ module Polars
|
|
576
892
|
path,
|
577
893
|
compression: "zstd",
|
578
894
|
maintain_order: true,
|
895
|
+
storage_options: nil,
|
896
|
+
retries: 2,
|
579
897
|
type_coercion: true,
|
580
898
|
predicate_pushdown: true,
|
581
899
|
projection_pushdown: true,
|
@@ -586,10 +904,6 @@ module Polars
|
|
586
904
|
mkdir: false,
|
587
905
|
lazy: false
|
588
906
|
)
|
589
|
-
# TODO support storage options in Rust
|
590
|
-
storage_options = nil
|
591
|
-
retries = 2
|
592
|
-
|
593
907
|
lf = _set_sink_optimizations(
|
594
908
|
type_coercion: type_coercion,
|
595
909
|
predicate_pushdown: predicate_pushdown,
|
@@ -1147,6 +1461,140 @@ module Polars
|
|
1147
1461
|
)
|
1148
1462
|
end
|
1149
1463
|
|
1464
|
+
# Remove rows, dropping those that match the given predicate expression(s).
|
1465
|
+
#
|
1466
|
+
# The original order of the remaining rows is preserved.
|
1467
|
+
#
|
1468
|
+
# Rows where the filter predicate does not evaluate to true are retained
|
1469
|
+
# (this includes rows where the predicate evaluates as `null`).
|
1470
|
+
#
|
1471
|
+
# @param predicates [Array]
|
1472
|
+
# Expression that evaluates to a boolean Series.
|
1473
|
+
# @param constraints [Hash]
|
1474
|
+
# Column filters; use `name: value` to filter columns using the supplied
|
1475
|
+
# value. Each constraint behaves the same as `Polars.col(name).eq(value)`,
|
1476
|
+
# and is implicitly joined with the other filter conditions using `&`.
|
1477
|
+
#
|
1478
|
+
# @return [LazyFrame]
|
1479
|
+
#
|
1480
|
+
# @example Remove rows matching a condition:
|
1481
|
+
# lf = Polars::LazyFrame.new(
|
1482
|
+
# {
|
1483
|
+
# "foo" => [2, 3, nil, 4, 0],
|
1484
|
+
# "bar" => [5, 6, nil, nil, 0],
|
1485
|
+
# "ham" => ["a", "b", nil, "c", "d"]
|
1486
|
+
# }
|
1487
|
+
# )
|
1488
|
+
# lf.remove(
|
1489
|
+
# Polars.col("bar") >= 5
|
1490
|
+
# ).collect
|
1491
|
+
# # =>
|
1492
|
+
# # shape: (3, 3)
|
1493
|
+
# # ┌──────┬──────┬──────┐
|
1494
|
+
# # │ foo ┆ bar ┆ ham │
|
1495
|
+
# # │ --- ┆ --- ┆ --- │
|
1496
|
+
# # │ i64 ┆ i64 ┆ str │
|
1497
|
+
# # ╞══════╪══════╪══════╡
|
1498
|
+
# # │ null ┆ null ┆ null │
|
1499
|
+
# # │ 4 ┆ null ┆ c │
|
1500
|
+
# # │ 0 ┆ 0 ┆ d │
|
1501
|
+
# # └──────┴──────┴──────┘
|
1502
|
+
#
|
1503
|
+
# @example Discard rows based on multiple conditions, combined with and/or operators:
|
1504
|
+
# lf.remove(
|
1505
|
+
# (Polars.col("foo") >= 0) & (Polars.col("bar") >= 0)
|
1506
|
+
# ).collect
|
1507
|
+
# # =>
|
1508
|
+
# # shape: (2, 3)
|
1509
|
+
# # ┌──────┬──────┬──────┐
|
1510
|
+
# # │ foo ┆ bar ┆ ham │
|
1511
|
+
# # │ --- ┆ --- ┆ --- │
|
1512
|
+
# # │ i64 ┆ i64 ┆ str │
|
1513
|
+
# # ╞══════╪══════╪══════╡
|
1514
|
+
# # │ null ┆ null ┆ null │
|
1515
|
+
# # │ 4 ┆ null ┆ c │
|
1516
|
+
# # └──────┴──────┴──────┘
|
1517
|
+
#
|
1518
|
+
# @example
|
1519
|
+
# lf.remove(
|
1520
|
+
# (Polars.col("foo") >= 0) | (Polars.col("bar") >= 0)
|
1521
|
+
# ).collect
|
1522
|
+
# # =>
|
1523
|
+
# # shape: (1, 3)
|
1524
|
+
# # ┌──────┬──────┬──────┐
|
1525
|
+
# # │ foo ┆ bar ┆ ham │
|
1526
|
+
# # │ --- ┆ --- ┆ --- │
|
1527
|
+
# # │ i64 ┆ i64 ┆ str │
|
1528
|
+
# # ╞══════╪══════╪══════╡
|
1529
|
+
# # │ null ┆ null ┆ null │
|
1530
|
+
# # └──────┴──────┴──────┘
|
1531
|
+
#
|
1532
|
+
# @example Provide multiple constraints using `*args` syntax:
|
1533
|
+
# lf.remove(
|
1534
|
+
# Polars.col("ham").is_not_null,
|
1535
|
+
# Polars.col("bar") >= 0
|
1536
|
+
# ).collect
|
1537
|
+
# # =>
|
1538
|
+
# # shape: (2, 3)
|
1539
|
+
# # ┌──────┬──────┬──────┐
|
1540
|
+
# # │ foo ┆ bar ┆ ham │
|
1541
|
+
# # │ --- ┆ --- ┆ --- │
|
1542
|
+
# # │ i64 ┆ i64 ┆ str │
|
1543
|
+
# # ╞══════╪══════╪══════╡
|
1544
|
+
# # │ null ┆ null ┆ null │
|
1545
|
+
# # │ 4 ┆ null ┆ c │
|
1546
|
+
# # └──────┴──────┴──────┘
|
1547
|
+
#
|
1548
|
+
# @example Provide constraint(s) using `**kwargs` syntax:
|
1549
|
+
# lf.remove(foo: 0, bar: 0).collect
|
1550
|
+
# # =>
|
1551
|
+
# # shape: (4, 3)
|
1552
|
+
# # ┌──────┬──────┬──────┐
|
1553
|
+
# # │ foo ┆ bar ┆ ham │
|
1554
|
+
# # │ --- ┆ --- ┆ --- │
|
1555
|
+
# # │ i64 ┆ i64 ┆ str │
|
1556
|
+
# # ╞══════╪══════╪══════╡
|
1557
|
+
# # │ 2 ┆ 5 ┆ a │
|
1558
|
+
# # │ 3 ┆ 6 ┆ b │
|
1559
|
+
# # │ null ┆ null ┆ null │
|
1560
|
+
# # │ 4 ┆ null ┆ c │
|
1561
|
+
# # └──────┴──────┴──────┘
|
1562
|
+
#
|
1563
|
+
# @example Remove rows by comparing two columns against each other; in this case, we remove rows where the two columns are not equal (using `ne_missing` to ensure that null values compare equal):
|
1564
|
+
# lf.remove(
|
1565
|
+
# Polars.col("foo").ne_missing(Polars.col("bar"))
|
1566
|
+
# ).collect
|
1567
|
+
# # =>
|
1568
|
+
# # shape: (2, 3)
|
1569
|
+
# # ┌──────┬──────┬──────┐
|
1570
|
+
# # │ foo ┆ bar ┆ ham │
|
1571
|
+
# # │ --- ┆ --- ┆ --- │
|
1572
|
+
# # │ i64 ┆ i64 ┆ str │
|
1573
|
+
# # ╞══════╪══════╪══════╡
|
1574
|
+
# # │ null ┆ null ┆ null │
|
1575
|
+
# # │ 0 ┆ 0 ┆ d │
|
1576
|
+
# # └──────┴──────┴──────┘
|
1577
|
+
def remove(
|
1578
|
+
*predicates,
|
1579
|
+
**constraints
|
1580
|
+
)
|
1581
|
+
if constraints.empty?
|
1582
|
+
# early-exit conditions (exclude/include all rows)
|
1583
|
+
if predicates.empty? || (predicates.length == 1 && predicates[0].is_a?(TrueClass))
|
1584
|
+
return clear
|
1585
|
+
end
|
1586
|
+
if predicates.length == 1 && predicates[0].is_a?(FalseClass)
|
1587
|
+
return dup
|
1588
|
+
end
|
1589
|
+
end
|
1590
|
+
|
1591
|
+
_filter(
|
1592
|
+
predicates: predicates,
|
1593
|
+
constraints: constraints,
|
1594
|
+
invert: true
|
1595
|
+
)
|
1596
|
+
end
|
1597
|
+
|
1150
1598
|
# Select columns from this DataFrame.
|
1151
1599
|
#
|
1152
1600
|
# @param exprs [Array]
|
@@ -1244,6 +1692,29 @@ module Polars
|
|
1244
1692
|
_from_rbldf(_ldf.select(rbexprs))
|
1245
1693
|
end
|
1246
1694
|
|
1695
|
+
# Select columns from this LazyFrame.
|
1696
|
+
#
|
1697
|
+
# This will run all expressions sequentially instead of in parallel.
|
1698
|
+
# Use this when the work per expression is cheap.
|
1699
|
+
#
|
1700
|
+
# @param exprs [Array]
|
1701
|
+
# Column(s) to select, specified as positional arguments.
|
1702
|
+
# Accepts expression input. Strings are parsed as column names,
|
1703
|
+
# other non-expression inputs are parsed as literals.
|
1704
|
+
# @param named_exprs [Hash]
|
1705
|
+
# Additional columns to select, specified as keyword arguments.
|
1706
|
+
# The columns will be renamed to the keyword used.
|
1707
|
+
#
|
1708
|
+
# @return [LazyFrame]
|
1709
|
+
def select_seq(*exprs, **named_exprs)
|
1710
|
+
structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", 0).to_i != 0
|
1711
|
+
|
1712
|
+
rbexprs = Utils.parse_into_list_of_expressions(
|
1713
|
+
*exprs, **named_exprs, __structify: structify
|
1714
|
+
)
|
1715
|
+
_from_rbldf(_ldf.select_seq(rbexprs))
|
1716
|
+
end
|
1717
|
+
|
1247
1718
|
# Start a group by operation.
|
1248
1719
|
#
|
1249
1720
|
# @param by [Array]
|
@@ -1440,9 +1911,9 @@ module Polars
|
|
1440
1911
|
# @param every [Object]
|
1441
1912
|
# Interval of the window.
|
1442
1913
|
# @param period [Object]
|
1443
|
-
# Length of the window, if
|
1914
|
+
# Length of the window; if nil, it is equal to `every`.
|
1444
1915
|
# @param offset [Object]
|
1445
|
-
# Offset of the window if
|
1916
|
+
# Offset of the window; if nil and period is nil, it will be equal to negative
|
1446
1917
|
# `every`.
|
1447
1918
|
# @param truncate [Boolean]
|
1448
1919
|
# Truncate the time value to the window lower bound.
|
@@ -1714,7 +2185,7 @@ module Polars
|
|
1714
2185
|
# Join column of the right DataFrame.
|
1715
2186
|
# @param on [String]
|
1716
2187
|
# Join column of both DataFrames. If set, `left_on` and `right_on` should be
|
1717
|
-
#
|
2188
|
+
# nil.
|
1718
2189
|
# @param by_left [Object]
|
1719
2190
|
# Join on these columns before doing asof join.
|
1720
2191
|
# @param by_right [Object]
|
@@ -2039,7 +2510,7 @@ module Polars
|
|
2039
2510
|
# Join column of the right DataFrame.
|
2040
2511
|
# @param on [Object]
|
2041
2512
|
# Join column of both DataFrames. If set, `left_on` and `right_on` should be
|
2042
|
-
#
|
2513
|
+
# nil.
|
2043
2514
|
# @param how ["inner", "left", "full", "semi", "anti", "cross"]
|
2044
2515
|
# Join strategy.
|
2045
2516
|
# @param suffix [String]
|
@@ -2234,6 +2705,103 @@ module Polars
|
|
2234
2705
|
)
|
2235
2706
|
end
|
2236
2707
|
|
2708
|
+
# Perform a join based on one or multiple (in)equality predicates.
|
2709
|
+
#
|
2710
|
+
# This performs an inner join, so only rows where all predicates are true
|
2711
|
+
# are included in the result, and a row from either DataFrame may be included
|
2712
|
+
# multiple times in the result.
|
2713
|
+
#
|
2714
|
+
# @note
|
2715
|
+
# The row order of the input DataFrames is not preserved.
|
2716
|
+
#
|
2717
|
+
# @note
|
2718
|
+
# This functionality is experimental. It may be
|
2719
|
+
# changed at any point without it being considered a breaking change.
|
2720
|
+
#
|
2721
|
+
# @param other [Object]
|
2722
|
+
# DataFrame to join with.
|
2723
|
+
# @param predicates [Object]
|
2724
|
+
# (In)Equality condition to join the two tables on.
|
2725
|
+
# When a column name occurs in both tables, the proper suffix must
|
2726
|
+
# be applied in the predicate.
|
2727
|
+
# @param suffix [String]
|
2728
|
+
# Suffix to append to columns with a duplicate name.
|
2729
|
+
#
|
2730
|
+
# @return [LazyFrame]
|
2731
|
+
#
|
2732
|
+
# @example Join two lazyframes together based on two predicates which get AND-ed together.
|
2733
|
+
# east = Polars::LazyFrame.new(
|
2734
|
+
# {
|
2735
|
+
# "id" => [100, 101, 102],
|
2736
|
+
# "dur" => [120, 140, 160],
|
2737
|
+
# "rev" => [12, 14, 16],
|
2738
|
+
# "cores" => [2, 8, 4]
|
2739
|
+
# }
|
2740
|
+
# )
|
2741
|
+
# west = Polars::LazyFrame.new(
|
2742
|
+
# {
|
2743
|
+
# "t_id" => [404, 498, 676, 742],
|
2744
|
+
# "time" => [90, 130, 150, 170],
|
2745
|
+
# "cost" => [9, 13, 15, 16],
|
2746
|
+
# "cores" => [4, 2, 1, 4]
|
2747
|
+
# }
|
2748
|
+
# )
|
2749
|
+
# east.join_where(
|
2750
|
+
# west,
|
2751
|
+
# Polars.col("dur") < Polars.col("time"),
|
2752
|
+
# Polars.col("rev") < Polars.col("cost")
|
2753
|
+
# ).collect
|
2754
|
+
# # =>
|
2755
|
+
# # shape: (5, 8)
|
2756
|
+
# # ┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐
|
2757
|
+
# # │ id ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │
|
2758
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
2759
|
+
# # │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
|
2760
|
+
# # ╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡
|
2761
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 498 ┆ 130 ┆ 13 ┆ 2 │
|
2762
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
|
2763
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
|
2764
|
+
# # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
|
2765
|
+
# # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
|
2766
|
+
# # └─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
|
2767
|
+
#
|
2768
|
+
# @example To OR them together, use a single expression and the `|` operator.
|
2769
|
+
# east.join_where(
|
2770
|
+
# west,
|
2771
|
+
# (Polars.col("dur") < Polars.col("time")) | (Polars.col("rev") < Polars.col("cost"))
|
2772
|
+
# ).collect
|
2773
|
+
# # =>
|
2774
|
+
# # shape: (6, 8)
|
2775
|
+
# # ┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐
|
2776
|
+
# # │ id ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │
|
2777
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
2778
|
+
# # │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
|
2779
|
+
# # ╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡
|
2780
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 498 ┆ 130 ┆ 13 ┆ 2 │
|
2781
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
|
2782
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
|
2783
|
+
# # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
|
2784
|
+
# # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
|
2785
|
+
# # │ 102 ┆ 160 ┆ 16 ┆ 4 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
|
2786
|
+
# # └─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
|
2787
|
+
def join_where(
|
2788
|
+
other,
|
2789
|
+
*predicates,
|
2790
|
+
suffix: "_right"
|
2791
|
+
)
|
2792
|
+
Utils.require_same_type(self, other)
|
2793
|
+
|
2794
|
+
rbexprs = Utils.parse_into_list_of_expressions(*predicates)
|
2795
|
+
|
2796
|
+
_from_rbldf(
|
2797
|
+
_ldf.join_where(
|
2798
|
+
other._ldf,
|
2799
|
+
rbexprs,
|
2800
|
+
suffix
|
2801
|
+
)
|
2802
|
+
)
|
2803
|
+
end
|
2804
|
+
|
2237
2805
|
# Add or overwrite multiple columns in a DataFrame.
|
2238
2806
|
#
|
2239
2807
|
# @param exprs [Object]
|
@@ -2279,6 +2847,34 @@ module Polars
|
|
2279
2847
|
_from_rbldf(_ldf.with_columns(rbexprs))
|
2280
2848
|
end
|
2281
2849
|
|
2850
|
+
# Add columns to this LazyFrame.
|
2851
|
+
#
|
2852
|
+
# Added columns will replace existing columns with the same name.
|
2853
|
+
#
|
2854
|
+
# This will run all expressions sequentially instead of in parallel.
|
2855
|
+
# Use this when the work per expression is cheap.
|
2856
|
+
#
|
2857
|
+
# @param exprs [Array]
|
2858
|
+
# Column(s) to add, specified as positional arguments.
|
2859
|
+
# Accepts expression input. Strings are parsed as column names, other
|
2860
|
+
# non-expression inputs are parsed as literals.
|
2861
|
+
# @param named_exprs [Hash]
|
2862
|
+
# Additional columns to add, specified as keyword arguments.
|
2863
|
+
# The columns will be renamed to the keyword used.
|
2864
|
+
#
|
2865
|
+
# @return [LazyFrame]
|
2866
|
+
def with_columns_seq(
|
2867
|
+
*exprs,
|
2868
|
+
**named_exprs
|
2869
|
+
)
|
2870
|
+
structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", 0).to_i != 0
|
2871
|
+
|
2872
|
+
rbexprs = Utils.parse_into_list_of_expressions(
|
2873
|
+
*exprs, **named_exprs, __structify: structify
|
2874
|
+
)
|
2875
|
+
_from_rbldf(_ldf.with_columns_seq(rbexprs))
|
2876
|
+
end
|
2877
|
+
|
2282
2878
|
# Add an external context to the computation graph.
|
2283
2879
|
#
|
2284
2880
|
# This allows expressions to also access columns from DataFrames
|
@@ -2887,7 +3483,7 @@ module Polars
|
|
2887
3483
|
#
|
2888
3484
|
# @example
|
2889
3485
|
# s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]}).lazy
|
2890
|
-
# s.
|
3486
|
+
# s.gather_every(2).collect
|
2891
3487
|
# # =>
|
2892
3488
|
# # shape: (2, 2)
|
2893
3489
|
# # ┌─────┬─────┐
|
@@ -2898,9 +3494,10 @@ module Polars
|
|
2898
3494
|
# # │ 1 ┆ 5 │
|
2899
3495
|
# # │ 3 ┆ 7 │
|
2900
3496
|
# # └─────┴─────┘
|
2901
|
-
def
|
2902
|
-
select(F.col("*").
|
3497
|
+
def gather_every(n)
|
3498
|
+
select(F.col("*").gather_every(n))
|
2903
3499
|
end
|
3500
|
+
alias_method :take_every, :gather_every
|
2904
3501
|
|
2905
3502
|
# Fill null values using the specified value or strategy.
|
2906
3503
|
#
|
@@ -3177,6 +3774,32 @@ module Polars
|
|
3177
3774
|
_from_rbldf(_ldf.median)
|
3178
3775
|
end
|
3179
3776
|
|
3777
|
+
# Aggregate the columns in the LazyFrame as the sum of their null value count.
|
3778
|
+
#
|
3779
|
+
# @return [LazyFrame]
|
3780
|
+
#
|
3781
|
+
# @example
|
3782
|
+
# lf = Polars::LazyFrame.new(
|
3783
|
+
# {
|
3784
|
+
# "foo" => [1, nil, 3],
|
3785
|
+
# "bar" => [6, 7, nil],
|
3786
|
+
# "ham" => ["a", "b", "c"]
|
3787
|
+
# }
|
3788
|
+
# )
|
3789
|
+
# lf.null_count.collect
|
3790
|
+
# # =>
|
3791
|
+
# # shape: (1, 3)
|
3792
|
+
# # ┌─────┬─────┬─────┐
|
3793
|
+
# # │ foo ┆ bar ┆ ham │
|
3794
|
+
# # │ --- ┆ --- ┆ --- │
|
3795
|
+
# # │ u32 ┆ u32 ┆ u32 │
|
3796
|
+
# # ╞═════╪═════╪═════╡
|
3797
|
+
# # │ 1 ┆ 1 ┆ 0 │
|
3798
|
+
# # └─────┴─────┴─────┘
|
3799
|
+
def null_count
|
3800
|
+
_from_rbldf(_ldf.null_count)
|
3801
|
+
end
|
3802
|
+
|
3180
3803
|
# Aggregate the columns in the DataFrame to their quantile value.
|
3181
3804
|
#
|
3182
3805
|
# @param quantile [Float]
|
@@ -3307,37 +3930,103 @@ module Polars
|
|
3307
3930
|
_from_rbldf(_ldf.unique(maintain_order, selector_subset, keep))
|
3308
3931
|
end
|
3309
3932
|
|
3310
|
-
# Drop rows
|
3933
|
+
# Drop all rows that contain one or more NaN values.
|
3934
|
+
#
|
3935
|
+
# The original order of the remaining rows is preserved.
|
3311
3936
|
#
|
3312
3937
|
# @param subset [Object]
|
3313
|
-
#
|
3938
|
+
# Column name(s) for which NaN values are considered; if set to `nil`
|
3939
|
+
# (default), use all columns (note that only floating-point columns
|
3940
|
+
# can contain NaNs).
|
3314
3941
|
#
|
3315
3942
|
# @return [LazyFrame]
|
3316
3943
|
#
|
3317
3944
|
# @example
|
3318
|
-
#
|
3945
|
+
# lf = Polars::LazyFrame.new(
|
3946
|
+
# {
|
3947
|
+
# "foo" => [-20.5, Float::NAN, 80.0],
|
3948
|
+
# "bar" => [Float::NAN, 110.0, 25.5],
|
3949
|
+
# "ham" => ["xxx", "yyy", nil]
|
3950
|
+
# }
|
3951
|
+
# )
|
3952
|
+
# lf.drop_nans.collect
|
3953
|
+
# # =>
|
3954
|
+
# # shape: (1, 3)
|
3955
|
+
# # ┌──────┬──────┬──────┐
|
3956
|
+
# # │ foo ┆ bar ┆ ham │
|
3957
|
+
# # │ --- ┆ --- ┆ --- │
|
3958
|
+
# # │ f64 ┆ f64 ┆ str │
|
3959
|
+
# # ╞══════╪══════╪══════╡
|
3960
|
+
# # │ 80.0 ┆ 25.5 ┆ null │
|
3961
|
+
# # └──────┴──────┴──────┘
|
3962
|
+
#
|
3963
|
+
# @example
|
3964
|
+
# lf.drop_nans(subset: ["bar"]).collect
|
3965
|
+
# # =>
|
3966
|
+
# # shape: (2, 3)
|
3967
|
+
# # ┌──────┬───────┬──────┐
|
3968
|
+
# # │ foo ┆ bar ┆ ham │
|
3969
|
+
# # │ --- ┆ --- ┆ --- │
|
3970
|
+
# # │ f64 ┆ f64 ┆ str │
|
3971
|
+
# # ╞══════╪═══════╪══════╡
|
3972
|
+
# # │ NaN ┆ 110.0 ┆ yyy │
|
3973
|
+
# # │ 80.0 ┆ 25.5 ┆ null │
|
3974
|
+
# # └──────┴───────┴──────┘
|
3975
|
+
def drop_nans(subset: nil)
  # A nil subset is forwarded as nil; anything else is first converted to a
  # native selector.
  selector_subset =
    if subset.nil?
      nil
    else
      Utils.parse_list_into_selector(subset)._rbselector
    end

  _from_rbldf(_ldf.drop_nans(selector_subset))
end
|
3982
|
+
|
3983
|
+
# Drop all rows that contain one or more null values.
|
3984
|
+
#
|
3985
|
+
# The original order of the remaining rows is preserved.
|
3986
|
+
#
|
3987
|
+
# @param subset [Object]
|
3988
|
+
# Column name(s) for which null values are considered.
|
3989
|
+
# If set to `nil` (default), use all columns.
|
3990
|
+
#
|
3991
|
+
# @return [LazyFrame]
|
3992
|
+
#
|
3993
|
+
# @example
|
3994
|
+
# lf = Polars::LazyFrame.new(
|
3319
3995
|
# {
|
3320
3996
|
# "foo" => [1, 2, 3],
|
3321
3997
|
# "bar" => [6, nil, 8],
|
3322
|
-
# "ham" => ["a", "b",
|
3998
|
+
# "ham" => ["a", "b", nil]
|
3323
3999
|
# }
|
3324
4000
|
# )
|
3325
|
-
#
|
4001
|
+
# lf.drop_nulls.collect
|
3326
4002
|
# # =>
|
3327
|
-
# # shape: (
|
4003
|
+
# # shape: (1, 3)
|
3328
4004
|
# # ┌─────┬─────┬─────┐
|
3329
4005
|
# # │ foo ┆ bar ┆ ham │
|
3330
4006
|
# # │ --- ┆ --- ┆ --- │
|
3331
4007
|
# # │ i64 ┆ i64 ┆ str │
|
3332
4008
|
# # ╞═════╪═════╪═════╡
|
3333
4009
|
# # │ 1 ┆ 6 ┆ a │
|
3334
|
-
# # │ 3 ┆ 8 ┆ c │
|
3335
4010
|
# # └─────┴─────┴─────┘
|
4011
|
+
#
|
4012
|
+
# @example
|
4013
|
+
# lf.drop_nulls(subset: Polars.cs.integer).collect
|
4014
|
+
# # =>
|
4015
|
+
# # shape: (2, 3)
|
4016
|
+
# # ┌─────┬─────┬──────┐
|
4017
|
+
# # │ foo ┆ bar ┆ ham │
|
4018
|
+
# # │ --- ┆ --- ┆ --- │
|
4019
|
+
# # │ i64 ┆ i64 ┆ str │
|
4020
|
+
# # ╞═════╪═════╪══════╡
|
4021
|
+
# # │ 1 ┆ 6 ┆ a │
|
4022
|
+
# # │ 3 ┆ 8 ┆ null │
|
4023
|
+
# # └─────┴─────┴──────┘
|
3336
4024
|
def drop_nulls(subset: nil)
  # A nil subset is forwarded as nil; anything else is first converted to a
  # native selector.
  selector_subset =
    if subset.nil?
      nil
    else
      Utils.parse_list_into_selector(subset)._rbselector
    end

  _from_rbldf(_ldf.drop_nulls(selector_subset))
end
|
3342
4031
|
|
3343
4032
|
# Unpivot a DataFrame from wide to long format.
|
@@ -3571,9 +4260,261 @@ module Polars
|
|
3571
4260
|
with_columns(F.col(column).set_sorted(descending: descending))
|
3572
4261
|
end
|
3573
4262
|
|
3574
|
-
#
|
3575
|
-
#
|
3576
|
-
#
|
4263
|
+
# Update the values in this `LazyFrame` with the values in `other`.
|
4264
|
+
#
|
4265
|
+
# @note
|
4266
|
+
# This functionality is considered **unstable**. It may be changed
|
4267
|
+
# at any point without it being considered a breaking change.
|
4268
|
+
#
|
4269
|
+
# @param other [LazyFrame]
|
4270
|
+
# LazyFrame that will be used to update the values
|
4271
|
+
# @param on [Object]
|
4272
|
+
# Column names that will be joined on. If set to `nil` (default),
|
4273
|
+
# the implicit row index of each frame is used as a join key.
|
4274
|
+
# @param how ['left', 'inner', 'full']
|
4275
|
+
# * 'left' will keep all rows from the left table; rows may be duplicated
|
4276
|
+
# if multiple rows in the right frame match the left row's key.
|
4277
|
+
# * 'inner' keeps only those rows where the key exists in both frames.
|
4278
|
+
# * 'full' will update existing rows where the key matches while also
|
4279
|
+
# adding any new rows contained in the given frame.
|
4280
|
+
# @param left_on [Object]
|
4281
|
+
# Join column(s) of the left DataFrame.
|
4282
|
+
# @param right_on [Object]
|
4283
|
+
# Join column(s) of the right DataFrame.
|
4284
|
+
# @param include_nulls [Boolean]
|
4285
|
+
# Overwrite values in the left frame with null values from the right frame.
|
4286
|
+
# If set to `false` (default), null values in the right frame are ignored.
|
4287
|
+
# @param maintain_order ['none', 'left', 'right', 'left_right', 'right_left']
|
4288
|
+
# Which order of rows from the inputs to preserve. See `LazyFrame.join`
|
4289
|
+
# for details. Unlike `join` this function preserves the left order by
|
4290
|
+
# default.
|
4291
|
+
#
|
4292
|
+
# @return [LazyFrame]
|
4293
|
+
#
|
4294
|
+
# @note
|
4295
|
+
# This is syntactic sugar for a left/inner join that preserves the order
|
4296
|
+
# of the left `DataFrame` by default, with an optional coalesce when
|
4297
|
+
# `include_nulls: false`.
|
4298
|
+
#
|
4299
|
+
# @example Update `lf` values with the non-null values in `new_lf`, by row index:
|
4300
|
+
# lf = Polars::LazyFrame.new(
|
4301
|
+
# {
|
4302
|
+
# "A" => [1, 2, 3, 4],
|
4303
|
+
# "B" => [400, 500, 600, 700]
|
4304
|
+
# }
|
4305
|
+
# )
|
4306
|
+
# new_lf = Polars::LazyFrame.new(
|
4307
|
+
# {
|
4308
|
+
# "B" => [-66, nil, -99],
|
4309
|
+
# "C" => [5, 3, 1]
|
4310
|
+
# }
|
4311
|
+
# )
|
4312
|
+
# lf.update(new_lf).collect
|
4313
|
+
# # =>
|
4314
|
+
# # shape: (4, 2)
|
4315
|
+
# # ┌─────┬─────┐
|
4316
|
+
# # │ A ┆ B │
|
4317
|
+
# # │ --- ┆ --- │
|
4318
|
+
# # │ i64 ┆ i64 │
|
4319
|
+
# # ╞═════╪═════╡
|
4320
|
+
# # │ 1 ┆ -66 │
|
4321
|
+
# # │ 2 ┆ 500 │
|
4322
|
+
# # │ 3 ┆ -99 │
|
4323
|
+
# # │ 4 ┆ 700 │
|
4324
|
+
# # └─────┴─────┘
|
4325
|
+
#
|
4326
|
+
# @example Update `lf` values with the non-null values in `new_lf`, by row index, but only keeping those rows that are common to both frames:
|
4327
|
+
# lf.update(new_lf, how: "inner").collect
|
4328
|
+
# # =>
|
4329
|
+
# # shape: (3, 2)
|
4330
|
+
# # ┌─────┬─────┐
|
4331
|
+
# # │ A ┆ B │
|
4332
|
+
# # │ --- ┆ --- │
|
4333
|
+
# # │ i64 ┆ i64 │
|
4334
|
+
# # ╞═════╪═════╡
|
4335
|
+
# # │ 1 ┆ -66 │
|
4336
|
+
# # │ 2 ┆ 500 │
|
4337
|
+
# # │ 3 ┆ -99 │
|
4338
|
+
# # └─────┴─────┘
|
4339
|
+
#
|
4340
|
+
# @example Update `lf` values with the non-null values in `new_lf`, using a full outer join strategy that defines explicit join columns in each frame:
|
4341
|
+
# lf.update(new_lf, left_on: ["A"], right_on: ["C"], how: "full").collect
|
4342
|
+
# # =>
|
4343
|
+
# # shape: (5, 2)
|
4344
|
+
# # ┌─────┬─────┐
|
4345
|
+
# # │ A ┆ B │
|
4346
|
+
# # │ --- ┆ --- │
|
4347
|
+
# # │ i64 ┆ i64 │
|
4348
|
+
# # ╞═════╪═════╡
|
4349
|
+
# # │ 1 ┆ -99 │
|
4350
|
+
# # │ 2 ┆ 500 │
|
4351
|
+
# # │ 3 ┆ 600 │
|
4352
|
+
# # │ 4 ┆ 700 │
|
4353
|
+
# # │ 5 ┆ -66 │
|
4354
|
+
# # └─────┴─────┘
|
4355
|
+
#
|
4356
|
+
# @example Update `lf` values including null values in `new_lf`, using a full outer join strategy that defines explicit join columns in each frame:
|
4357
|
+
# lf.update(
|
4358
|
+
# new_lf, left_on: "A", right_on: "C", how: "full", include_nulls: true
|
4359
|
+
# ).collect
|
4360
|
+
# # =>
|
4361
|
+
# # shape: (5, 2)
|
4362
|
+
# # ┌─────┬──────┐
|
4363
|
+
# # │ A ┆ B │
|
4364
|
+
# # │ --- ┆ --- │
|
4365
|
+
# # │ i64 ┆ i64 │
|
4366
|
+
# # ╞═════╪══════╡
|
4367
|
+
# # │ 1 ┆ -99 │
|
4368
|
+
# # │ 2 ┆ 500 │
|
4369
|
+
# # │ 3 ┆ null │
|
4370
|
+
# # │ 4 ┆ 700 │
|
4371
|
+
# # │ 5 ┆ -66 │
|
4372
|
+
# # └─────┴──────┘
|
4373
|
+
def update(
  other,
  on: nil,
  how: "left",
  left_on: nil,
  right_on: nil,
  include_nulls: false,
  maintain_order: "left"
)
  Utils.require_same_type(self, other)

  # "outer" and "outer_coalesce" are treated as synonyms for "full"
  how = "full" if ["outer", "outer_coalesce"].include?(how)

  unless ["left", "inner", "full"].include?(how)
    msg = "`how` must be one of {'left', 'inner', 'full'}; found #{how.inspect}"
    raise ArgumentError, msg
  end

  slf = self
  row_index_used = false
  row_index_name = "__POLARS_ROW_INDEX"
  if !on.nil?
    # move `on` into left/right_on to simplify the logic below
    left_on = right_on = on
  elsif left_on.nil? && right_on.nil?
    # no keys provided--join on a temporary row index added to both frames
    row_index_used = true
    slf = slf.with_row_index(name: row_index_name)
    other = other.with_row_index(name: row_index_name)
    left_on = right_on = [row_index_name]
  elsif left_on.nil?
    raise ArgumentError, "missing join columns for left frame"
  elsif right_on.nil?
    raise ArgumentError, "missing join columns for right frame"
  end

  # Accept a single column name (String or Symbol) as well as a list of names,
  # and compare by string so that symbol names match the schema keys used below.
  normalize_keys = lambda do |keys|
    keys = [keys] if keys.is_a?(::String) || keys.is_a?(Symbol)
    keys.map { |key| key.is_a?(Symbol) ? key.to_s : key }
  end
  left_on = normalize_keys.(left_on)
  right_on = normalize_keys.(right_on)

  # every join column must exist in its own frame
  left_schema = slf.collect_schema
  left_on.each do |name|
    next if left_schema.include?(name)

    msg = "left join column #{name.inspect} not found"
    raise ArgumentError, msg
  end
  right_schema = other.collect_schema
  right_on.each do |name|
    next if right_schema.include?(name)

    msg = "right join column #{name.inspect} not found"
    raise ArgumentError, msg
  end

  # no need to join if *only* join columns are in other (inner/left update only)
  if how != "full" && right_schema.length == right_on.length
    return row_index_used ? slf.drop(row_index_name) : slf
  end

  # only use non-key right columns that are also present in the left frame
  right_other = Set.new(right_schema.to_h.keys).intersection(left_schema.to_h.keys) - Set.new(right_on)

  # When include_nulls is true, we need to distinguish records after the join that
  # were originally null in the right frame, as opposed to records that were null
  # because the key was missing from the right frame.
  # Add a validity column to track whether the row was matched or not.
  if include_nulls
    validity = ["__POLARS_VALIDITY"]
    other = other.with_columns(F.lit(true).alias(validity[0]))
  else
    validity = []
  end

  # right-hand copies of shared columns get this suffix in the join output
  tmp_name = "__POLARS_RIGHT"
  drop_columns = right_other.map { |name| "#{name}#{tmp_name}" } + validity

  # one replacement expression per shared column
  updated_columns = right_other.map do |name|
    merged =
      if include_nulls
        # use left value only when right value failed to join
        F.when(F.col(validity).is_null)
          .then(F.col(name))
          .otherwise(F.col("#{name}#{tmp_name}"))
      else
        # prefer the right value, falling back to the left one when it is null
        F.coalesce(["#{name}#{tmp_name}", F.col(name)])
      end
    merged.alias(name)
  end

  result =
    slf.join(
      other.select(*right_on, *right_other, *validity),
      left_on: left_on,
      right_on: right_on,
      how: how,
      suffix: tmp_name,
      coalesce: true,
      maintain_order: maintain_order
    )
    .with_columns(updated_columns)
    .drop(drop_columns)

  # the temporary row index must not leak into the result
  result = result.drop(row_index_name) if row_index_used

  _from_rbldf(result._ldf)
end
|
4496
|
+
|
4497
|
+
# Return the number of non-null elements for each column.
|
4498
|
+
#
|
4499
|
+
# @return [LazyFrame]
|
4500
|
+
#
|
4501
|
+
# @example
|
4502
|
+
# lf = Polars::LazyFrame.new(
|
4503
|
+
# {"a" => [1, 2, 3, 4], "b" => [1, 2, 1, nil], "c" => [nil, nil, nil, nil]}
|
4504
|
+
# )
|
4505
|
+
# lf.count.collect
|
4506
|
+
# # =>
|
4507
|
+
# # shape: (1, 3)
|
4508
|
+
# # ┌─────┬─────┬─────┐
|
4509
|
+
# # │ a ┆ b ┆ c │
|
4510
|
+
# # │ --- ┆ --- ┆ --- │
|
4511
|
+
# # │ u32 ┆ u32 ┆ u32 │
|
4512
|
+
# # ╞═════╪═════╪═════╡
|
4513
|
+
# # │ 4 ┆ 3 ┆ 0 │
|
4514
|
+
# # └─────┴─────┴─────┘
|
4515
|
+
def count
  # Ask the native lazy frame for the per-column counts, then re-wrap it.
  counted = _ldf.count
  _from_rbldf(counted)
end
|
3577
4518
|
|
3578
4519
|
private
|
3579
4520
|
|
@@ -3585,5 +4526,64 @@ module Polars
|
|
3585
4526
|
def _from_rbldf(rb_ldf)
  # Delegate to the class-level `_from_rbldf` of the receiver's own class.
  klass = self.class
  klass._from_rbldf(rb_ldf)
end
|
4529
|
+
|
4530
|
+
# Combine `predicates` and `constraints` into a single expression and apply it
# through the native `filter` (or `remove` when `invert` is set).
#
# @param predicates [Array]
#   Expressions, sequences of expressions, names of existing columns,
#   or literal `true`/`false` values.
# @param constraints [Hash]
#   Column name => value pairs; each pair becomes an equality predicate.
# @param invert [Boolean]
#   When truthy, use the native `remove` instead of `filter`.
#
# @return [LazyFrame]
#
# @raise [TypeError]
#   If a predicate has an unsupported type, or if no predicate or constraint
#   was provided.
def _filter(
  predicates:,
  constraints:,
  invert: false
)
  all_predicates = []
  boolean_masks = []

  predicates.each do |p|
    # quick exit/skip conditions for literal booleans
    if p.is_a?(TrueClass) || p.is_a?(FalseClass)
      # `true` with invert, or `false` without it, discards all rows;
      # the other two combinations don't filter/remove anything
      discards_all_rows = invert ? p : !p
      return clear if discards_all_rows

      next
    end

    # note: identify masks separately from predicates
    if Utils.is_bool_sequence(p, include_series: true)
      boolean_masks << Polars::Series.new(p, dtype: Boolean)
      next
    end

    # a sequence must contain only expressions; a scalar must be an
    # expression or the name of an existing column
    invalid =
      if Utils.is_sequence(p)
        p.any? { |x| !x.is_a?(Expr) }
      else
        !p.is_a?(Expr) && !(p.is_a?(::String) && collect_schema.include?(p))
      end
    if invalid
      err = p.is_a?(Series) ? "Series(…, dtype: #{p.dtype})" : p.inspect
      msg = "invalid predicate for `filter`: #{err}"
      raise TypeError, msg
    end

    all_predicates.concat(
      Utils.parse_into_list_of_expressions(p).map { |x| Utils.wrap_expr(x) }
    )
  end

  # unpack equality constraints from kwargs
  all_predicates.concat(
    constraints.map { |name, value| F.col(name).eq(value) }
  )
  if all_predicates.empty? && boolean_masks.empty?
    msg = "at least one predicate or constraint must be provided"
    raise TypeError, msg
  end

  # if multiple predicates, combine as 'horizontal' expression
  # (an empty list yields nil here, since `[][0]` is nil)
  combined_predicate =
    if all_predicates.length > 1
      F.all_horizontal(*all_predicates)
    else
      all_predicates[0]
    end

  # boolean masks are collected above but applying them is not implemented yet
  if boolean_masks.any?
    raise Todo
  end

  if combined_predicate.nil?
    return _from_rbldf(_ldf)
  end

  filter_method = invert ? _ldf.method(:remove) : _ldf.method(:filter)
  _from_rbldf(filter_method.(combined_predicate._rbexpr))
end
|
3588
4588
|
end
|
3589
4589
|
end
|