polars-df 0.20.0-x86_64-darwin → 0.21.1-x86_64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/Cargo.lock +192 -186
- data/LICENSE-THIRD-PARTY.txt +1431 -1810
- data/LICENSE.txt +1 -1
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/3.3/polars.bundle +0 -0
- data/lib/polars/3.4/polars.bundle +0 -0
- data/lib/polars/array_expr.rb +382 -3
- data/lib/polars/array_name_space.rb +281 -0
- data/lib/polars/binary_expr.rb +67 -0
- data/lib/polars/binary_name_space.rb +43 -0
- data/lib/polars/cat_expr.rb +224 -0
- data/lib/polars/cat_name_space.rb +130 -32
- data/lib/polars/catalog/unity/catalog_info.rb +20 -0
- data/lib/polars/catalog/unity/column_info.rb +31 -0
- data/lib/polars/catalog/unity/namespace_info.rb +21 -0
- data/lib/polars/catalog/unity/table_info.rb +50 -0
- data/lib/polars/catalog.rb +448 -0
- data/lib/polars/config.rb +2 -2
- data/lib/polars/convert.rb +12 -2
- data/lib/polars/data_frame.rb +834 -48
- data/lib/polars/data_type_expr.rb +52 -0
- data/lib/polars/data_types.rb +61 -5
- data/lib/polars/date_time_expr.rb +251 -0
- data/lib/polars/date_time_name_space.rb +299 -0
- data/lib/polars/exceptions.rb +7 -2
- data/lib/polars/expr.rb +1247 -211
- data/lib/polars/functions/col.rb +6 -5
- data/lib/polars/functions/datatype.rb +21 -0
- data/lib/polars/functions/lazy.rb +127 -15
- data/lib/polars/functions/repeat.rb +4 -0
- data/lib/polars/io/csv.rb +19 -1
- data/lib/polars/io/json.rb +16 -0
- data/lib/polars/io/ndjson.rb +13 -0
- data/lib/polars/io/parquet.rb +70 -66
- data/lib/polars/io/scan_options.rb +47 -0
- data/lib/polars/lazy_frame.rb +1099 -95
- data/lib/polars/list_expr.rb +400 -11
- data/lib/polars/list_name_space.rb +321 -5
- data/lib/polars/meta_expr.rb +71 -22
- data/lib/polars/name_expr.rb +36 -0
- data/lib/polars/scan_cast_options.rb +64 -0
- data/lib/polars/schema.rb +84 -3
- data/lib/polars/selector.rb +210 -0
- data/lib/polars/selectors.rb +932 -203
- data/lib/polars/series.rb +1083 -63
- data/lib/polars/string_expr.rb +435 -9
- data/lib/polars/string_name_space.rb +729 -45
- data/lib/polars/struct_expr.rb +103 -0
- data/lib/polars/struct_name_space.rb +19 -1
- data/lib/polars/utils/parse.rb +40 -0
- data/lib/polars/utils/various.rb +18 -1
- data/lib/polars/utils.rb +9 -1
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +10 -0
- metadata +12 -2
data/lib/polars/lazy_frame.rb
CHANGED
@@ -234,10 +234,18 @@ module Polars
|
|
234
234
|
#
|
235
235
|
# @param by [Object]
|
236
236
|
# Column (expressions) to sort by.
|
237
|
+
# @param more_by [Array]
|
238
|
+
# Additional columns to sort by, specified as positional arguments.
|
237
239
|
# @param reverse [Boolean]
|
238
240
|
# Sort in descending order.
|
239
241
|
# @param nulls_last [Boolean]
|
240
242
|
# Place null values last. Can only be used if sorted by a single column.
|
243
|
+
# @param maintain_order [Boolean]
|
244
|
+
# Whether the order should be maintained if elements are equal.
|
245
|
+
# Note that if `true` streaming is not possible and performance might be
|
246
|
+
# worse since this requires a stable sort.
|
247
|
+
# @param multithreaded [Boolean]
|
248
|
+
# Sort using multiple threads.
|
241
249
|
#
|
242
250
|
# @return [LazyFrame]
|
243
251
|
#
|
@@ -280,6 +288,201 @@ module Polars
|
|
280
288
|
)
|
281
289
|
end
|
282
290
|
|
291
|
+
# Execute a SQL query against the LazyFrame.
|
292
|
+
#
|
293
|
+
# @note
|
294
|
+
# This functionality is considered **unstable**, although it is close to
|
295
|
+
# being considered stable. It may be changed at any point without it being
|
296
|
+
# considered a breaking change.
|
297
|
+
#
|
298
|
+
# @param query [String]
|
299
|
+
# SQL query to execute.
|
300
|
+
# @param table_name [String]
|
301
|
+
# Optionally provide an explicit name for the table that represents the
|
302
|
+
# calling frame (defaults to "self").
|
303
|
+
#
|
304
|
+
# @return [LazyFrame]
|
305
|
+
#
|
306
|
+
# @note
|
307
|
+
# * The calling frame is automatically registered as a table in the SQL context
|
308
|
+
# under the name "self". If you want access to the DataFrames and LazyFrames
|
309
|
+
# found in the current globals, use the top-level `Polars.sql`.
|
310
|
+
# * More control over registration and execution behaviour is available by
|
311
|
+
# using the `SQLContext` object.
|
312
|
+
#
|
313
|
+
# @example Query the LazyFrame using SQL:
|
314
|
+
# lf1 = Polars::LazyFrame.new({"a" => [1, 2, 3], "b" => [6, 7, 8], "c" => ["z", "y", "x"]})
|
315
|
+
# lf2 = Polars::LazyFrame.new({"a" => [3, 2, 1], "d" => [125, -654, 888]})
|
316
|
+
# lf1.sql("SELECT c, b FROM self WHERE a > 1").collect
|
317
|
+
# # =>
|
318
|
+
# # shape: (2, 2)
|
319
|
+
# # ┌─────┬─────┐
|
320
|
+
# # │ c ┆ b │
|
321
|
+
# # │ --- ┆ --- │
|
322
|
+
# # │ str ┆ i64 │
|
323
|
+
# # ╞═════╪═════╡
|
324
|
+
# # │ y ┆ 7 │
|
325
|
+
# # │ x ┆ 8 │
|
326
|
+
# # └─────┴─────┘
|
327
|
+
#
|
328
|
+
# @example Apply SQL transforms (aliasing "self" to "frame") then filter natively (you can freely mix SQL and native operations):
|
329
|
+
# lf1.sql(
|
330
|
+
# "
|
331
|
+
# SELECT
|
332
|
+
# a,
|
333
|
+
# (a % 2 == 0) AS a_is_even,
|
334
|
+
# (b::float4 / 2) AS \"b/2\",
|
335
|
+
# CONCAT_WS(':', c, c, c) AS c_c_c
|
336
|
+
# FROM frame
|
337
|
+
# ORDER BY a
|
338
|
+
# ",
|
339
|
+
# table_name: "frame",
|
340
|
+
# ).filter(~Polars.col("c_c_c").str.starts_with("x")).collect
|
341
|
+
# # =>
|
342
|
+
# # shape: (2, 4)
|
343
|
+
# # ┌─────┬───────────┬─────┬───────┐
|
344
|
+
# # │ a ┆ a_is_even ┆ b/2 ┆ c_c_c │
|
345
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
346
|
+
# # │ i64 ┆ bool ┆ f32 ┆ str │
|
347
|
+
# # ╞═════╪═══════════╪═════╪═══════╡
|
348
|
+
# # │ 1 ┆ false ┆ 3.0 ┆ z:z:z │
|
349
|
+
# # │ 2 ┆ true ┆ 3.5 ┆ y:y:y │
|
350
|
+
# # └─────┴───────────┴─────┴───────┘
|
351
|
+
def sql(query, table_name: "self")
|
352
|
+
ctx = Polars::SQLContext.new
|
353
|
+
name = table_name || "self"
|
354
|
+
ctx.register(name, self)
|
355
|
+
ctx.execute(query)
|
356
|
+
end
|
357
|
+
|
358
|
+
# Return the `k` largest rows.
|
359
|
+
#
|
360
|
+
# Non-null elements are always preferred over null elements, regardless of
|
361
|
+
# the value of `reverse`. The output is not guaranteed to be in any
|
362
|
+
# particular order; call `sort` after this function if you wish the
|
363
|
+
# output to be sorted.
|
364
|
+
#
|
365
|
+
# @param k [Integer]
|
366
|
+
# Number of rows to return.
|
367
|
+
# @param by [Object]
|
368
|
+
# Column(s) used to determine the top rows.
|
369
|
+
# Accepts expression input. Strings are parsed as column names.
|
370
|
+
# @param reverse [Object]
|
371
|
+
# Consider the `k` smallest elements of the `by` column(s) (instead of the `k`
|
372
|
+
# largest). This can be specified per column by passing a sequence of
|
373
|
+
# booleans.
|
374
|
+
#
|
375
|
+
# @return [LazyFrame]
|
376
|
+
#
|
377
|
+
# @example Get the rows which contain the 4 largest values in column b.
|
378
|
+
# lf = Polars::LazyFrame.new(
|
379
|
+
# {
|
380
|
+
# "a" => ["a", "b", "a", "b", "b", "c"],
|
381
|
+
# "b" => [2, 1, 1, 3, 2, 1]
|
382
|
+
# }
|
383
|
+
# )
|
384
|
+
# lf.top_k(4, by: "b").collect
|
385
|
+
# # =>
|
386
|
+
# # shape: (4, 2)
|
387
|
+
# # ┌─────┬─────┐
|
388
|
+
# # │ a ┆ b │
|
389
|
+
# # │ --- ┆ --- │
|
390
|
+
# # │ str ┆ i64 │
|
391
|
+
# # ╞═════╪═════╡
|
392
|
+
# # │ b ┆ 3 │
|
393
|
+
# # │ a ┆ 2 │
|
394
|
+
# # │ b ┆ 2 │
|
395
|
+
# # │ b ┆ 1 │
|
396
|
+
# # └─────┴─────┘
|
397
|
+
#
|
398
|
+
# @example Get the rows which contain the 4 largest values when sorting on column b and a.
|
399
|
+
# lf.top_k(4, by: ["b", "a"]).collect
|
400
|
+
# # =>
|
401
|
+
# # shape: (4, 2)
|
402
|
+
# # ┌─────┬─────┐
|
403
|
+
# # │ a ┆ b │
|
404
|
+
# # │ --- ┆ --- │
|
405
|
+
# # │ str ┆ i64 │
|
406
|
+
# # ╞═════╪═════╡
|
407
|
+
# # │ b ┆ 3 │
|
408
|
+
# # │ b ┆ 2 │
|
409
|
+
# # │ a ┆ 2 │
|
410
|
+
# # │ c ┆ 1 │
|
411
|
+
# # └─────┴─────┘
|
412
|
+
def top_k(
|
413
|
+
k,
|
414
|
+
by:,
|
415
|
+
reverse: false
|
416
|
+
)
|
417
|
+
by = Utils.parse_into_list_of_expressions(by)
|
418
|
+
reverse = Utils.extend_bool(reverse, by.length, "reverse", "by")
|
419
|
+
_from_rbldf(_ldf.top_k(k, by, reverse))
|
420
|
+
end
|
421
|
+
|
422
|
+
# Return the `k` smallest rows.
|
423
|
+
#
|
424
|
+
# Non-null elements are always preferred over null elements, regardless of
|
425
|
+
# the value of `reverse`. The output is not guaranteed to be in any
|
426
|
+
# particular order; call `sort` after this function if you wish the
|
427
|
+
# output to be sorted.
|
428
|
+
#
|
429
|
+
# @param k [Integer]
|
430
|
+
# Number of rows to return.
|
431
|
+
# @param by [Object]
|
432
|
+
# Column(s) used to determine the bottom rows.
|
433
|
+
# Accepts expression input. Strings are parsed as column names.
|
434
|
+
# @param reverse [Object]
|
435
|
+
# Consider the `k` largest elements of the `by` column(s) (instead of the `k`
|
436
|
+
# smallest). This can be specified per column by passing a sequence of
|
437
|
+
# booleans.
|
438
|
+
#
|
439
|
+
# @return [LazyFrame]
|
440
|
+
#
|
441
|
+
# @example Get the rows which contain the 4 smallest values in column b.
|
442
|
+
# lf = Polars::LazyFrame.new(
|
443
|
+
# {
|
444
|
+
# "a" => ["a", "b", "a", "b", "b", "c"],
|
445
|
+
# "b" => [2, 1, 1, 3, 2, 1]
|
446
|
+
# }
|
447
|
+
# )
|
448
|
+
# lf.bottom_k(4, by: "b").collect
|
449
|
+
# # =>
|
450
|
+
# # shape: (4, 2)
|
451
|
+
# # ┌─────┬─────┐
|
452
|
+
# # │ a ┆ b │
|
453
|
+
# # │ --- ┆ --- │
|
454
|
+
# # │ str ┆ i64 │
|
455
|
+
# # ╞═════╪═════╡
|
456
|
+
# # │ b ┆ 1 │
|
457
|
+
# # │ a ┆ 1 │
|
458
|
+
# # │ c ┆ 1 │
|
459
|
+
# # │ a ┆ 2 │
|
460
|
+
# # └─────┴─────┘
|
461
|
+
#
|
462
|
+
# @example Get the rows which contain the 4 smallest values when sorting on column a and b.
|
463
|
+
# lf.bottom_k(4, by: ["a", "b"]).collect
|
464
|
+
# # =>
|
465
|
+
# # shape: (4, 2)
|
466
|
+
# # ┌─────┬─────┐
|
467
|
+
# # │ a ┆ b │
|
468
|
+
# # │ --- ┆ --- │
|
469
|
+
# # │ str ┆ i64 │
|
470
|
+
# # ╞═════╪═════╡
|
471
|
+
# # │ a ┆ 1 │
|
472
|
+
# # │ a ┆ 2 │
|
473
|
+
# # │ b ┆ 1 │
|
474
|
+
# # │ b ┆ 2 │
|
475
|
+
# # └─────┴─────┘
|
476
|
+
def bottom_k(
|
477
|
+
k,
|
478
|
+
by:,
|
479
|
+
reverse: false
|
480
|
+
)
|
481
|
+
by = Utils.parse_into_list_of_expressions(by)
|
482
|
+
reverse = Utils.extend_bool(reverse, by.length, "reverse", "by")
|
483
|
+
_from_rbldf(_ldf.bottom_k(k, by, reverse))
|
484
|
+
end
|
485
|
+
|
283
486
|
# def profile
|
284
487
|
# end
|
285
488
|
|
@@ -305,6 +508,8 @@ module Polars
|
|
305
508
|
# Slice pushdown optimization.
|
306
509
|
# @param common_subplan_elimination [Boolean]
|
307
510
|
# Will try to cache branching subplans that occur on self-joins or unions.
|
511
|
+
# @param comm_subexpr_elim [Boolean]
|
512
|
+
# Common subexpressions will be cached and reused.
|
308
513
|
# @param allow_streaming [Boolean]
|
309
514
|
# Run parts of the query in a streaming fashion (this is in an alpha state)
|
310
515
|
#
|
@@ -369,6 +574,41 @@ module Polars
|
|
369
574
|
Utils.wrap_df(ldf.collect)
|
370
575
|
end
|
371
576
|
|
577
|
+
# Resolve the schema of this LazyFrame.
|
578
|
+
#
|
579
|
+
# @return [Schema]
|
580
|
+
#
|
581
|
+
# @example Determine the schema.
|
582
|
+
# lf = Polars::LazyFrame.new(
|
583
|
+
# {
|
584
|
+
# "foo" => [1, 2, 3],
|
585
|
+
# "bar" => [6.0, 7.0, 8.0],
|
586
|
+
# "ham" => ["a", "b", "c"]
|
587
|
+
# }
|
588
|
+
# )
|
589
|
+
# lf.collect_schema
|
590
|
+
# # => Polars::Schema({"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String})
|
591
|
+
#
|
592
|
+
# @example Access various properties of the schema.
|
593
|
+
# schema = lf.collect_schema
|
594
|
+
# schema["bar"]
|
595
|
+
# # => Polars::Float64
|
596
|
+
#
|
597
|
+
# @example
|
598
|
+
# schema.names
|
599
|
+
# # => ["foo", "bar", "ham"]
|
600
|
+
#
|
601
|
+
# @example
|
602
|
+
# schema.dtypes
|
603
|
+
# # => [Polars::Int64, Polars::Float64, Polars::String]
|
604
|
+
#
|
605
|
+
# @example
|
606
|
+
# schema.length
|
607
|
+
# # => 3
|
608
|
+
def collect_schema
|
609
|
+
Schema.new(_ldf.collect_schema, check_dtypes: false)
|
610
|
+
end
|
611
|
+
|
372
612
|
# Persists a LazyFrame at the provided path.
|
373
613
|
#
|
374
614
|
# This allows streaming results that are larger than RAM to be written to disk.
|
@@ -412,6 +652,31 @@ module Polars
|
|
412
652
|
# Turn off (certain) optimizations.
|
413
653
|
# @param slice_pushdown [Boolean]
|
414
654
|
# Slice pushdown optimization.
|
655
|
+
# @param storage_options [Hash]
|
656
|
+
# Options that indicate how to connect to a cloud provider.
|
657
|
+
#
|
658
|
+
# The cloud providers currently supported are AWS, GCP, and Azure.
|
659
|
+
# See supported keys here:
|
660
|
+
#
|
661
|
+
# * [aws](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html)
|
662
|
+
# * [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html)
|
663
|
+
# * [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html)
|
664
|
+
# * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
|
665
|
+
#
|
666
|
+
# If `storage_options` is not provided, Polars will try to infer the
|
667
|
+
# information from environment variables.
|
668
|
+
# @param retries [Integer]
|
669
|
+
# Number of retries if accessing a cloud instance fails.
|
670
|
+
# @param sync_on_close ['data', 'all']
|
671
|
+
# Sync to disk before closing a file.
|
672
|
+
#
|
673
|
+
# * `nil` does not sync.
|
674
|
+
# * `data` syncs the file contents.
|
675
|
+
# * `all` syncs the file contents and metadata.
|
676
|
+
# @param mkdir [Boolean]
|
677
|
+
# Recursively create all the directories in the path.
|
678
|
+
# @param lazy [Boolean]
|
679
|
+
# Wait to start execution until `collect` is called.
|
415
680
|
#
|
416
681
|
# @return [DataFrame]
|
417
682
|
#
|
@@ -521,6 +786,16 @@ module Polars
|
|
521
786
|
# Slice pushdown optimization.
|
522
787
|
# @param no_optimization [Boolean]
|
523
788
|
# Turn off (certain) optimizations.
|
789
|
+
# @param sync_on_close ['data', 'all']
|
790
|
+
# Sync to disk before closing a file.
|
791
|
+
#
|
792
|
+
# * `nil` does not sync.
|
793
|
+
# * `data` syncs the file contents.
|
794
|
+
# * `all` syncs the file contents and metadata.
|
795
|
+
# @param mkdir [Boolean]
|
796
|
+
# Recursively create all the directories in the path.
|
797
|
+
# @param lazy [Boolean]
|
798
|
+
# Wait to start execution until `collect` is called.
|
524
799
|
#
|
525
800
|
# @return [DataFrame]
|
526
801
|
#
|
@@ -614,9 +889,15 @@ module Polars
|
|
614
889
|
# A format string, with the specifiers defined by the
|
615
890
|
# `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
|
616
891
|
# Rust crate.
|
892
|
+
# @param float_scientific [Boolean]
|
893
|
+
# Whether to use scientific form always (true), never (false), or
|
894
|
+
# automatically (nil) for `Float32` and `Float64` datatypes.
|
617
895
|
# @param float_precision [Integer]
|
618
896
|
# Number of decimal places to write, applied to both `Float32` and
|
619
897
|
# `Float64` datatypes.
|
898
|
+
# @param decimal_comma [Boolean]
|
899
|
+
# Use a comma as the decimal separator instead of a point. Floats will be
|
900
|
+
# encapsulated in quotes if necessary; set the field separator to override.
|
620
901
|
# @param null_value [String]
|
621
902
|
# A string representing null values (defaulting to the empty string).
|
622
903
|
# @param quote_style ["necessary", "always", "non_numeric", "never"]
|
@@ -655,6 +936,16 @@ module Polars
|
|
655
936
|
# Options that indicate how to connect to a cloud provider.
|
656
937
|
# @param retries [Integer]
|
657
938
|
# Number of retries if accessing a cloud instance fails.
|
939
|
+
# @param sync_on_close ['data', 'all']
|
940
|
+
# Sync to disk before closing a file.
|
941
|
+
#
|
942
|
+
# * `nil` does not sync.
|
943
|
+
# * `data` syncs the file contents.
|
944
|
+
# * `all` syncs the file contents and metadata.
|
945
|
+
# @param mkdir [Boolean]
|
946
|
+
# Recursively create all the directories in the path.
|
947
|
+
# @param lazy [Boolean]
|
948
|
+
# Wait to start execution until `collect` is called.
|
658
949
|
#
|
659
950
|
# @return [DataFrame]
|
660
951
|
#
|
@@ -674,6 +965,7 @@ module Polars
|
|
674
965
|
time_format: nil,
|
675
966
|
float_scientific: nil,
|
676
967
|
float_precision: nil,
|
968
|
+
decimal_comma: false,
|
677
969
|
null_value: nil,
|
678
970
|
quote_style: nil,
|
679
971
|
maintain_order: true,
|
@@ -726,6 +1018,7 @@ module Polars
|
|
726
1018
|
time_format,
|
727
1019
|
float_scientific,
|
728
1020
|
float_precision,
|
1021
|
+
decimal_comma,
|
729
1022
|
null_value,
|
730
1023
|
quote_style,
|
731
1024
|
storage_options,
|
@@ -762,6 +1055,31 @@ module Polars
|
|
762
1055
|
# Slice pushdown optimization.
|
763
1056
|
# @param no_optimization [Boolean]
|
764
1057
|
# Turn off (certain) optimizations.
|
1058
|
+
# @param storage_options [Hash]
|
1059
|
+
# Options that indicate how to connect to a cloud provider.
|
1060
|
+
#
|
1061
|
+
# The cloud providers currently supported are AWS, GCP, and Azure.
|
1062
|
+
# See supported keys here:
|
1063
|
+
#
|
1064
|
+
# * [aws](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html)
|
1065
|
+
# * [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html)
|
1066
|
+
# * [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html)
|
1067
|
+
# * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
|
1068
|
+
#
|
1069
|
+
# If `storage_options` is not provided, Polars will try to infer the
|
1070
|
+
# information from environment variables.
|
1071
|
+
# @param retries [Integer]
|
1072
|
+
# Number of retries if accessing a cloud instance fails.
|
1073
|
+
# @param sync_on_close ['data', 'all']
|
1074
|
+
# Sync to disk before closing a file.
|
1075
|
+
#
|
1076
|
+
# * `nil` does not sync.
|
1077
|
+
# * `data` syncs the file contents.
|
1078
|
+
# * `all` syncs the file contents and metadata.
|
1079
|
+
# @param mkdir [Boolean]
|
1080
|
+
# Recursively create all the directories in the path.
|
1081
|
+
# @param lazy [Boolean]
|
1082
|
+
# Wait to start execution until `collect` is called.
|
765
1083
|
#
|
766
1084
|
# @return [DataFrame]
|
767
1085
|
#
|
@@ -854,25 +1172,6 @@ module Polars
|
|
854
1172
|
#
|
855
1173
|
# @param n_rows [Integer]
|
856
1174
|
# Collect n_rows from the data sources.
|
857
|
-
# @param type_coercion [Boolean]
|
858
|
-
# Run type coercion optimization.
|
859
|
-
# @param predicate_pushdown [Boolean]
|
860
|
-
# Run predicate pushdown optimization.
|
861
|
-
# @param projection_pushdown [Boolean]
|
862
|
-
# Run projection pushdown optimization.
|
863
|
-
# @param simplify_expression [Boolean]
|
864
|
-
# Run simplify expressions optimization.
|
865
|
-
# @param string_cache [Boolean]
|
866
|
-
# This argument is deprecated. Please set the string cache globally.
|
867
|
-
# The argument will be ignored
|
868
|
-
# @param no_optimization [Boolean]
|
869
|
-
# Turn off optimizations.
|
870
|
-
# @param slice_pushdown [Boolean]
|
871
|
-
# Slice pushdown optimization
|
872
|
-
# @param common_subplan_elimination [Boolean]
|
873
|
-
# Will try to cache branching subplans that occur on self-joins or unions.
|
874
|
-
# @param allow_streaming [Boolean]
|
875
|
-
# Run parts of the query in a streaming fashion (this is in an alpha state)
|
876
1175
|
#
|
877
1176
|
# @return [DataFrame]
|
878
1177
|
#
|
@@ -892,41 +1191,11 @@ module Polars
|
|
892
1191
|
# # │ --- ┆ --- ┆ --- │
|
893
1192
|
# # │ str ┆ i64 ┆ i64 │
|
894
1193
|
# # ╞═════╪═════╪═════╡
|
895
|
-
# # │ a ┆
|
896
|
-
# # │ b ┆
|
1194
|
+
# # │ a ┆ 4 ┆ 10 │
|
1195
|
+
# # │ b ┆ 11 ┆ 10 │
|
897
1196
|
# # └─────┴─────┴─────┘
|
898
|
-
def fetch(
|
899
|
-
n_rows
|
900
|
-
type_coercion: true,
|
901
|
-
predicate_pushdown: true,
|
902
|
-
projection_pushdown: true,
|
903
|
-
simplify_expression: true,
|
904
|
-
string_cache: false,
|
905
|
-
no_optimization: false,
|
906
|
-
slice_pushdown: true,
|
907
|
-
common_subplan_elimination: true,
|
908
|
-
comm_subexpr_elim: true,
|
909
|
-
allow_streaming: false
|
910
|
-
)
|
911
|
-
if no_optimization
|
912
|
-
predicate_pushdown = false
|
913
|
-
projection_pushdown = false
|
914
|
-
slice_pushdown = false
|
915
|
-
common_subplan_elimination = false
|
916
|
-
end
|
917
|
-
|
918
|
-
ldf = _ldf.optimization_toggle(
|
919
|
-
type_coercion,
|
920
|
-
predicate_pushdown,
|
921
|
-
projection_pushdown,
|
922
|
-
simplify_expression,
|
923
|
-
slice_pushdown,
|
924
|
-
common_subplan_elimination,
|
925
|
-
comm_subexpr_elim,
|
926
|
-
allow_streaming,
|
927
|
-
false
|
928
|
-
)
|
929
|
-
Utils.wrap_df(ldf.fetch(n_rows))
|
1197
|
+
def fetch(n_rows = 500, **kwargs)
|
1198
|
+
head(n_rows).collect(**kwargs)
|
930
1199
|
end
|
931
1200
|
|
932
1201
|
# Return lazy representation, i.e. itself.
|
@@ -1058,7 +1327,7 @@ module Polars
|
|
1058
1327
|
# # │ null ┆ null ┆ null │
|
1059
1328
|
# # └──────┴──────┴──────┘
|
1060
1329
|
def clear(n = 0)
|
1061
|
-
DataFrame.new(
|
1330
|
+
DataFrame.new(schema: schema).clear(n).lazy
|
1062
1331
|
end
|
1063
1332
|
alias_method :cleared, :clear
|
1064
1333
|
|
@@ -1108,6 +1377,140 @@ module Polars
|
|
1108
1377
|
)
|
1109
1378
|
end
|
1110
1379
|
|
1380
|
+
# Remove rows, dropping those that match the given predicate expression(s).
|
1381
|
+
#
|
1382
|
+
# The original order of the remaining rows is preserved.
|
1383
|
+
#
|
1384
|
+
# Rows where the filter predicate does not evaluate to true are retained
|
1385
|
+
# (this includes rows where the predicate evaluates as `null`).
|
1386
|
+
#
|
1387
|
+
# @param predicates [Array]
|
1388
|
+
# Expression that evaluates to a boolean Series.
|
1389
|
+
# @param constraints [Hash]
|
1390
|
+
# Column filters; use `name: value` to filter columns using the supplied
|
1391
|
+
# value. Each constraint behaves the same as `Polars.col(name).eq(value)`,
|
1392
|
+
# and is implicitly joined with the other filter conditions using `&`.
|
1393
|
+
#
|
1394
|
+
# @return [LazyFrame]
|
1395
|
+
#
|
1396
|
+
# @example Remove rows matching a condition:
|
1397
|
+
# lf = Polars::LazyFrame.new(
|
1398
|
+
# {
|
1399
|
+
# "foo" => [2, 3, nil, 4, 0],
|
1400
|
+
# "bar" => [5, 6, nil, nil, 0],
|
1401
|
+
# "ham" => ["a", "b", nil, "c", "d"]
|
1402
|
+
# }
|
1403
|
+
# )
|
1404
|
+
# lf.remove(
|
1405
|
+
# Polars.col("bar") >= 5
|
1406
|
+
# ).collect
|
1407
|
+
# # =>
|
1408
|
+
# # shape: (3, 3)
|
1409
|
+
# # ┌──────┬──────┬──────┐
|
1410
|
+
# # │ foo ┆ bar ┆ ham │
|
1411
|
+
# # │ --- ┆ --- ┆ --- │
|
1412
|
+
# # │ i64 ┆ i64 ┆ str │
|
1413
|
+
# # ╞══════╪══════╪══════╡
|
1414
|
+
# # │ null ┆ null ┆ null │
|
1415
|
+
# # │ 4 ┆ null ┆ c │
|
1416
|
+
# # │ 0 ┆ 0 ┆ d │
|
1417
|
+
# # └──────┴──────┴──────┘
|
1418
|
+
#
|
1419
|
+
# @example Discard rows based on multiple conditions, combined with and/or operators:
|
1420
|
+
# lf.remove(
|
1421
|
+
# (Polars.col("foo") >= 0) & (Polars.col("bar") >= 0)
|
1422
|
+
# ).collect
|
1423
|
+
# # =>
|
1424
|
+
# # shape: (2, 3)
|
1425
|
+
# # ┌──────┬──────┬──────┐
|
1426
|
+
# # │ foo ┆ bar ┆ ham │
|
1427
|
+
# # │ --- ┆ --- ┆ --- │
|
1428
|
+
# # │ i64 ┆ i64 ┆ str │
|
1429
|
+
# # ╞══════╪══════╪══════╡
|
1430
|
+
# # │ null ┆ null ┆ null │
|
1431
|
+
# # │ 4 ┆ null ┆ c │
|
1432
|
+
# # └──────┴──────┴──────┘
|
1433
|
+
#
|
1434
|
+
# @example
|
1435
|
+
# lf.remove(
|
1436
|
+
# (Polars.col("foo") >= 0) | (Polars.col("bar") >= 0)
|
1437
|
+
# ).collect
|
1438
|
+
# # =>
|
1439
|
+
# # shape: (1, 3)
|
1440
|
+
# # ┌──────┬──────┬──────┐
|
1441
|
+
# # │ foo ┆ bar ┆ ham │
|
1442
|
+
# # │ --- ┆ --- ┆ --- │
|
1443
|
+
# # │ i64 ┆ i64 ┆ str │
|
1444
|
+
# # ╞══════╪══════╪══════╡
|
1445
|
+
# # │ null ┆ null ┆ null │
|
1446
|
+
# # └──────┴──────┴──────┘
|
1447
|
+
#
|
1448
|
+
# @example Provide multiple constraints using `*args` syntax:
|
1449
|
+
# lf.remove(
|
1450
|
+
# Polars.col("ham").is_not_null,
|
1451
|
+
# Polars.col("bar") >= 0
|
1452
|
+
# ).collect
|
1453
|
+
# # =>
|
1454
|
+
# # shape: (2, 3)
|
1455
|
+
# # ┌──────┬──────┬──────┐
|
1456
|
+
# # │ foo ┆ bar ┆ ham │
|
1457
|
+
# # │ --- ┆ --- ┆ --- │
|
1458
|
+
# # │ i64 ┆ i64 ┆ str │
|
1459
|
+
# # ╞══════╪══════╪══════╡
|
1460
|
+
# # │ null ┆ null ┆ null │
|
1461
|
+
# # │ 4 ┆ null ┆ c │
|
1462
|
+
# # └──────┴──────┴──────┘
|
1463
|
+
#
|
1464
|
+
# @example Provide constraint(s) using `**kwargs` syntax:
|
1465
|
+
# lf.remove(foo: 0, bar: 0).collect
|
1466
|
+
# # =>
|
1467
|
+
# # shape: (4, 3)
|
1468
|
+
# # ┌──────┬──────┬──────┐
|
1469
|
+
# # │ foo ┆ bar ┆ ham │
|
1470
|
+
# # │ --- ┆ --- ┆ --- │
|
1471
|
+
# # │ i64 ┆ i64 ┆ str │
|
1472
|
+
# # ╞══════╪══════╪══════╡
|
1473
|
+
# # │ 2 ┆ 5 ┆ a │
|
1474
|
+
# # │ 3 ┆ 6 ┆ b │
|
1475
|
+
# # │ null ┆ null ┆ null │
|
1476
|
+
# # │ 4 ┆ null ┆ c │
|
1477
|
+
# # └──────┴──────┴──────┘
|
1478
|
+
#
|
1479
|
+
# @example Remove rows by comparing two columns against each other; in this case, we remove rows where the two columns are not equal (using `ne_missing` to ensure that null values compare equal):
|
1480
|
+
# lf.remove(
|
1481
|
+
# Polars.col("foo").ne_missing(Polars.col("bar"))
|
1482
|
+
# ).collect
|
1483
|
+
# # =>
|
1484
|
+
# # shape: (2, 3)
|
1485
|
+
# # ┌──────┬──────┬──────┐
|
1486
|
+
# # │ foo ┆ bar ┆ ham │
|
1487
|
+
# # │ --- ┆ --- ┆ --- │
|
1488
|
+
# # │ i64 ┆ i64 ┆ str │
|
1489
|
+
# # ╞══════╪══════╪══════╡
|
1490
|
+
# # │ null ┆ null ┆ null │
|
1491
|
+
# # │ 0 ┆ 0 ┆ d │
|
1492
|
+
# # └──────┴──────┴──────┘
|
1493
|
+
def remove(
|
1494
|
+
*predicates,
|
1495
|
+
**constraints
|
1496
|
+
)
|
1497
|
+
if constraints.empty?
|
1498
|
+
# early-exit conditions (exclude/include all rows)
|
1499
|
+
if predicates.empty? || (predicates.length == 1 && predicates[0].is_a?(TrueClass))
|
1500
|
+
return clear
|
1501
|
+
end
|
1502
|
+
if predicates.length == 1 && predicates[0].is_a?(FalseClass)
|
1503
|
+
return dup
|
1504
|
+
end
|
1505
|
+
end
|
1506
|
+
|
1507
|
+
_filter(
|
1508
|
+
predicates: predicates,
|
1509
|
+
constraints: constraints,
|
1510
|
+
invert: true
|
1511
|
+
)
|
1512
|
+
end
|
1513
|
+
|
1111
1514
|
# Select columns from this LazyFrame.
|
1112
1515
|
#
|
1113
1516
|
# @param exprs [Array]
|
@@ -1205,6 +1608,29 @@ module Polars
|
|
1205
1608
|
_from_rbldf(_ldf.select(rbexprs))
|
1206
1609
|
end
|
1207
1610
|
|
1611
|
+
# Select columns from this LazyFrame.
|
1612
|
+
#
|
1613
|
+
# This will run all expressions sequentially instead of in parallel.
|
1614
|
+
# Use this when the work per expression is cheap.
|
1615
|
+
#
|
1616
|
+
# @param exprs [Array]
|
1617
|
+
# Column(s) to select, specified as positional arguments.
|
1618
|
+
# Accepts expression input. Strings are parsed as column names,
|
1619
|
+
# other non-expression inputs are parsed as literals.
|
1620
|
+
# @param named_exprs [Hash]
|
1621
|
+
# Additional columns to select, specified as keyword arguments.
|
1622
|
+
# The columns will be renamed to the keyword used.
|
1623
|
+
#
|
1624
|
+
# @return [LazyFrame]
|
1625
|
+
def select_seq(*exprs, **named_exprs)
|
1626
|
+
structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", 0).to_i != 0
|
1627
|
+
|
1628
|
+
rbexprs = Utils.parse_into_list_of_expressions(
|
1629
|
+
*exprs, **named_exprs, __structify: structify
|
1630
|
+
)
|
1631
|
+
_from_rbldf(_ldf.select_seq(rbexprs))
|
1632
|
+
end
|
1633
|
+
|
1208
1634
|
# Start a group by operation.
|
1209
1635
|
#
|
1210
1636
|
# @param by [Array]
|
@@ -1401,9 +1827,9 @@ module Polars
|
|
1401
1827
|
# @param every [Object]
|
1402
1828
|
# Interval of the window.
|
1403
1829
|
# @param period [Object]
|
1404
|
-
# Length of the window, if
|
1830
|
+
# Length of the window; if nil, it is equal to 'every'.
|
1405
1831
|
# @param offset [Object]
|
1406
|
-
# Offset of the window if
|
1832
|
+
# Offset of the window; if nil and period is nil, it will be equal to negative
|
1407
1833
|
# `every`.
|
1408
1834
|
# @param truncate [Boolean]
|
1409
1835
|
# Truncate the time value to the window lower bound.
|
@@ -1413,8 +1839,32 @@ module Polars
|
|
1413
1839
|
# parallelize
|
1414
1840
|
# @param closed ["right", "left", "both", "none"]
|
1415
1841
|
# Define whether the temporal window interval is closed or not.
|
1842
|
+
# @param label ['left', 'right', 'datapoint']
|
1843
|
+
# Define which label to use for the window:
|
1844
|
+
#
|
1845
|
+
# - 'left': lower boundary of the window
|
1846
|
+
# - 'right': upper boundary of the window
|
1847
|
+
# - 'datapoint': the first value of the index column in the given window.
|
1848
|
+
# If you don't need the label to be at one of the boundaries, choose this
|
1849
|
+
# option for maximum performance
|
1416
1850
|
# @param by [Object]
|
1417
1851
|
# Also group by this column/these columns
|
1852
|
+
# @param start_by ['window', 'datapoint', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
|
1853
|
+
# The strategy to determine the start of the first window by.
|
1854
|
+
#
|
1855
|
+
# * 'window': Start by taking the earliest timestamp, truncating it with
|
1856
|
+
# `every`, and then adding `offset`.
|
1857
|
+
# Note that weekly windows start on Monday.
|
1858
|
+
# * 'datapoint': Start from the first encountered data point.
|
1859
|
+
# * a day of the week (only takes effect if `every` contains `'w'`):
|
1860
|
+
#
|
1861
|
+
# * 'monday': Start the window on the Monday before the first data point.
|
1862
|
+
# * 'tuesday': Start the window on the Tuesday before the first data point.
|
1863
|
+
# * ...
|
1864
|
+
# * 'sunday': Start the window on the Sunday before the first data point.
|
1865
|
+
#
|
1866
|
+
# The resulting window is then shifted back until the earliest datapoint
|
1867
|
+
# is in or in front of it.
|
1418
1868
|
#
|
1419
1869
|
# @return [DataFrame]
|
1420
1870
|
#
|
@@ -1651,13 +2101,13 @@ module Polars
|
|
1651
2101
|
# Join column of the right DataFrame.
|
1652
2102
|
# @param on [String]
|
1653
2103
|
# Join column of both DataFrames. If set, `left_on` and `right_on` should be
|
1654
|
-
#
|
1655
|
-
# @param by [Object]
|
1656
|
-
# Join on these columns before doing asof join.
|
2104
|
+
# nil.
|
1657
2105
|
# @param by_left [Object]
|
1658
2106
|
# Join on these columns before doing asof join.
|
1659
2107
|
# @param by_right [Object]
|
1660
2108
|
# Join on these columns before doing asof join.
|
2109
|
+
# @param by [Object]
|
2110
|
+
# Join on these columns before doing asof join.
|
1661
2111
|
# @param strategy ["backward", "forward"]
|
1662
2112
|
# Join strategy.
|
1663
2113
|
# @param suffix [String]
|
@@ -1873,7 +2323,7 @@ module Polars
|
|
1873
2323
|
# # └─────────────┴────────────┴────────────┘
|
1874
2324
|
#
|
1875
2325
|
# @example
|
1876
|
-
# pop2.join_asof(gdp2, by: "country", on: "date", strategy: "nearest").collect
|
2326
|
+
# pop2.join_asof(gdp2, by: "country", on: "date", strategy: "nearest", check_sortedness: false).collect
|
1877
2327
|
# # =>
|
1878
2328
|
# # shape: (6, 4)
|
1879
2329
|
# # ┌─────────────┬────────────┬────────────┬──────┐
|
@@ -1976,7 +2426,7 @@ module Polars
|
|
1976
2426
|
# Join column of the right DataFrame.
|
1977
2427
|
# @param on [Object]
|
1978
2428
|
# Join column of both DataFrames. If set, `left_on` and `right_on` should be
|
1979
|
-
#
|
2429
|
+
# nil.
|
1980
2430
|
# @param how ["inner", "left", "full", "semi", "anti", "cross"]
|
1981
2431
|
# Join strategy.
|
1982
2432
|
# @param suffix [String]
|
@@ -2171,10 +2621,110 @@ module Polars
|
|
2171
2621
|
)
|
2172
2622
|
end
|
2173
2623
|
|
2624
|
+
# Perform a join based on one or multiple (in)equality predicates.
|
2625
|
+
#
|
2626
|
+
# This performs an inner join, so only rows where all predicates are true
|
2627
|
+
# are included in the result, and a row from either DataFrame may be included
|
2628
|
+
# multiple times in the result.
|
2629
|
+
#
|
2630
|
+
# @note
|
2631
|
+
# The row order of the input DataFrames is not preserved.
|
2632
|
+
#
|
2633
|
+
# @note
|
2634
|
+
# This functionality is experimental. It may be
|
2635
|
+
# changed at any point without it being considered a breaking change.
|
2636
|
+
#
|
2637
|
+
# @param other [Object]
|
2638
|
+
# DataFrame to join with.
|
2639
|
+
# @param predicates [Object]
|
2640
|
+
# (In)Equality condition to join the two tables on.
|
2641
|
+
# When a column name occurs in both tables, the proper suffix must
|
2642
|
+
# be applied in the predicate.
|
2643
|
+
# @param suffix [String]
|
2644
|
+
# Suffix to append to columns with a duplicate name.
|
2645
|
+
#
|
2646
|
+
# @return [LazyFrame]
|
2647
|
+
#
|
2648
|
+
# @example Join two lazyframes together based on two predicates which get AND-ed together.
|
2649
|
+
# east = Polars::LazyFrame.new(
|
2650
|
+
# {
|
2651
|
+
# "id" => [100, 101, 102],
|
2652
|
+
# "dur" => [120, 140, 160],
|
2653
|
+
# "rev" => [12, 14, 16],
|
2654
|
+
# "cores" => [2, 8, 4]
|
2655
|
+
# }
|
2656
|
+
# )
|
2657
|
+
# west = Polars::LazyFrame.new(
|
2658
|
+
# {
|
2659
|
+
# "t_id" => [404, 498, 676, 742],
|
2660
|
+
# "time" => [90, 130, 150, 170],
|
2661
|
+
# "cost" => [9, 13, 15, 16],
|
2662
|
+
# "cores" => [4, 2, 1, 4]
|
2663
|
+
# }
|
2664
|
+
# )
|
2665
|
+
# east.join_where(
|
2666
|
+
# west,
|
2667
|
+
# Polars.col("dur") < Polars.col("time"),
|
2668
|
+
# Polars.col("rev") < Polars.col("cost")
|
2669
|
+
# ).collect
|
2670
|
+
# # =>
|
2671
|
+
# # shape: (5, 8)
|
2672
|
+
# # ┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐
|
2673
|
+
# # │ id ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │
|
2674
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
2675
|
+
# # │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
|
2676
|
+
# # ╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡
|
2677
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 498 ┆ 130 ┆ 13 ┆ 2 │
|
2678
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
|
2679
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
|
2680
|
+
# # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
|
2681
|
+
# # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
|
2682
|
+
# # └─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
|
2683
|
+
#
|
2684
|
+
# @example To OR them together, use a single expression and the `|` operator.
|
2685
|
+
# east.join_where(
|
2686
|
+
# west,
|
2687
|
+
# (Polars.col("dur") < Polars.col("time")) | (Polars.col("rev") < Polars.col("cost"))
|
2688
|
+
# ).collect
|
2689
|
+
# # =>
|
2690
|
+
# # shape: (6, 8)
|
2691
|
+
# # ┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐
|
2692
|
+
# # │ id ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │
|
2693
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
2694
|
+
# # │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
|
2695
|
+
# # ╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡
|
2696
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 498 ┆ 130 ┆ 13 ┆ 2 │
|
2697
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
|
2698
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
|
2699
|
+
# # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
|
2700
|
+
# # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
|
2701
|
+
# # │ 102 ┆ 160 ┆ 16 ┆ 4 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
|
2702
|
+
# # └─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
|
2703
|
+
def join_where(
|
2704
|
+
other,
|
2705
|
+
*predicates,
|
2706
|
+
suffix: "_right"
|
2707
|
+
)
|
2708
|
+
Utils.require_same_type(self, other)
|
2709
|
+
|
2710
|
+
rbexprs = Utils.parse_into_list_of_expressions(*predicates)
|
2711
|
+
|
2712
|
+
_from_rbldf(
|
2713
|
+
_ldf.join_where(
|
2714
|
+
other._ldf,
|
2715
|
+
rbexprs,
|
2716
|
+
suffix
|
2717
|
+
)
|
2718
|
+
)
|
2719
|
+
end
|
2720
|
+
|
2174
2721
|
# Add or overwrite multiple columns in a DataFrame.
|
2175
2722
|
#
|
2176
2723
|
# @param exprs [Object]
|
2177
2724
|
# List of Expressions that evaluate to columns.
|
2725
|
+
# @param named_exprs [Hash]
|
2726
|
+
# Additional columns to add, specified as keyword arguments.
|
2727
|
+
# The columns will be renamed to the keyword used.
|
2178
2728
|
#
|
2179
2729
|
# @return [LazyFrame]
|
2180
2730
|
#
|
@@ -2213,6 +2763,34 @@ module Polars
|
|
2213
2763
|
_from_rbldf(_ldf.with_columns(rbexprs))
|
2214
2764
|
end
|
2215
2765
|
|
2766
|
+
# Add columns to this LazyFrame.
|
2767
|
+
#
|
2768
|
+
# Added columns will replace existing columns with the same name.
|
2769
|
+
#
|
2770
|
+
# This will run all expressions sequentially instead of in parallel.
|
2771
|
+
# Use this when the work per expression is cheap.
|
2772
|
+
#
|
2773
|
+
# @param exprs [Array]
|
2774
|
+
# Column(s) to add, specified as positional arguments.
|
2775
|
+
# Accepts expression input. Strings are parsed as column names, other
|
2776
|
+
# non-expression inputs are parsed as literals.
|
2777
|
+
# @param named_exprs [Hash]
|
2778
|
+
# Additional columns to add, specified as keyword arguments.
|
2779
|
+
# The columns will be renamed to the keyword used.
|
2780
|
+
#
|
2781
|
+
# @return [LazyFrame]
|
2782
|
+
def with_columns_seq(
|
2783
|
+
*exprs,
|
2784
|
+
**named_exprs
|
2785
|
+
)
|
2786
|
+
structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", 0).to_i != 0
|
2787
|
+
|
2788
|
+
rbexprs = Utils.parse_into_list_of_expressions(
|
2789
|
+
*exprs, **named_exprs, __structify: structify
|
2790
|
+
)
|
2791
|
+
_from_rbldf(_ldf.with_columns_seq(rbexprs))
|
2792
|
+
end
|
2793
|
+
|
2216
2794
|
# Add an external context to the computation graph.
|
2217
2795
|
#
|
2218
2796
|
# This allows expressions to also access columns from DataFrames
|
@@ -2299,6 +2877,9 @@ module Polars
|
|
2299
2877
|
# @param columns [Object]
|
2300
2878
|
# - Name of the column that should be removed.
|
2301
2879
|
# - List of column names.
|
2880
|
+
# @param strict [Boolean]
|
2881
|
+
# Validate that all column names exist in the current schema,
|
2882
|
+
# and throw an exception if any do not.
|
2302
2883
|
#
|
2303
2884
|
# @return [LazyFrame]
|
2304
2885
|
#
|
@@ -2350,9 +2931,18 @@ module Polars
|
|
2350
2931
|
# # │ 7.0 │
|
2351
2932
|
# # │ 8.0 │
|
2352
2933
|
# # └─────┘
|
2353
|
-
def drop(*columns)
|
2354
|
-
|
2355
|
-
|
2934
|
+
def drop(*columns, strict: true)
|
2935
|
+
selectors = []
|
2936
|
+
columns.each do |c|
|
2937
|
+
if c.is_a?(Enumerable)
|
2938
|
+
selectors += c
|
2939
|
+
else
|
2940
|
+
selectors += [c]
|
2941
|
+
end
|
2942
|
+
end
|
2943
|
+
|
2944
|
+
drop_cols = Utils.parse_list_into_selector(selectors, strict: strict)
|
2945
|
+
_from_rbldf(_ldf.drop(drop_cols._rbselector))
|
2356
2946
|
end
|
2357
2947
|
|
2358
2948
|
# Rename column names.
|
@@ -2809,7 +3399,7 @@ module Polars
|
|
2809
3399
|
#
|
2810
3400
|
# @example
|
2811
3401
|
# s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]}).lazy
|
2812
|
-
# s.
|
3402
|
+
# s.gather_every(2).collect
|
2813
3403
|
# # =>
|
2814
3404
|
# # shape: (2, 2)
|
2815
3405
|
# # ┌─────┬─────┐
|
@@ -2820,9 +3410,10 @@ module Polars
|
|
2820
3410
|
# # │ 1 ┆ 5 │
|
2821
3411
|
# # │ 3 ┆ 7 │
|
2822
3412
|
# # └─────┴─────┘
|
2823
|
-
def
|
2824
|
-
select(F.col("*").
|
3413
|
+
def gather_every(n)
|
3414
|
+
select(F.col("*").gather_every(n))
|
2825
3415
|
end
|
3416
|
+
alias_method :take_every, :gather_every
|
2826
3417
|
|
2827
3418
|
# Fill null values using the specified value or strategy.
|
2828
3419
|
#
|
@@ -3099,6 +3690,32 @@ module Polars
|
|
3099
3690
|
_from_rbldf(_ldf.median)
|
3100
3691
|
end
|
3101
3692
|
|
3693
|
+
# Aggregate the columns in the LazyFrame as the sum of their null value count.
|
3694
|
+
#
|
3695
|
+
# @return [LazyFrame]
|
3696
|
+
#
|
3697
|
+
# @example
|
3698
|
+
# lf = Polars::LazyFrame.new(
|
3699
|
+
# {
|
3700
|
+
# "foo" => [1, nil, 3],
|
3701
|
+
# "bar" => [6, 7, nil],
|
3702
|
+
# "ham" => ["a", "b", "c"]
|
3703
|
+
# }
|
3704
|
+
# )
|
3705
|
+
# lf.null_count.collect
|
3706
|
+
# # =>
|
3707
|
+
# # shape: (1, 3)
|
3708
|
+
# # ┌─────┬─────┬─────┐
|
3709
|
+
# # │ foo ┆ bar ┆ ham │
|
3710
|
+
# # │ --- ┆ --- ┆ --- │
|
3711
|
+
# # │ u32 ┆ u32 ┆ u32 │
|
3712
|
+
# # ╞═════╪═════╪═════╡
|
3713
|
+
# # │ 1 ┆ 1 ┆ 0 │
|
3714
|
+
# # └─────┴─────┴─────┘
|
3715
|
+
def null_count
|
3716
|
+
_from_rbldf(_ldf.null_count)
|
3717
|
+
end
|
3718
|
+
|
3102
3719
|
# Aggregate the columns in the DataFrame to their quantile value.
|
3103
3720
|
#
|
3104
3721
|
# @param quantile [Float]
|
@@ -3153,9 +3770,11 @@ module Polars
|
|
3153
3770
|
# # │ c ┆ 7 │
|
3154
3771
|
# # │ c ┆ 8 │
|
3155
3772
|
# # └─────────┴─────────┘
|
3156
|
-
def explode(columns)
|
3157
|
-
|
3158
|
-
|
3773
|
+
def explode(columns, *more_columns)
|
3774
|
+
subset = Utils.parse_list_into_selector(columns) | Utils.parse_list_into_selector(
|
3775
|
+
more_columns
|
3776
|
+
)
|
3777
|
+
_from_rbldf(_ldf.explode(subset._rbselector))
|
3159
3778
|
end
|
3160
3779
|
|
3161
3780
|
# Drop duplicate rows from this DataFrame.
|
@@ -3220,43 +3839,110 @@ module Polars
|
|
3220
3839
|
# # │ 1 ┆ a ┆ b │
|
3221
3840
|
# # └─────┴─────┴─────┘
|
3222
3841
|
def unique(maintain_order: true, subset: nil, keep: "first")
|
3223
|
-
|
3224
|
-
|
3842
|
+
selector_subset = nil
|
3843
|
+
if !subset.nil?
|
3844
|
+
selector_subset = Utils.parse_list_into_selector(subset)._rbselector
|
3225
3845
|
end
|
3226
|
-
_from_rbldf(_ldf.unique(maintain_order,
|
3846
|
+
_from_rbldf(_ldf.unique(maintain_order, selector_subset, keep))
|
3227
3847
|
end
|
3228
3848
|
|
3229
|
-
# Drop rows
|
3849
|
+
# Drop all rows that contain one or more NaN values.
|
3850
|
+
#
|
3851
|
+
# The original order of the remaining rows is preserved.
|
3230
3852
|
#
|
3231
3853
|
# @param subset [Object]
|
3232
|
-
#
|
3854
|
+
# Column name(s) for which NaN values are considered; if set to `nil`
|
3855
|
+
# (default), use all columns (note that only floating-point columns
|
3856
|
+
# can contain NaNs).
|
3233
3857
|
#
|
3234
3858
|
# @return [LazyFrame]
|
3235
3859
|
#
|
3236
3860
|
# @example
|
3237
|
-
#
|
3861
|
+
# lf = Polars::LazyFrame.new(
|
3862
|
+
# {
|
3863
|
+
# "foo" => [-20.5, Float::NAN, 80.0],
|
3864
|
+
# "bar" => [Float::NAN, 110.0, 25.5],
|
3865
|
+
# "ham" => ["xxx", "yyy", nil]
|
3866
|
+
# }
|
3867
|
+
# )
|
3868
|
+
# lf.drop_nans.collect
|
3869
|
+
# # =>
|
3870
|
+
# # shape: (1, 3)
|
3871
|
+
# # ┌──────┬──────┬──────┐
|
3872
|
+
# # │ foo ┆ bar ┆ ham │
|
3873
|
+
# # │ --- ┆ --- ┆ --- │
|
3874
|
+
# # │ f64 ┆ f64 ┆ str │
|
3875
|
+
# # ╞══════╪══════╪══════╡
|
3876
|
+
# # │ 80.0 ┆ 25.5 ┆ null │
|
3877
|
+
# # └──────┴──────┴──────┘
|
3878
|
+
#
|
3879
|
+
# @example
|
3880
|
+
# lf.drop_nans(subset: ["bar"]).collect
|
3881
|
+
# # =>
|
3882
|
+
# # shape: (2, 3)
|
3883
|
+
# # ┌──────┬───────┬──────┐
|
3884
|
+
# # │ foo ┆ bar ┆ ham │
|
3885
|
+
# # │ --- ┆ --- ┆ --- │
|
3886
|
+
# # │ f64 ┆ f64 ┆ str │
|
3887
|
+
# # ╞══════╪═══════╪══════╡
|
3888
|
+
# # │ NaN ┆ 110.0 ┆ yyy │
|
3889
|
+
# # │ 80.0 ┆ 25.5 ┆ null │
|
3890
|
+
# # └──────┴───────┴──────┘
|
3891
|
+
def drop_nans(subset: nil)
|
3892
|
+
selector_subset = nil
|
3893
|
+
if !subset.nil?
|
3894
|
+
selector_subset = Utils.parse_list_into_selector(subset)._rbselector
|
3895
|
+
end
|
3896
|
+
_from_rbldf(_ldf.drop_nans(selector_subset))
|
3897
|
+
end
|
3898
|
+
|
3899
|
+
# Drop all rows that contain one or more null values.
|
3900
|
+
#
|
3901
|
+
# The original order of the remaining rows is preserved.
|
3902
|
+
#
|
3903
|
+
# @param subset [Object]
|
3904
|
+
# Column name(s) for which null values are considered.
|
3905
|
+
# If set to `nil` (default), use all columns.
|
3906
|
+
#
|
3907
|
+
# @return [LazyFrame]
|
3908
|
+
#
|
3909
|
+
# @example
|
3910
|
+
# lf = Polars::LazyFrame.new(
|
3238
3911
|
# {
|
3239
3912
|
# "foo" => [1, 2, 3],
|
3240
3913
|
# "bar" => [6, nil, 8],
|
3241
|
-
# "ham" => ["a", "b",
|
3914
|
+
# "ham" => ["a", "b", nil]
|
3242
3915
|
# }
|
3243
3916
|
# )
|
3244
|
-
#
|
3917
|
+
# lf.drop_nulls.collect
|
3245
3918
|
# # =>
|
3246
|
-
# # shape: (
|
3919
|
+
# # shape: (1, 3)
|
3247
3920
|
# # ┌─────┬─────┬─────┐
|
3248
3921
|
# # │ foo ┆ bar ┆ ham │
|
3249
3922
|
# # │ --- ┆ --- ┆ --- │
|
3250
3923
|
# # │ i64 ┆ i64 ┆ str │
|
3251
3924
|
# # ╞═════╪═════╪═════╡
|
3252
3925
|
# # │ 1 ┆ 6 ┆ a │
|
3253
|
-
# # │ 3 ┆ 8 ┆ c │
|
3254
3926
|
# # └─────┴─────┴─────┘
|
3927
|
+
#
|
3928
|
+
# @example
|
3929
|
+
# lf.drop_nulls(subset: Polars.cs.integer).collect
|
3930
|
+
# # =>
|
3931
|
+
# # shape: (2, 3)
|
3932
|
+
# # ┌─────┬─────┬──────┐
|
3933
|
+
# # │ foo ┆ bar ┆ ham │
|
3934
|
+
# # │ --- ┆ --- ┆ --- │
|
3935
|
+
# # │ i64 ┆ i64 ┆ str │
|
3936
|
+
# # ╞═════╪═════╪══════╡
|
3937
|
+
# # │ 1 ┆ 6 ┆ a │
|
3938
|
+
# # │ 3 ┆ 8 ┆ null │
|
3939
|
+
# # └─────┴─────┴──────┘
|
3255
3940
|
def drop_nulls(subset: nil)
|
3256
|
-
|
3257
|
-
|
3941
|
+
selector_subset = nil
|
3942
|
+
if !subset.nil?
|
3943
|
+
selector_subset = Utils.parse_list_into_selector(subset)._rbselector
|
3258
3944
|
end
|
3259
|
-
_from_rbldf(_ldf.drop_nulls(
|
3945
|
+
_from_rbldf(_ldf.drop_nulls(selector_subset))
|
3260
3946
|
end
|
3261
3947
|
|
3262
3948
|
# Unpivot a DataFrame from wide to long format.
|
@@ -3318,11 +4004,16 @@ module Polars
|
|
3318
4004
|
warn "The `streamable` parameter for `LazyFrame.unpivot` is deprecated"
|
3319
4005
|
end
|
3320
4006
|
|
3321
|
-
|
3322
|
-
|
4007
|
+
selector_on = on.nil? ? Selectors.empty : Utils.parse_list_into_selector(on)
|
4008
|
+
selector_index = index.nil? ? Selectors.empty : Utils.parse_list_into_selector(index)
|
3323
4009
|
|
3324
4010
|
_from_rbldf(
|
3325
|
-
_ldf.unpivot(
|
4011
|
+
_ldf.unpivot(
|
4012
|
+
selector_on._rbselector,
|
4013
|
+
selector_index._rbselector,
|
4014
|
+
value_name,
|
4015
|
+
variable_name
|
4016
|
+
)
|
3326
4017
|
)
|
3327
4018
|
end
|
3328
4019
|
alias_method :melt, :unpivot
|
@@ -3364,8 +4055,10 @@ module Polars
|
|
3364
4055
|
# The fields will be inserted into the `DataFrame` on the location of the
|
3365
4056
|
# `struct` type.
|
3366
4057
|
#
|
3367
|
-
# @param
|
4058
|
+
# @param columns [Object]
|
3368
4059
|
# Names of the struct columns that will be decomposed by their fields
|
4060
|
+
# @param more_columns [Array]
|
4061
|
+
# Additional columns to unnest, specified as positional arguments.
|
3369
4062
|
#
|
3370
4063
|
# @return [LazyFrame]
|
3371
4064
|
#
|
@@ -3410,11 +4103,11 @@ module Polars
|
|
3410
4103
|
# # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
|
3411
4104
|
# # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
|
3412
4105
|
# # └────────┴─────┴─────┴──────┴───────────┴───────┘
|
3413
|
-
def unnest(
|
3414
|
-
|
3415
|
-
|
3416
|
-
|
3417
|
-
_from_rbldf(_ldf.unnest(
|
4106
|
+
def unnest(columns, *more_columns)
|
4107
|
+
subset = Utils.parse_list_into_selector(columns) | Utils.parse_list_into_selector(
|
4108
|
+
more_columns
|
4109
|
+
)
|
4110
|
+
_from_rbldf(_ldf.unnest(subset._rbselector))
|
3418
4111
|
end
|
3419
4112
|
|
3420
4113
|
# Take two sorted DataFrames and merge them by the sorted key.
|
@@ -3483,9 +4176,261 @@ module Polars
|
|
3483
4176
|
with_columns(F.col(column).set_sorted(descending: descending))
|
3484
4177
|
end
|
3485
4178
|
|
3486
|
-
#
|
3487
|
-
#
|
3488
|
-
#
|
4179
|
+
# Update the values in this `LazyFrame` with the values in `other`.
|
4180
|
+
#
|
4181
|
+
# @note
|
4182
|
+
# This functionality is considered **unstable**. It may be changed
|
4183
|
+
# at any point without it being considered a breaking change.
|
4184
|
+
#
|
4185
|
+
# @param other [LazyFrame]
|
4186
|
+
# LazyFrame that will be used to update the values
|
4187
|
+
# @param on [Object]
|
4188
|
+
# Column names that will be joined on. If set to `nil` (default),
|
4189
|
+
# the implicit row index of each frame is used as a join key.
|
4190
|
+
# @param how ['left', 'inner', 'full']
|
4191
|
+
# * 'left' will keep all rows from the left table; rows may be duplicated
|
4192
|
+
# if multiple rows in the right frame match the left row's key.
|
4193
|
+
# * 'inner' keeps only those rows where the key exists in both frames.
|
4194
|
+
# * 'full' will update existing rows where the key matches while also
|
4195
|
+
# adding any new rows contained in the given frame.
|
4196
|
+
# @param left_on [Object]
|
4197
|
+
# Join column(s) of the left DataFrame.
|
4198
|
+
# @param right_on [Object]
|
4199
|
+
# Join column(s) of the right DataFrame.
|
4200
|
+
# @param include_nulls [Boolean]
|
4201
|
+
# Overwrite values in the left frame with null values from the right frame.
|
4202
|
+
# If set to `false` (default), null values in the right frame are ignored.
|
4203
|
+
# @param maintain_order ['none', 'left', 'right', 'left_right', 'right_left']
|
4204
|
+
# Which order of rows from the inputs to preserve. See `LazyFrame.join`
|
4205
|
+
# for details. Unlike `join` this function preserves the left order by
|
4206
|
+
# default.
|
4207
|
+
#
|
4208
|
+
# @return [LazyFrame]
|
4209
|
+
#
|
4210
|
+
# @note
|
4211
|
+
# This is syntactic sugar for a left/inner join that preserves the order
|
4212
|
+
# of the left `DataFrame` by default, with an optional coalesce when
|
4213
|
+
# `include_nulls: false`.
|
4214
|
+
#
|
4215
|
+
# @example Update `lf` values with the non-null values in `new_lf`, by row index:
|
4216
|
+
# lf = Polars::LazyFrame.new(
|
4217
|
+
# {
|
4218
|
+
# "A" => [1, 2, 3, 4],
|
4219
|
+
# "B" => [400, 500, 600, 700]
|
4220
|
+
# }
|
4221
|
+
# )
|
4222
|
+
# new_lf = Polars::LazyFrame.new(
|
4223
|
+
# {
|
4224
|
+
# "B" => [-66, nil, -99],
|
4225
|
+
# "C" => [5, 3, 1]
|
4226
|
+
# }
|
4227
|
+
# )
|
4228
|
+
# lf.update(new_lf).collect
|
4229
|
+
# # =>
|
4230
|
+
# # shape: (4, 2)
|
4231
|
+
# # ┌─────┬─────┐
|
4232
|
+
# # │ A ┆ B │
|
4233
|
+
# # │ --- ┆ --- │
|
4234
|
+
# # │ i64 ┆ i64 │
|
4235
|
+
# # ╞═════╪═════╡
|
4236
|
+
# # │ 1 ┆ -66 │
|
4237
|
+
# # │ 2 ┆ 500 │
|
4238
|
+
# # │ 3 ┆ -99 │
|
4239
|
+
# # │ 4 ┆ 700 │
|
4240
|
+
# # └─────┴─────┘
|
4241
|
+
#
|
4242
|
+
# @example Update `lf` values with the non-null values in `new_lf`, by row index, but only keeping those rows that are common to both frames:
|
4243
|
+
# lf.update(new_lf, how: "inner").collect
|
4244
|
+
# # =>
|
4245
|
+
# # shape: (3, 2)
|
4246
|
+
# # ┌─────┬─────┐
|
4247
|
+
# # │ A ┆ B │
|
4248
|
+
# # │ --- ┆ --- │
|
4249
|
+
# # │ i64 ┆ i64 │
|
4250
|
+
# # ╞═════╪═════╡
|
4251
|
+
# # │ 1 ┆ -66 │
|
4252
|
+
# # │ 2 ┆ 500 │
|
4253
|
+
# # │ 3 ┆ -99 │
|
4254
|
+
# # └─────┴─────┘
|
4255
|
+
#
|
4256
|
+
# @example Update `lf` values with the non-null values in `new_lf`, using a full outer join strategy that defines explicit join columns in each frame:
|
4257
|
+
# lf.update(new_lf, left_on: ["A"], right_on: ["C"], how: "full").collect
|
4258
|
+
# # =>
|
4259
|
+
# # shape: (5, 2)
|
4260
|
+
# # ┌─────┬─────┐
|
4261
|
+
# # │ A ┆ B │
|
4262
|
+
# # │ --- ┆ --- │
|
4263
|
+
# # │ i64 ┆ i64 │
|
4264
|
+
# # ╞═════╪═════╡
|
4265
|
+
# # │ 1 ┆ -99 │
|
4266
|
+
# # │ 2 ┆ 500 │
|
4267
|
+
# # │ 3 ┆ 600 │
|
4268
|
+
# # │ 4 ┆ 700 │
|
4269
|
+
# # │ 5 ┆ -66 │
|
4270
|
+
# # └─────┴─────┘
|
4271
|
+
#
|
4272
|
+
# @example Update `lf` values including null values in `new_lf`, using a full outer join strategy that defines explicit join columns in each frame:
|
4273
|
+
# lf.update(
|
4274
|
+
# new_lf, left_on: "A", right_on: "C", how: "full", include_nulls: true
|
4275
|
+
# ).collect
|
4276
|
+
# # =>
|
4277
|
+
# # shape: (5, 2)
|
4278
|
+
# # ┌─────┬──────┐
|
4279
|
+
# # │ A ┆ B │
|
4280
|
+
# # │ --- ┆ --- │
|
4281
|
+
# # │ i64 ┆ i64 │
|
4282
|
+
# # ╞═════╪══════╡
|
4283
|
+
# # │ 1 ┆ -99 │
|
4284
|
+
# # │ 2 ┆ 500 │
|
4285
|
+
# # │ 3 ┆ null │
|
4286
|
+
# # │ 4 ┆ 700 │
|
4287
|
+
# # │ 5 ┆ -66 │
|
4288
|
+
# # └─────┴──────┘
|
4289
|
+
def update(
|
4290
|
+
other,
|
4291
|
+
on: nil,
|
4292
|
+
how: "left",
|
4293
|
+
left_on: nil,
|
4294
|
+
right_on: nil,
|
4295
|
+
include_nulls: false,
|
4296
|
+
maintain_order: "left"
|
4297
|
+
)
|
4298
|
+
Utils.require_same_type(self, other)
|
4299
|
+
if ["outer", "outer_coalesce"].include?(how)
|
4300
|
+
how = "full"
|
4301
|
+
end
|
4302
|
+
|
4303
|
+
if !["left", "inner", "full"].include?(how)
|
4304
|
+
msg = "`how` must be one of {{'left', 'inner', 'full'}}; found #{how.inspect}"
|
4305
|
+
raise ArgumentError, msg
|
4306
|
+
end
|
4307
|
+
|
4308
|
+
slf = self
|
4309
|
+
row_index_used = false
|
4310
|
+
if on.nil?
|
4311
|
+
if left_on.nil? && right_on.nil?
|
4312
|
+
# no keys provided--use row index
|
4313
|
+
row_index_used = true
|
4314
|
+
row_index_name = "__POLARS_ROW_INDEX"
|
4315
|
+
slf = slf.with_row_index(name: row_index_name)
|
4316
|
+
other = other.with_row_index(name: row_index_name)
|
4317
|
+
left_on = right_on = [row_index_name]
|
4318
|
+
else
|
4319
|
+
# one of left or right is missing, raise error
|
4320
|
+
if left_on.nil?
|
4321
|
+
msg = "missing join columns for left frame"
|
4322
|
+
raise ArgumentError, msg
|
4323
|
+
end
|
4324
|
+
if right_on.nil?
|
4325
|
+
msg = "missing join columns for right frame"
|
4326
|
+
raise ArgumentError, msg
|
4327
|
+
end
|
4328
|
+
end
|
4329
|
+
else
|
4330
|
+
# move on into left/right_on to simplify logic
|
4331
|
+
left_on = right_on = on
|
4332
|
+
end
|
4333
|
+
|
4334
|
+
if left_on.is_a?(::String)
|
4335
|
+
left_on = [left_on]
|
4336
|
+
end
|
4337
|
+
if right_on.is_a?(::String)
|
4338
|
+
right_on = [right_on]
|
4339
|
+
end
|
4340
|
+
|
4341
|
+
left_schema = slf.collect_schema
|
4342
|
+
left_on.each do |name|
|
4343
|
+
if !left_schema.include?(name)
|
4344
|
+
msg = "left join column #{name.inspect} not found"
|
4345
|
+
raise ArgumentError, msg
|
4346
|
+
end
|
4347
|
+
end
|
4348
|
+
right_schema = other.collect_schema
|
4349
|
+
right_on.each do |name|
|
4350
|
+
if !right_schema.include?(name)
|
4351
|
+
msg = "right join column #{name.inspect} not found"
|
4352
|
+
raise ArgumentError, msg
|
4353
|
+
end
|
4354
|
+
end
|
4355
|
+
|
4356
|
+
# no need to join if *only* join columns are in other (inner/left update only)
|
4357
|
+
if how != "full" && right_schema.length == right_on.length
|
4358
|
+
if row_index_used
|
4359
|
+
return slf.drop(row_index_name)
|
4360
|
+
end
|
4361
|
+
return slf
|
4362
|
+
end
|
4363
|
+
|
4364
|
+
# only use non-idx right columns present in left frame
|
4365
|
+
right_other = Set.new(right_schema.to_h.keys).intersection(left_schema.to_h.keys) - Set.new(right_on)
|
4366
|
+
|
4367
|
+
# When include_nulls is true, we need to distinguish records after the join that
|
4368
|
+
# were originally null in the right frame, as opposed to records that were null
|
4369
|
+
# because the key was missing from the right frame.
|
4370
|
+
# Add a validity column to track whether row was matched or not.
|
4371
|
+
if include_nulls
|
4372
|
+
validity = ["__POLARS_VALIDITY"]
|
4373
|
+
other = other.with_columns(F.lit(true).alias(validity[0]))
|
4374
|
+
else
|
4375
|
+
validity = []
|
4376
|
+
end
|
4377
|
+
|
4378
|
+
tmp_name = "__POLARS_RIGHT"
|
4379
|
+
drop_columns = right_other.map { |name| "#{name}#{tmp_name}" } + validity
|
4380
|
+
result = (
|
4381
|
+
slf.join(
|
4382
|
+
other.select(*right_on, *right_other, *validity),
|
4383
|
+
left_on: left_on,
|
4384
|
+
right_on: right_on,
|
4385
|
+
how: how,
|
4386
|
+
suffix: tmp_name,
|
4387
|
+
coalesce: true,
|
4388
|
+
maintain_order: maintain_order
|
4389
|
+
)
|
4390
|
+
.with_columns(
|
4391
|
+
right_other.map do |name|
|
4392
|
+
(
|
4393
|
+
if include_nulls
|
4394
|
+
# use left value only when right value failed to join
|
4395
|
+
F.when(F.col(validity).is_null)
|
4396
|
+
.then(F.col(name))
|
4397
|
+
.otherwise(F.col("#{name}#{tmp_name}"))
|
4398
|
+
else
|
4399
|
+
F.coalesce(["#{name}#{tmp_name}", F.col(name)])
|
4400
|
+
end
|
4401
|
+
).alias(name)
|
4402
|
+
end
|
4403
|
+
)
|
4404
|
+
.drop(drop_columns)
|
4405
|
+
)
|
4406
|
+
if row_index_used
|
4407
|
+
result = result.drop(row_index_name)
|
4408
|
+
end
|
4409
|
+
|
4410
|
+
_from_rbldf(result._ldf)
|
4411
|
+
end
|
4412
|
+
|
4413
|
+
# Return the number of non-null elements for each column.
|
4414
|
+
#
|
4415
|
+
# @return [LazyFrame]
|
4416
|
+
#
|
4417
|
+
# @example
|
4418
|
+
# lf = Polars::LazyFrame.new(
|
4419
|
+
# {"a" => [1, 2, 3, 4], "b" => [1, 2, 1, nil], "c" => [nil, nil, nil, nil]}
|
4420
|
+
# )
|
4421
|
+
# lf.count.collect
|
4422
|
+
# # =>
|
4423
|
+
# # shape: (1, 3)
|
4424
|
+
# # ┌─────┬─────┬─────┐
|
4425
|
+
# # │ a ┆ b ┆ c │
|
4426
|
+
# # │ --- ┆ --- ┆ --- │
|
4427
|
+
# # │ u32 ┆ u32 ┆ u32 │
|
4428
|
+
# # ╞═════╪═════╪═════╡
|
4429
|
+
# # │ 4 ┆ 3 ┆ 0 │
|
4430
|
+
# # └─────┴─────┴─────┘
|
4431
|
+
def count
|
4432
|
+
_from_rbldf(_ldf.count)
|
4433
|
+
end
|
3489
4434
|
|
3490
4435
|
private
|
3491
4436
|
|
@@ -3497,5 +4442,64 @@ module Polars
|
|
3497
4442
|
def _from_rbldf(rb_ldf)
|
3498
4443
|
self.class._from_rbldf(rb_ldf)
|
3499
4444
|
end
|
4445
|
+
|
4446
|
+
def _filter(
|
4447
|
+
predicates:,
|
4448
|
+
constraints:,
|
4449
|
+
invert: false
|
4450
|
+
)
|
4451
|
+
all_predicates = []
|
4452
|
+
boolean_masks = []
|
4453
|
+
|
4454
|
+
predicates.each do |p|
|
4455
|
+
# quick exit/skip conditions
|
4456
|
+
if (p.is_a?(FalseClass) && invert) || (p.is_a?(TrueClass) && !invert)
|
4457
|
+
next # ignore; doesn't filter/remove anything
|
4458
|
+
end
|
4459
|
+
if (p.is_a?(TrueClass) && invert) || (p.is_a?(FalseClass) && !invert)
|
4460
|
+
return clear # discard all rows
|
4461
|
+
end
|
4462
|
+
|
4463
|
+
# note: identify masks separately from predicates
|
4464
|
+
if Utils.is_bool_sequence(p, include_series: true)
|
4465
|
+
boolean_masks << Polars::Series.new(p, dtype: Boolean)
|
4466
|
+
elsif (
|
4467
|
+
(is_seq = Utils.is_sequence(p)) && p.any? { |x| !x.is_a?(Expr) }) ||
|
4468
|
+
(!is_seq && !p.is_a?(Expr) && !(p.is_a?(::String) && collect_schema.include?(p))
|
4469
|
+
)
|
4470
|
+
err = p.is_a?(Series) ? "Series(…, dtype: #{p.dtype})" : p.inspect
|
4471
|
+
msg = "invalid predicate for `filter`: #{err}"
|
4472
|
+
raise TypeError, msg
|
4473
|
+
else
|
4474
|
+
all_predicates.concat(
|
4475
|
+
Utils.parse_into_list_of_expressions(p).map { |x| Utils.wrap_expr(x) }
|
4476
|
+
)
|
4477
|
+
end
|
4478
|
+
end
|
4479
|
+
|
4480
|
+
# unpack equality constraints from kwargs
|
4481
|
+
all_predicates.concat(
|
4482
|
+
constraints.map { |name, value| F.col(name).eq(value) }
|
4483
|
+
)
|
4484
|
+
if !(all_predicates.any? || boolean_masks.any?)
|
4485
|
+
msg = "at least one predicate or constraint must be provided"
|
4486
|
+
raise TypeError, msg
|
4487
|
+
end
|
4488
|
+
|
4489
|
+
# if multiple predicates, combine as 'horizontal' expression
|
4490
|
+
combined_predicate = all_predicates ? (all_predicates.length > 1 ? F.all_horizontal(*all_predicates) : all_predicates[0]) : nil
|
4491
|
+
|
4492
|
+
# apply reduced boolean mask first, if applicable, then predicates
|
4493
|
+
if boolean_masks.any?
|
4494
|
+
raise Todo
|
4495
|
+
end
|
4496
|
+
|
4497
|
+
if combined_predicate.nil?
|
4498
|
+
return _from_rbldf(_ldf)
|
4499
|
+
end
|
4500
|
+
|
4501
|
+
filter_method = invert ? _ldf.method(:remove) : _ldf.method(:filter)
|
4502
|
+
_from_rbldf(filter_method.(combined_predicate._rbexpr))
|
4503
|
+
end
|
3500
4504
|
end
|
3501
4505
|
end
|