polars-df 0.20.0-x64-mingw-ucrt → 0.21.1-x64-mingw-ucrt

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +27 -0
  3. data/Cargo.lock +192 -186
  4. data/LICENSE-THIRD-PARTY.txt +2153 -2532
  5. data/LICENSE.txt +1 -1
  6. data/lib/polars/3.2/polars.so +0 -0
  7. data/lib/polars/3.3/polars.so +0 -0
  8. data/lib/polars/3.4/polars.so +0 -0
  9. data/lib/polars/array_expr.rb +382 -3
  10. data/lib/polars/array_name_space.rb +281 -0
  11. data/lib/polars/binary_expr.rb +67 -0
  12. data/lib/polars/binary_name_space.rb +43 -0
  13. data/lib/polars/cat_expr.rb +224 -0
  14. data/lib/polars/cat_name_space.rb +130 -32
  15. data/lib/polars/catalog/unity/catalog_info.rb +20 -0
  16. data/lib/polars/catalog/unity/column_info.rb +31 -0
  17. data/lib/polars/catalog/unity/namespace_info.rb +21 -0
  18. data/lib/polars/catalog/unity/table_info.rb +50 -0
  19. data/lib/polars/catalog.rb +448 -0
  20. data/lib/polars/config.rb +2 -2
  21. data/lib/polars/convert.rb +12 -2
  22. data/lib/polars/data_frame.rb +834 -48
  23. data/lib/polars/data_type_expr.rb +52 -0
  24. data/lib/polars/data_types.rb +61 -5
  25. data/lib/polars/date_time_expr.rb +251 -0
  26. data/lib/polars/date_time_name_space.rb +299 -0
  27. data/lib/polars/exceptions.rb +7 -2
  28. data/lib/polars/expr.rb +1247 -211
  29. data/lib/polars/functions/col.rb +6 -5
  30. data/lib/polars/functions/datatype.rb +21 -0
  31. data/lib/polars/functions/lazy.rb +127 -15
  32. data/lib/polars/functions/repeat.rb +4 -0
  33. data/lib/polars/io/csv.rb +19 -1
  34. data/lib/polars/io/json.rb +16 -0
  35. data/lib/polars/io/ndjson.rb +13 -0
  36. data/lib/polars/io/parquet.rb +70 -66
  37. data/lib/polars/io/scan_options.rb +47 -0
  38. data/lib/polars/lazy_frame.rb +1099 -95
  39. data/lib/polars/list_expr.rb +400 -11
  40. data/lib/polars/list_name_space.rb +321 -5
  41. data/lib/polars/meta_expr.rb +71 -22
  42. data/lib/polars/name_expr.rb +36 -0
  43. data/lib/polars/scan_cast_options.rb +64 -0
  44. data/lib/polars/schema.rb +84 -3
  45. data/lib/polars/selector.rb +210 -0
  46. data/lib/polars/selectors.rb +932 -203
  47. data/lib/polars/series.rb +1083 -63
  48. data/lib/polars/string_expr.rb +435 -9
  49. data/lib/polars/string_name_space.rb +729 -45
  50. data/lib/polars/struct_expr.rb +103 -0
  51. data/lib/polars/struct_name_space.rb +19 -1
  52. data/lib/polars/utils/parse.rb +40 -0
  53. data/lib/polars/utils/various.rb +18 -1
  54. data/lib/polars/utils.rb +9 -1
  55. data/lib/polars/version.rb +1 -1
  56. data/lib/polars.rb +10 -0
  57. metadata +12 -2
@@ -234,10 +234,18 @@ module Polars
234
234
  #
235
235
  # @param by [Object]
236
236
  # Column (expressions) to sort by.
237
+ # @param more_by [Array]
238
+ # Additional columns to sort by, specified as positional arguments.
237
239
  # @param reverse [Boolean]
238
240
  # Sort in descending order.
239
241
  # @param nulls_last [Boolean]
240
242
  # Place null values last. Can only be used if sorted by a single column.
243
+ # @param maintain_order [Boolean]
244
+ # Whether the order should be maintained if elements are equal.
245
+ # Note that if `true` streaming is not possible and performance might be
246
+ # worse since this requires a stable sort.
247
+ # @param multithreaded [Boolean]
248
+ # Sort using multiple threads.
241
249
  #
242
250
  # @return [LazyFrame]
243
251
  #
@@ -280,6 +288,201 @@ module Polars
280
288
  )
281
289
  end
282
290
 
291
+ # Execute a SQL query against the LazyFrame.
292
+ #
293
+ # @note
294
+ # This functionality is considered **unstable**, although it is close to
295
+ # being considered stable. It may be changed at any point without it being
296
+ # considered a breaking change.
297
+ #
298
+ # @param query [String]
299
+ # SQL query to execute.
300
+ # @param table_name [String]
301
+ # Optionally provide an explicit name for the table that represents the
302
+ # calling frame (defaults to "self").
303
+ #
304
+ # @return [LazyFrame]
305
+ #
306
+ # @note
307
+ # * The calling frame is automatically registered as a table in the SQL context
308
+ # under the name "self". If you want access to the DataFrames and LazyFrames
309
+ # found in the current globals, use the top-level `Polars.sql`.
310
+ # * More control over registration and execution behaviour is available by
311
+ # using the `SQLContext` object.
312
+ #
313
+ # @example Query the LazyFrame using SQL:
314
+ # lf1 = Polars::LazyFrame.new({"a" => [1, 2, 3], "b" => [6, 7, 8], "c" => ["z", "y", "x"]})
315
+ # lf2 = Polars::LazyFrame.new({"a" => [3, 2, 1], "d" => [125, -654, 888]})
316
+ # lf1.sql("SELECT c, b FROM self WHERE a > 1").collect
317
+ # # =>
318
+ # # shape: (2, 2)
319
+ # # ┌─────┬─────┐
320
+ # # │ c ┆ b │
321
+ # # │ --- ┆ --- │
322
+ # # │ str ┆ i64 │
323
+ # # ╞═════╪═════╡
324
+ # # │ y ┆ 7 │
325
+ # # │ x ┆ 8 │
326
+ # # └─────┴─────┘
327
+ #
328
+ # @example Apply SQL transforms (aliasing "self" to "frame") then filter natively (you can freely mix SQL and native operations):
329
+ # lf1.sql(
330
+ # "
331
+ # SELECT
332
+ # a,
333
+ # (a % 2 == 0) AS a_is_even,
334
+ # (b::float4 / 2) AS \"b/2\",
335
+ # CONCAT_WS(':', c, c, c) AS c_c_c
336
+ # FROM frame
337
+ # ORDER BY a
338
+ # ",
339
+ # table_name: "frame",
340
+ # ).filter(~Polars.col("c_c_c").str.starts_with("x")).collect
341
+ # # =>
342
+ # # shape: (2, 4)
343
+ # # ┌─────┬───────────┬─────┬───────┐
344
+ # # │ a ┆ a_is_even ┆ b/2 ┆ c_c_c │
345
+ # # │ --- ┆ --- ┆ --- ┆ --- │
346
+ # # │ i64 ┆ bool ┆ f32 ┆ str │
347
+ # # ╞═════╪═══════════╪═════╪═══════╡
348
+ # # │ 1 ┆ false ┆ 3.0 ┆ z:z:z │
349
+ # # │ 2 ┆ true ┆ 3.5 ┆ y:y:y │
350
+ # # └─────┴───────────┴─────┴───────┘
351
+ def sql(query, table_name: "self")
352
+ ctx = Polars::SQLContext.new
353
+ name = table_name || "self"
354
+ ctx.register(name, self)
355
+ ctx.execute(query)
356
+ end
357
+
358
+ # Return the `k` largest rows.
359
+ #
360
+ # Non-null elements are always preferred over null elements, regardless of
361
+ # the value of `reverse`. The output is not guaranteed to be in any
362
+ # particular order; call `sort` after this method if you wish the
363
+ # output to be sorted.
364
+ #
365
+ # @param k [Integer]
366
+ # Number of rows to return.
367
+ # @param by [Object]
368
+ # Column(s) used to determine the top rows.
369
+ # Accepts expression input. Strings are parsed as column names.
370
+ # @param reverse [Object]
371
+ # Consider the `k` smallest elements of the `by` column(s) (instead of the `k`
372
+ # largest). This can be specified per column by passing a sequence of
373
+ # booleans.
374
+ #
375
+ # @return [LazyFrame]
376
+ #
377
+ # @example Get the rows which contain the 4 largest values in column b.
378
+ # lf = Polars::LazyFrame.new(
379
+ # {
380
+ # "a" => ["a", "b", "a", "b", "b", "c"],
381
+ # "b" => [2, 1, 1, 3, 2, 1]
382
+ # }
383
+ # )
384
+ # lf.top_k(4, by: "b").collect
385
+ # # =>
386
+ # # shape: (4, 2)
387
+ # # ┌─────┬─────┐
388
+ # # │ a ┆ b │
389
+ # # │ --- ┆ --- │
390
+ # # │ str ┆ i64 │
391
+ # # ╞═════╪═════╡
392
+ # # │ b ┆ 3 │
393
+ # # │ a ┆ 2 │
394
+ # # │ b ┆ 2 │
395
+ # # │ b ┆ 1 │
396
+ # # └─────┴─────┘
397
+ #
398
+ # @example Get the rows which contain the 4 largest values when sorting on column b and a.
399
+ # lf.top_k(4, by: ["b", "a"]).collect
400
+ # # =>
401
+ # # shape: (4, 2)
402
+ # # ┌─────┬─────┐
403
+ # # │ a ┆ b │
404
+ # # │ --- ┆ --- │
405
+ # # │ str ┆ i64 │
406
+ # # ╞═════╪═════╡
407
+ # # │ b ┆ 3 │
408
+ # # │ b ┆ 2 │
409
+ # # │ a ┆ 2 │
410
+ # # │ c ┆ 1 │
411
+ # # └─────┴─────┘
412
+ def top_k(
413
+ k,
414
+ by:,
415
+ reverse: false
416
+ )
417
+ by = Utils.parse_into_list_of_expressions(by)
418
+ reverse = Utils.extend_bool(reverse, by.length, "reverse", "by")
419
+ _from_rbldf(_ldf.top_k(k, by, reverse))
420
+ end
421
+
422
+ # Return the `k` smallest rows.
423
+ #
424
+ # Non-null elements are always preferred over null elements, regardless of
425
+ # the value of `reverse`. The output is not guaranteed to be in any
426
+ # particular order; call `sort` after this method if you wish the
427
+ # output to be sorted.
428
+ #
429
+ # @param k [Integer]
430
+ # Number of rows to return.
431
+ # @param by [Object]
432
+ # Column(s) used to determine the bottom rows.
433
+ # Accepts expression input. Strings are parsed as column names.
434
+ # @param reverse [Object]
435
+ # Consider the `k` largest elements of the `by` column(s) (instead of the `k`
436
+ # smallest). This can be specified per column by passing a sequence of
437
+ # booleans.
438
+ #
439
+ # @return [LazyFrame]
440
+ #
441
+ # @example Get the rows which contain the 4 smallest values in column b.
442
+ # lf = Polars::LazyFrame.new(
443
+ # {
444
+ # "a" => ["a", "b", "a", "b", "b", "c"],
445
+ # "b" => [2, 1, 1, 3, 2, 1]
446
+ # }
447
+ # )
448
+ # lf.bottom_k(4, by: "b").collect
449
+ # # =>
450
+ # # shape: (4, 2)
451
+ # # ┌─────┬─────┐
452
+ # # │ a ┆ b │
453
+ # # │ --- ┆ --- │
454
+ # # │ str ┆ i64 │
455
+ # # ╞═════╪═════╡
456
+ # # │ b ┆ 1 │
457
+ # # │ a ┆ 1 │
458
+ # # │ c ┆ 1 │
459
+ # # │ a ┆ 2 │
460
+ # # └─────┴─────┘
461
+ #
462
+ # @example Get the rows which contain the 4 smallest values when sorting on column a and b.
463
+ # lf.bottom_k(4, by: ["a", "b"]).collect
464
+ # # =>
465
+ # # shape: (4, 2)
466
+ # # ┌─────┬─────┐
467
+ # # │ a ┆ b │
468
+ # # │ --- ┆ --- │
469
+ # # │ str ┆ i64 │
470
+ # # ╞═════╪═════╡
471
+ # # │ a ┆ 1 │
472
+ # # │ a ┆ 2 │
473
+ # # │ b ┆ 1 │
474
+ # # │ b ┆ 2 │
475
+ # # └─────┴─────┘
476
+ def bottom_k(
477
+ k,
478
+ by:,
479
+ reverse: false
480
+ )
481
+ by = Utils.parse_into_list_of_expressions(by)
482
+ reverse = Utils.extend_bool(reverse, by.length, "reverse", "by")
483
+ _from_rbldf(_ldf.bottom_k(k, by, reverse))
484
+ end
485
+
283
486
  # def profile
284
487
  # end
285
488
 
@@ -305,6 +508,8 @@ module Polars
305
508
  # Slice pushdown optimization.
306
509
  # @param common_subplan_elimination [Boolean]
307
510
  # Will try to cache branching subplans that occur on self-joins or unions.
511
+ # @param comm_subexpr_elim [Boolean]
512
+ # Common subexpressions will be cached and reused.
308
513
  # @param allow_streaming [Boolean]
309
514
  # Run parts of the query in a streaming fashion (this is in an alpha state)
310
515
  #
@@ -369,6 +574,41 @@ module Polars
369
574
  Utils.wrap_df(ldf.collect)
370
575
  end
371
576
 
577
+ # Resolve the schema of this LazyFrame.
578
+ #
579
+ # @return [Schema]
580
+ #
581
+ # @example Determine the schema.
582
+ # lf = Polars::LazyFrame.new(
583
+ # {
584
+ # "foo" => [1, 2, 3],
585
+ # "bar" => [6.0, 7.0, 8.0],
586
+ # "ham" => ["a", "b", "c"]
587
+ # }
588
+ # )
589
+ # lf.collect_schema
590
+ # # => Polars::Schema({"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String})
591
+ #
592
+ # @example Access various properties of the schema.
593
+ # schema = lf.collect_schema
594
+ # schema["bar"]
595
+ # # => Polars::Float64
596
+ #
597
+ # @example
598
+ # schema.names
599
+ # # => ["foo", "bar", "ham"]
600
+ #
601
+ # @example
602
+ # schema.dtypes
603
+ # # => [Polars::Int64, Polars::Float64, Polars::String]
604
+ #
605
+ # @example
606
+ # schema.length
607
+ # # => 3
608
+ def collect_schema
609
+ Schema.new(_ldf.collect_schema, check_dtypes: false)
610
+ end
611
+
372
612
  # Persists a LazyFrame at the provided path.
373
613
  #
374
614
  # This allows streaming results that are larger than RAM to be written to disk.
@@ -412,6 +652,31 @@ module Polars
412
652
  # Turn off (certain) optimizations.
413
653
  # @param slice_pushdown [Boolean]
414
654
  # Slice pushdown optimization.
655
+ # @param storage_options [Hash]
656
+ # Options that indicate how to connect to a cloud provider.
657
+ #
658
+ # The cloud providers currently supported are AWS, GCP, and Azure.
659
+ # See supported keys here:
660
+ #
661
+ # * [aws](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html)
662
+ # * [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html)
663
+ # * [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html)
664
+ # * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
665
+ #
666
+ # If `storage_options` is not provided, Polars will try to infer the
667
+ # information from environment variables.
668
+ # @param retries [Integer]
669
+ # Number of retries if accessing a cloud instance fails.
670
+ # @param sync_on_close ['data', 'all']
671
+ # Sync to disk before closing a file.
672
+ #
673
+ # * `nil` does not sync.
674
+ # * `data` syncs the file contents.
675
+ # * `all` syncs the file contents and metadata.
676
+ # @param mkdir [Boolean]
677
+ # Recursively create all the directories in the path.
678
+ # @param lazy [Boolean]
679
+ # Wait to start execution until `collect` is called.
415
680
  #
416
681
  # @return [DataFrame]
417
682
  #
@@ -521,6 +786,16 @@ module Polars
521
786
  # Slice pushdown optimization.
522
787
  # @param no_optimization [Boolean]
523
788
  # Turn off (certain) optimizations.
789
+ # @param sync_on_close ['data', 'all']
790
+ # Sync to disk before closing a file.
791
+ #
792
+ # * `nil` does not sync.
793
+ # * `data` syncs the file contents.
794
+ # * `all` syncs the file contents and metadata.
795
+ # @param mkdir [Boolean]
796
+ # Recursively create all the directories in the path.
797
+ # @param lazy [Boolean]
798
+ # Wait to start execution until `collect` is called.
524
799
  #
525
800
  # @return [DataFrame]
526
801
  #
@@ -614,9 +889,15 @@ module Polars
614
889
  # A format string, with the specifiers defined by the
615
890
  # `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
616
891
  # Rust crate.
892
+ # @param float_scientific [Boolean, nil]
893
+ # Whether to use scientific form always (true), never (false), or
894
+ # automatically (nil) for `Float32` and `Float64` datatypes.
617
895
  # @param float_precision [Integer]
618
896
  # Number of decimal places to write, applied to both `Float32` and
619
897
  # `Float64` datatypes.
898
+ # @param decimal_comma [Boolean]
899
+ # Use a comma as the decimal separator instead of a point. Floats will be
900
+ # encapsulated in quotes if necessary; set the field separator to override.
620
901
  # @param null_value [String]
621
902
  # A string representing null values (defaulting to the empty string).
622
903
  # @param quote_style ["necessary", "always", "non_numeric", "never"]
@@ -655,6 +936,16 @@ module Polars
655
936
  # Options that indicate how to connect to a cloud provider.
656
937
  # @param retries [Integer]
657
938
  # Number of retries if accessing a cloud instance fails.
939
+ # @param sync_on_close ['data', 'all']
940
+ # Sync to disk before closing a file.
941
+ #
942
+ # * `nil` does not sync.
943
+ # * `data` syncs the file contents.
944
+ # * `all` syncs the file contents and metadata.
945
+ # @param mkdir [Boolean]
946
+ # Recursively create all the directories in the path.
947
+ # @param lazy [Boolean]
948
+ # Wait to start execution until `collect` is called.
658
949
  #
659
950
  # @return [DataFrame]
660
951
  #
@@ -674,6 +965,7 @@ module Polars
674
965
  time_format: nil,
675
966
  float_scientific: nil,
676
967
  float_precision: nil,
968
+ decimal_comma: false,
677
969
  null_value: nil,
678
970
  quote_style: nil,
679
971
  maintain_order: true,
@@ -726,6 +1018,7 @@ module Polars
726
1018
  time_format,
727
1019
  float_scientific,
728
1020
  float_precision,
1021
+ decimal_comma,
729
1022
  null_value,
730
1023
  quote_style,
731
1024
  storage_options,
@@ -762,6 +1055,31 @@ module Polars
762
1055
  # Slice pushdown optimization.
763
1056
  # @param no_optimization [Boolean]
764
1057
  # Turn off (certain) optimizations.
1058
+ # @param storage_options [Hash]
1059
+ # Options that indicate how to connect to a cloud provider.
1060
+ #
1061
+ # The cloud providers currently supported are AWS, GCP, and Azure.
1062
+ # See supported keys here:
1063
+ #
1064
+ # * [aws](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html)
1065
+ # * [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html)
1066
+ # * [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html)
1067
+ # * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
1068
+ #
1069
+ # If `storage_options` is not provided, Polars will try to infer the
1070
+ # information from environment variables.
1071
+ # @param retries [Integer]
1072
+ # Number of retries if accessing a cloud instance fails.
1073
+ # @param sync_on_close ['data', 'all']
1074
+ # Sync to disk before closing a file.
1075
+ #
1076
+ # * `nil` does not sync.
1077
+ # * `data` syncs the file contents.
1078
+ # * `all` syncs the file contents and metadata.
1079
+ # @param mkdir [Boolean]
1080
+ # Recursively create all the directories in the path.
1081
+ # @param lazy [Boolean]
1082
+ # Wait to start execution until `collect` is called.
765
1083
  #
766
1084
  # @return [DataFrame]
767
1085
  #
@@ -854,25 +1172,6 @@ module Polars
854
1172
  #
855
1173
  # @param n_rows [Integer]
856
1174
  # Collect n_rows from the data sources.
857
- # @param type_coercion [Boolean]
858
- # Run type coercion optimization.
859
- # @param predicate_pushdown [Boolean]
860
- # Run predicate pushdown optimization.
861
- # @param projection_pushdown [Boolean]
862
- # Run projection pushdown optimization.
863
- # @param simplify_expression [Boolean]
864
- # Run simplify expressions optimization.
865
- # @param string_cache [Boolean]
866
- # This argument is deprecated. Please set the string cache globally.
867
- # The argument will be ignored
868
- # @param no_optimization [Boolean]
869
- # Turn off optimizations.
870
- # @param slice_pushdown [Boolean]
871
- # Slice pushdown optimization
872
- # @param common_subplan_elimination [Boolean]
873
- # Will try to cache branching subplans that occur on self-joins or unions.
874
- # @param allow_streaming [Boolean]
875
- # Run parts of the query in a streaming fashion (this is in an alpha state)
876
1175
  #
877
1176
  # @return [DataFrame]
878
1177
  #
@@ -892,41 +1191,11 @@ module Polars
892
1191
  # # │ --- ┆ --- ┆ --- │
893
1192
  # # │ str ┆ i64 ┆ i64 │
894
1193
  # # ╞═════╪═════╪═════╡
895
- # # │ a ┆ 1 ┆ 6 │
896
- # # │ b ┆ 2 ┆ 5 │
1194
+ # # │ a ┆ 4 ┆ 10 │
1195
+ # # │ b ┆ 11 ┆ 10 │
897
1196
  # # └─────┴─────┴─────┘
898
- def fetch(
899
- n_rows = 500,
900
- type_coercion: true,
901
- predicate_pushdown: true,
902
- projection_pushdown: true,
903
- simplify_expression: true,
904
- string_cache: false,
905
- no_optimization: false,
906
- slice_pushdown: true,
907
- common_subplan_elimination: true,
908
- comm_subexpr_elim: true,
909
- allow_streaming: false
910
- )
911
- if no_optimization
912
- predicate_pushdown = false
913
- projection_pushdown = false
914
- slice_pushdown = false
915
- common_subplan_elimination = false
916
- end
917
-
918
- ldf = _ldf.optimization_toggle(
919
- type_coercion,
920
- predicate_pushdown,
921
- projection_pushdown,
922
- simplify_expression,
923
- slice_pushdown,
924
- common_subplan_elimination,
925
- comm_subexpr_elim,
926
- allow_streaming,
927
- false
928
- )
929
- Utils.wrap_df(ldf.fetch(n_rows))
1197
+ def fetch(n_rows = 500, **kwargs)
1198
+ head(n_rows).collect(**kwargs)
930
1199
  end
931
1200
 
932
1201
  # Return lazy representation, i.e. itself.
@@ -1058,7 +1327,7 @@ module Polars
1058
1327
  # # │ null ┆ null ┆ null │
1059
1328
  # # └──────┴──────┴──────┘
1060
1329
  def clear(n = 0)
1061
- DataFrame.new(columns: schema).clear(n).lazy
1330
+ DataFrame.new(schema: schema).clear(n).lazy
1062
1331
  end
1063
1332
  alias_method :cleared, :clear
1064
1333
 
@@ -1108,6 +1377,140 @@ module Polars
1108
1377
  )
1109
1378
  end
1110
1379
 
1380
+ # Remove rows, dropping those that match the given predicate expression(s).
1381
+ #
1382
+ # The original order of the remaining rows is preserved.
1383
+ #
1384
+ # Rows where the filter predicate does not evaluate to true are retained
1385
+ # (this includes rows where the predicate evaluates as `null`).
1386
+ #
1387
+ # @param predicates [Array]
1388
+ # Expression that evaluates to a boolean Series.
1389
+ # @param constraints [Hash]
1390
+ # Column filters; use `name: value` to filter columns using the supplied
1391
+ # value. Each constraint behaves the same as `Polars.col(name).eq(value)`,
1392
+ # and is implicitly joined with the other filter conditions using `&`.
1393
+ #
1394
+ # @return [LazyFrame]
1395
+ #
1396
+ # @example Remove rows matching a condition:
1397
+ # lf = Polars::LazyFrame.new(
1398
+ # {
1399
+ # "foo" => [2, 3, nil, 4, 0],
1400
+ # "bar" => [5, 6, nil, nil, 0],
1401
+ # "ham" => ["a", "b", nil, "c", "d"]
1402
+ # }
1403
+ # )
1404
+ # lf.remove(
1405
+ # Polars.col("bar") >= 5
1406
+ # ).collect
1407
+ # # =>
1408
+ # # shape: (3, 3)
1409
+ # # ┌──────┬──────┬──────┐
1410
+ # # │ foo ┆ bar ┆ ham │
1411
+ # # │ --- ┆ --- ┆ --- │
1412
+ # # │ i64 ┆ i64 ┆ str │
1413
+ # # ╞══════╪══════╪══════╡
1414
+ # # │ null ┆ null ┆ null │
1415
+ # # │ 4 ┆ null ┆ c │
1416
+ # # │ 0 ┆ 0 ┆ d │
1417
+ # # └──────┴──────┴──────┘
1418
+ #
1419
+ # @example Discard rows based on multiple conditions, combined with and/or operators:
1420
+ # lf.remove(
1421
+ # (Polars.col("foo") >= 0) & (Polars.col("bar") >= 0)
1422
+ # ).collect
1423
+ # # =>
1424
+ # # shape: (2, 3)
1425
+ # # ┌──────┬──────┬──────┐
1426
+ # # │ foo ┆ bar ┆ ham │
1427
+ # # │ --- ┆ --- ┆ --- │
1428
+ # # │ i64 ┆ i64 ┆ str │
1429
+ # # ╞══════╪══════╪══════╡
1430
+ # # │ null ┆ null ┆ null │
1431
+ # # │ 4 ┆ null ┆ c │
1432
+ # # └──────┴──────┴──────┘
1433
+ #
1434
+ # @example
1435
+ # lf.remove(
1436
+ # (Polars.col("foo") >= 0) | (Polars.col("bar") >= 0)
1437
+ # ).collect
1438
+ # # =>
1439
+ # # shape: (1, 3)
1440
+ # # ┌──────┬──────┬──────┐
1441
+ # # │ foo ┆ bar ┆ ham │
1442
+ # # │ --- ┆ --- ┆ --- │
1443
+ # # │ i64 ┆ i64 ┆ str │
1444
+ # # ╞══════╪══════╪══════╡
1445
+ # # │ null ┆ null ┆ null │
1446
+ # # └──────┴──────┴──────┘
1447
+ #
1448
+ # @example Provide multiple constraints using `*args` syntax:
1449
+ # lf.remove(
1450
+ # Polars.col("ham").is_not_null,
1451
+ # Polars.col("bar") >= 0
1452
+ # ).collect
1453
+ # # =>
1454
+ # # shape: (2, 3)
1455
+ # # ┌──────┬──────┬──────┐
1456
+ # # │ foo ┆ bar ┆ ham │
1457
+ # # │ --- ┆ --- ┆ --- │
1458
+ # # │ i64 ┆ i64 ┆ str │
1459
+ # # ╞══════╪══════╪══════╡
1460
+ # # │ null ┆ null ┆ null │
1461
+ # # │ 4 ┆ null ┆ c │
1462
+ # # └──────┴──────┴──────┘
1463
+ #
1464
+ # @example Provide constraint(s) using `**kwargs` syntax:
1465
+ # lf.remove(foo: 0, bar: 0).collect
1466
+ # # =>
1467
+ # # shape: (4, 3)
1468
+ # # ┌──────┬──────┬──────┐
1469
+ # # │ foo ┆ bar ┆ ham │
1470
+ # # │ --- ┆ --- ┆ --- │
1471
+ # # │ i64 ┆ i64 ┆ str │
1472
+ # # ╞══════╪══════╪══════╡
1473
+ # # │ 2 ┆ 5 ┆ a │
1474
+ # # │ 3 ┆ 6 ┆ b │
1475
+ # # │ null ┆ null ┆ null │
1476
+ # # │ 4 ┆ null ┆ c │
1477
+ # # └──────┴──────┴──────┘
1478
+ #
1479
+ # @example Remove rows by comparing two columns against each other; in this case, we remove rows where the two columns are not equal (using `ne_missing` to ensure that null values compare equal):
1480
+ # lf.remove(
1481
+ # Polars.col("foo").ne_missing(Polars.col("bar"))
1482
+ # ).collect
1483
+ # # =>
1484
+ # # shape: (2, 3)
1485
+ # # ┌──────┬──────┬──────┐
1486
+ # # │ foo ┆ bar ┆ ham │
1487
+ # # │ --- ┆ --- ┆ --- │
1488
+ # # │ i64 ┆ i64 ┆ str │
1489
+ # # ╞══════╪══════╪══════╡
1490
+ # # │ null ┆ null ┆ null │
1491
+ # # │ 0 ┆ 0 ┆ d │
1492
+ # # └──────┴──────┴──────┘
1493
+ def remove(
1494
+ *predicates,
1495
+ **constraints
1496
+ )
1497
+ if constraints.empty?
1498
+ # early-exit conditions (exclude/include all rows)
1499
+ if predicates.empty? || (predicates.length == 1 && predicates[0].is_a?(TrueClass))
1500
+ return clear
1501
+ end
1502
+ if predicates.length == 1 && predicates[0].is_a?(FalseClass)
1503
+ return dup
1504
+ end
1505
+ end
1506
+
1507
+ _filter(
1508
+ predicates: predicates,
1509
+ constraints: constraints,
1510
+ invert: true
1511
+ )
1512
+ end
1513
+
1111
1514
  # Select columns from this DataFrame.
1112
1515
  #
1113
1516
  # @param exprs [Array]
@@ -1205,6 +1608,29 @@ module Polars
1205
1608
  _from_rbldf(_ldf.select(rbexprs))
1206
1609
  end
1207
1610
 
1611
+ # Select columns from this LazyFrame.
1612
+ #
1613
+ # This will run all expressions sequentially instead of in parallel.
1614
+ # Use this when the work per expression is cheap.
1615
+ #
1616
+ # @param exprs [Array]
1617
+ # Column(s) to select, specified as positional arguments.
1618
+ # Accepts expression input. Strings are parsed as column names,
1619
+ # other non-expression inputs are parsed as literals.
1620
+ # @param named_exprs [Hash]
1621
+ # Additional columns to select, specified as keyword arguments.
1622
+ # The columns will be renamed to the keyword used.
1623
+ #
1624
+ # @return [LazyFrame]
1625
+ def select_seq(*exprs, **named_exprs)
1626
+ structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", 0).to_i != 0
1627
+
1628
+ rbexprs = Utils.parse_into_list_of_expressions(
1629
+ *exprs, **named_exprs, __structify: structify
1630
+ )
1631
+ _from_rbldf(_ldf.select_seq(rbexprs))
1632
+ end
1633
+
1208
1634
  # Start a group by operation.
1209
1635
  #
1210
1636
  # @param by [Array]
@@ -1401,9 +1827,9 @@ module Polars
1401
1827
  # @param every [Object]
1402
1828
  # Interval of the window.
1403
1829
  # @param period [Object]
1404
- # Length of the window, if None it is equal to 'every'.
1830
+ # Length of the window, if nil it is equal to 'every'.
1405
1831
  # @param offset [Object]
1406
- # Offset of the window if None and period is None it will be equal to negative
1832
+ # Offset of the window; if nil and period is nil, it will be equal to negative
1407
1833
  # `every`.
1408
1834
  # @param truncate [Boolean]
1409
1835
  # Truncate the time value to the window lower bound.
@@ -1413,8 +1839,32 @@ module Polars
1413
1839
  # parallelize
1414
1840
  # @param closed ["right", "left", "both", "none"]
1415
1841
  # Define whether the temporal window interval is closed or not.
1842
+ # @param label ['left', 'right', 'datapoint']
1843
+ # Define which label to use for the window:
1844
+ #
1845
+ # - 'left': lower boundary of the window
1846
+ # - 'right': upper boundary of the window
1847
+ # - 'datapoint': the first value of the index column in the given window.
1848
+ # If you don't need the label to be at one of the boundaries, choose this
1849
+ # option for maximum performance
1416
1850
  # @param by [Object]
1417
1851
  # Also group by this column/these columns
1852
+ # @param start_by ['window', 'datapoint', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
1853
+ # The strategy to determine the start of the first window by.
1854
+ #
1855
+ # * 'window': Start by taking the earliest timestamp, truncating it with
1856
+ # `every`, and then adding `offset`.
1857
+ # Note that weekly windows start on Monday.
1858
+ # * 'datapoint': Start from the first encountered data point.
1859
+ # * a day of the week (only takes effect if `every` contains `'w'`):
1860
+ #
1861
+ # * 'monday': Start the window on the Monday before the first data point.
1862
+ # * 'tuesday': Start the window on the Tuesday before the first data point.
1863
+ # * ...
1864
+ # * 'sunday': Start the window on the Sunday before the first data point.
1865
+ #
1866
+ # The resulting window is then shifted back until the earliest datapoint
1867
+ # is in or in front of it.
1418
1868
  #
1419
1869
  # @return [DataFrame]
1420
1870
  #
@@ -1651,13 +2101,13 @@ module Polars
1651
2101
  # Join column of the right DataFrame.
1652
2102
  # @param on [String]
1653
2103
  # Join column of both DataFrames. If set, `left_on` and `right_on` should be
1654
- # None.
1655
- # @param by [Object]
1656
- # Join on these columns before doing asof join.
2104
+ # nil.
1657
2105
  # @param by_left [Object]
1658
2106
  # Join on these columns before doing asof join.
1659
2107
  # @param by_right [Object]
1660
2108
  # Join on these columns before doing asof join.
2109
+ # @param by [Object]
2110
+ # Join on these columns before doing asof join.
1661
2111
  # @param strategy ["backward", "forward"]
1662
2112
  # Join strategy.
1663
2113
  # @param suffix [String]
@@ -1873,7 +2323,7 @@ module Polars
1873
2323
  # # └─────────────┴────────────┴────────────┘
1874
2324
  #
1875
2325
  # @example
1876
- # pop2.join_asof(gdp2, by: "country", on: "date", strategy: "nearest").collect
2326
+ # pop2.join_asof(gdp2, by: "country", on: "date", strategy: "nearest", check_sortedness: false).collect
1877
2327
  # # =>
1878
2328
  # # shape: (6, 4)
1879
2329
  # # ┌─────────────┬────────────┬────────────┬──────┐
@@ -1976,7 +2426,7 @@ module Polars
1976
2426
  # Join column of the right DataFrame.
1977
2427
  # @param on Object
1978
2428
  # Join column of both DataFrames. If set, `left_on` and `right_on` should be
1979
- # None.
2429
+ # nil.
1980
2430
  # @param how ["inner", "left", "full", "semi", "anti", "cross"]
1981
2431
  # Join strategy.
1982
2432
  # @param suffix [String]
@@ -2171,10 +2621,110 @@ module Polars
2171
2621
  )
2172
2622
  end
2173
2623
 
2624
+ # Perform a join based on one or multiple (in)equality predicates.
2625
+ #
2626
+ # This performs an inner join, so only rows where all predicates are true
2627
+ # are included in the result, and a row from either DataFrame may be included
2628
+ # multiple times in the result.
2629
+ #
2630
+ # @note
2631
+ # The row order of the input DataFrames is not preserved.
2632
+ #
2633
+ # @note
2634
+ # This functionality is experimental. It may be
2635
+ # changed at any point without it being considered a breaking change.
2636
+ #
2637
+ # @param other [Object]
2638
+ # LazyFrame to join with.
2639
+ # @param predicates [Object]
2640
+ # (In)Equality condition to join the two tables on.
2641
+ # When a column name occurs in both tables, the proper suffix must
2642
+ # be applied in the predicate.
2643
+ # @param suffix [String]
2644
+ # Suffix to append to columns with a duplicate name.
2645
+ #
2646
+ # @return [LazyFrame]
2647
+ #
2648
+ # @example Join two lazyframes together based on two predicates which get AND-ed together.
2649
+ # east = Polars::LazyFrame.new(
2650
+ # {
2651
+ # "id" => [100, 101, 102],
2652
+ # "dur" => [120, 140, 160],
2653
+ # "rev" => [12, 14, 16],
2654
+ # "cores" => [2, 8, 4]
2655
+ # }
2656
+ # )
2657
+ # west = Polars::LazyFrame.new(
2658
+ # {
2659
+ # "t_id" => [404, 498, 676, 742],
2660
+ # "time" => [90, 130, 150, 170],
2661
+ # "cost" => [9, 13, 15, 16],
2662
+ # "cores" => [4, 2, 1, 4]
2663
+ # }
2664
+ # )
2665
+ # east.join_where(
2666
+ # west,
2667
+ # Polars.col("dur") < Polars.col("time"),
2668
+ # Polars.col("rev") < Polars.col("cost")
2669
+ # ).collect
2670
+ # # =>
2671
+ # # shape: (5, 8)
2672
+ # # ┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐
2673
+ # # │ id ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │
2674
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2675
+ # # │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
2676
+ # # ╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡
2677
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 498 ┆ 130 ┆ 13 ┆ 2 │
2678
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
2679
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
2680
+ # # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
2681
+ # # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
2682
+ # # └─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
2683
+ #
2684
+ # @example To OR them together, use a single expression and the `|` operator.
2685
+ # east.join_where(
2686
+ # west,
2687
+ # (Polars.col("dur") < Polars.col("time")) | (Polars.col("rev") < Polars.col("cost"))
2688
+ # ).collect
2689
+ # # =>
2690
+ # # shape: (6, 8)
2691
+ # # ┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐
2692
+ # # │ id ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │
2693
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2694
+ # # │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
2695
+ # # ╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡
2696
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 498 ┆ 130 ┆ 13 ┆ 2 │
2697
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
2698
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
2699
+ # # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
2700
+ # # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
2701
+ # # │ 102 ┆ 160 ┆ 16 ┆ 4 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
2702
+ # # └─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
2703
+ def join_where(
2704
+ other,
2705
+ *predicates,
2706
+ suffix: "_right"
2707
+ )
2708
+ Utils.require_same_type(self, other)
2709
+
2710
+ rbexprs = Utils.parse_into_list_of_expressions(*predicates)
2711
+
2712
+ _from_rbldf(
2713
+ _ldf.join_where(
2714
+ other._ldf,
2715
+ rbexprs,
2716
+ suffix
2717
+ )
2718
+ )
2719
+ end
2720
+
2174
2721
  # Add or overwrite multiple columns in a DataFrame.
2175
2722
  #
2176
2723
  # @param exprs [Object]
2177
2724
  # List of Expressions that evaluate to columns.
2725
+ # @param named_exprs [Hash]
2726
+ # Additional columns to add, specified as keyword arguments.
2727
+ # The columns will be renamed to the keyword used.
2178
2728
  #
2179
2729
  # @return [LazyFrame]
2180
2730
  #
@@ -2213,6 +2763,34 @@ module Polars
2213
2763
  _from_rbldf(_ldf.with_columns(rbexprs))
2214
2764
  end
2215
2765
 
2766
+ # Add columns to this LazyFrame.
2767
+ #
2768
+ # Added columns will replace existing columns with the same name.
2769
+ #
2770
+ # This will run all expressions sequentially instead of in parallel.
2771
+ # Use this when the work per expression is cheap.
2772
+ #
2773
+ # @param exprs [Array]
2774
+ # Column(s) to add, specified as positional arguments.
2775
+ # Accepts expression input. Strings are parsed as column names, other
2776
+ # non-expression inputs are parsed as literals.
2777
+ # @param named_exprs [Hash]
2778
+ # Additional columns to add, specified as keyword arguments.
2779
+ # The columns will be renamed to the keyword used.
2780
+ #
2781
+ # @return [LazyFrame]
2782
+ def with_columns_seq(
2783
+ *exprs,
2784
+ **named_exprs
2785
+ )
2786
+ structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", 0).to_i != 0
2787
+
2788
+ rbexprs = Utils.parse_into_list_of_expressions(
2789
+ *exprs, **named_exprs, __structify: structify
2790
+ )
2791
+ _from_rbldf(_ldf.with_columns_seq(rbexprs))
2792
+ end
2793
+
2216
2794
  # Add an external context to the computation graph.
2217
2795
  #
2218
2796
  # This allows expressions to also access columns from DataFrames
@@ -2299,6 +2877,9 @@ module Polars
2299
2877
  # @param columns [Object]
2300
2878
  # - Name of the column that should be removed.
2301
2879
  # - List of column names.
2880
+ # @param strict [Boolean]
2881
+ # Validate that all column names exist in the current schema,
2882
+ # and throw an exception if any do not.
2302
2883
  #
2303
2884
  # @return [LazyFrame]
2304
2885
  #
@@ -2350,9 +2931,18 @@ module Polars
2350
2931
  # # │ 7.0 │
2351
2932
  # # │ 8.0 │
2352
2933
  # # └─────┘
2353
- def drop(*columns)
2354
- drop_cols = Utils._expand_selectors(self, *columns)
2355
- _from_rbldf(_ldf.drop(drop_cols))
2934
+ def drop(*columns, strict: true)
2935
+ selectors = []
2936
+ columns.each do |c|
2937
+ if c.is_a?(Enumerable)
2938
+ selectors += c
2939
+ else
2940
+ selectors += [c]
2941
+ end
2942
+ end
2943
+
2944
+ drop_cols = Utils.parse_list_into_selector(selectors, strict: strict)
2945
+ _from_rbldf(_ldf.drop(drop_cols._rbselector))
2356
2946
  end
2357
2947
 
2358
2948
  # Rename column names.
@@ -2809,7 +3399,7 @@ module Polars
2809
3399
  #
2810
3400
  # @example
2811
3401
  # s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]}).lazy
2812
- # s.take_every(2).collect
3402
+ # s.gather_every(2).collect
2813
3403
  # # =>
2814
3404
  # # shape: (2, 2)
2815
3405
  # # ┌─────┬─────┐
@@ -2820,9 +3410,10 @@ module Polars
2820
3410
  # # │ 1 ┆ 5 │
2821
3411
  # # │ 3 ┆ 7 │
2822
3412
  # # └─────┴─────┘
2823
- def take_every(n)
2824
- select(F.col("*").take_every(n))
3413
+ def gather_every(n)
3414
+ select(F.col("*").gather_every(n))
2825
3415
  end
3416
+ alias_method :take_every, :gather_every
2826
3417
 
2827
3418
  # Fill null values using the specified value or strategy.
2828
3419
  #
@@ -3099,6 +3690,32 @@ module Polars
3099
3690
  _from_rbldf(_ldf.median)
3100
3691
  end
3101
3692
 
3693
+ # Aggregate the columns in the LazyFrame as the sum of their null value count.
3694
+ #
3695
+ # @return [LazyFrame]
3696
+ #
3697
+ # @example
3698
+ # lf = Polars::LazyFrame.new(
3699
+ # {
3700
+ # "foo" => [1, nil, 3],
3701
+ # "bar" => [6, 7, nil],
3702
+ # "ham" => ["a", "b", "c"]
3703
+ # }
3704
+ # )
3705
+ # lf.null_count.collect
3706
+ # # =>
3707
+ # # shape: (1, 3)
3708
+ # # ┌─────┬─────┬─────┐
3709
+ # # │ foo ┆ bar ┆ ham │
3710
+ # # │ --- ┆ --- ┆ --- │
3711
+ # # │ u32 ┆ u32 ┆ u32 │
3712
+ # # ╞═════╪═════╪═════╡
3713
+ # # │ 1 ┆ 1 ┆ 0 │
3714
+ # # └─────┴─────┴─────┘
3715
+ def null_count
3716
+ _from_rbldf(_ldf.null_count)
3717
+ end
3718
+
3102
3719
  # Aggregate the columns in the DataFrame to their quantile value.
3103
3720
  #
3104
3721
  # @param quantile [Float]
@@ -3153,9 +3770,11 @@ module Polars
3153
3770
  # # │ c ┆ 7 │
3154
3771
  # # │ c ┆ 8 │
3155
3772
  # # └─────────┴─────────┘
3156
- def explode(columns)
3157
- columns = Utils.parse_into_list_of_expressions(columns)
3158
- _from_rbldf(_ldf.explode(columns))
3773
+ def explode(columns, *more_columns)
3774
+ subset = Utils.parse_list_into_selector(columns) | Utils.parse_list_into_selector(
3775
+ more_columns
3776
+ )
3777
+ _from_rbldf(_ldf.explode(subset._rbselector))
3159
3778
  end
3160
3779
 
3161
3780
  # Drop duplicate rows from this DataFrame.
@@ -3220,43 +3839,110 @@ module Polars
3220
3839
  # # │ 1 ┆ a ┆ b │
3221
3840
  # # └─────┴─────┴─────┘
3222
3841
  def unique(maintain_order: true, subset: nil, keep: "first")
3223
- if !subset.nil? && !subset.is_a?(::Array)
3224
- subset = [subset]
3842
+ selector_subset = nil
3843
+ if !subset.nil?
3844
+ selector_subset = Utils.parse_list_into_selector(subset)._rbselector
3225
3845
  end
3226
- _from_rbldf(_ldf.unique(maintain_order, subset, keep))
3846
+ _from_rbldf(_ldf.unique(maintain_order, selector_subset, keep))
3227
3847
  end
3228
3848
 
3229
- # Drop rows with null values from this LazyFrame.
3849
+ # Drop all rows that contain one or more NaN values.
3850
+ #
3851
+ # The original order of the remaining rows is preserved.
3230
3852
  #
3231
3853
  # @param subset [Object]
3232
- # Subset of column(s) on which `drop_nulls` will be applied.
3854
+ # Column name(s) for which NaN values are considered; if set to `nil`
3855
+ # (default), use all columns (note that only floating-point columns
3856
+ # can contain NaNs).
3233
3857
  #
3234
3858
  # @return [LazyFrame]
3235
3859
  #
3236
3860
  # @example
3237
- # df = Polars::DataFrame.new(
3861
+ # lf = Polars::LazyFrame.new(
3862
+ # {
3863
+ # "foo" => [-20.5, Float::NAN, 80.0],
3864
+ # "bar" => [Float::NAN, 110.0, 25.5],
3865
+ # "ham" => ["xxx", "yyy", nil]
3866
+ # }
3867
+ # )
3868
+ # lf.drop_nans.collect
3869
+ # # =>
3870
+ # # shape: (1, 3)
3871
+ # # ┌──────┬──────┬──────┐
3872
+ # # │ foo ┆ bar ┆ ham │
3873
+ # # │ --- ┆ --- ┆ --- │
3874
+ # # │ f64 ┆ f64 ┆ str │
3875
+ # # ╞══════╪══════╪══════╡
3876
+ # # │ 80.0 ┆ 25.5 ┆ null │
3877
+ # # └──────┴──────┴──────┘
3878
+ #
3879
+ # @example
3880
+ # lf.drop_nans(subset: ["bar"]).collect
3881
+ # # =>
3882
+ # # shape: (2, 3)
3883
+ # # ┌──────┬───────┬──────┐
3884
+ # # │ foo ┆ bar ┆ ham │
3885
+ # # │ --- ┆ --- ┆ --- │
3886
+ # # │ f64 ┆ f64 ┆ str │
3887
+ # # ╞══════╪═══════╪══════╡
3888
+ # # │ NaN ┆ 110.0 ┆ yyy │
3889
+ # # │ 80.0 ┆ 25.5 ┆ null │
3890
+ # # └──────┴───────┴──────┘
3891
+ def drop_nans(subset: nil)
3892
+ selector_subset = nil
3893
+ if !subset.nil?
3894
+ selector_subset = Utils.parse_list_into_selector(subset)._rbselector
3895
+ end
3896
+ _from_rbldf(_ldf.drop_nans(selector_subset))
3897
+ end
3898
+
3899
+ # Drop all rows that contain one or more null values.
3900
+ #
3901
+ # The original order of the remaining rows is preserved.
3902
+ #
3903
+ # @param subset [Object]
3904
+ # Column name(s) for which null values are considered.
3905
+ # If set to `nil` (default), use all columns.
3906
+ #
3907
+ # @return [LazyFrame]
3908
+ #
3909
+ # @example
3910
+ # lf = Polars::LazyFrame.new(
3238
3911
  # {
3239
3912
  # "foo" => [1, 2, 3],
3240
3913
  # "bar" => [6, nil, 8],
3241
- # "ham" => ["a", "b", "c"]
3914
+ # "ham" => ["a", "b", nil]
3242
3915
  # }
3243
3916
  # )
3244
- # df.lazy.drop_nulls.collect
3917
+ # lf.drop_nulls.collect
3245
3918
  # # =>
3246
- # # shape: (2, 3)
3919
+ # # shape: (1, 3)
3247
3920
  # # ┌─────┬─────┬─────┐
3248
3921
  # # │ foo ┆ bar ┆ ham │
3249
3922
  # # │ --- ┆ --- ┆ --- │
3250
3923
  # # │ i64 ┆ i64 ┆ str │
3251
3924
  # # ╞═════╪═════╪═════╡
3252
3925
  # # │ 1 ┆ 6 ┆ a │
3253
- # # │ 3 ┆ 8 ┆ c │
3254
3926
  # # └─────┴─────┴─────┘
3927
+ #
3928
+ # @example
3929
+ # lf.drop_nulls(subset: Polars.cs.integer).collect
3930
+ # # =>
3931
+ # # shape: (2, 3)
3932
+ # # ┌─────┬─────┬──────┐
3933
+ # # │ foo ┆ bar ┆ ham │
3934
+ # # │ --- ┆ --- ┆ --- │
3935
+ # # │ i64 ┆ i64 ┆ str │
3936
+ # # ╞═════╪═════╪══════╡
3937
+ # # │ 1 ┆ 6 ┆ a │
3938
+ # # │ 3 ┆ 8 ┆ null │
3939
+ # # └─────┴─────┴──────┘
3255
3940
  def drop_nulls(subset: nil)
3256
- if !subset.nil? && !subset.is_a?(::Array)
3257
- subset = [subset]
3941
+ selector_subset = nil
3942
+ if !subset.nil?
3943
+ selector_subset = Utils.parse_list_into_selector(subset)._rbselector
3258
3944
  end
3259
- _from_rbldf(_ldf.drop_nulls(subset))
3945
+ _from_rbldf(_ldf.drop_nulls(selector_subset))
3260
3946
  end
3261
3947
 
3262
3948
  # Unpivot a DataFrame from wide to long format.
@@ -3318,11 +4004,16 @@ module Polars
3318
4004
  warn "The `streamable` parameter for `LazyFrame.unpivot` is deprecated"
3319
4005
  end
3320
4006
 
3321
- on = on.nil? ? [] : Utils.parse_into_list_of_expressions(on)
3322
- index = index.nil? ? [] : Utils.parse_into_list_of_expressions(index)
4007
+ selector_on = on.nil? ? Selectors.empty : Utils.parse_list_into_selector(on)
4008
+ selector_index = index.nil? ? Selectors.empty : Utils.parse_list_into_selector(index)
3323
4009
 
3324
4010
  _from_rbldf(
3325
- _ldf.unpivot(on, index, value_name, variable_name)
4011
+ _ldf.unpivot(
4012
+ selector_on._rbselector,
4013
+ selector_index._rbselector,
4014
+ value_name,
4015
+ variable_name
4016
+ )
3326
4017
  )
3327
4018
  end
3328
4019
  alias_method :melt, :unpivot
@@ -3364,8 +4055,10 @@ module Polars
3364
4055
  # The fields will be inserted into the `DataFrame` on the location of the
3365
4056
  # `struct` type.
3366
4057
  #
3367
- # @param names [Object]
4058
+ # @param columns [Object]
3368
4059
  # Names of the struct columns that will be decomposed by its fields
4060
+ # @param more_columns [Array]
4061
+ # Additional columns to unnest, specified as positional arguments.
3369
4062
  #
3370
4063
  # @return [LazyFrame]
3371
4064
  #
@@ -3410,11 +4103,11 @@ module Polars
3410
4103
  # # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
3411
4104
  # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
3412
4105
  # # └────────┴─────┴─────┴──────┴───────────┴───────┘
3413
- def unnest(names)
3414
- if names.is_a?(::String)
3415
- names = [names]
3416
- end
3417
- _from_rbldf(_ldf.unnest(names))
4106
+ def unnest(columns, *more_columns)
4107
+ subset = Utils.parse_list_into_selector(columns) | Utils.parse_list_into_selector(
4108
+ more_columns
4109
+ )
4110
+ _from_rbldf(_ldf.unnest(subset._rbselector))
3418
4111
  end
3419
4112
 
3420
4113
  # Take two sorted DataFrames and merge them by the sorted key.
@@ -3483,9 +4176,261 @@ module Polars
3483
4176
  with_columns(F.col(column).set_sorted(descending: descending))
3484
4177
  end
3485
4178
 
3486
- # TODO
3487
- # def update
3488
- # end
4179
+ # Update the values in this `LazyFrame` with the values in `other`.
4180
+ #
4181
+ # @note
4182
+ # This functionality is considered **unstable**. It may be changed
4183
+ # at any point without it being considered a breaking change.
4184
+ #
4185
+ # @param other [LazyFrame]
4186
+ # LazyFrame that will be used to update the values
4187
+ # @param on [Object]
4188
+ # Column names that will be joined on. If set to `nil` (default),
4189
+ # the implicit row index of each frame is used as a join key.
4190
+ # @param how ['left', 'inner', 'full']
4191
+ # * 'left' will keep all rows from the left table; rows may be duplicated
4192
+ # if multiple rows in the right frame match the left row's key.
4193
+ # * 'inner' keeps only those rows where the key exists in both frames.
4194
+ # * 'full' will update existing rows where the key matches while also
4195
+ # adding any new rows contained in the given frame.
4196
+ # @param left_on [Object]
4197
+ # Join column(s) of the left frame.
4198
+ # @param right_on [Object]
4199
+ # Join column(s) of the right frame.
4200
+ # @param include_nulls [Boolean]
4201
+ # Overwrite values in the left frame with null values from the right frame.
4202
+ # If set to `false` (default), null values in the right frame are ignored.
4203
+ # @param maintain_order ['none', 'left', 'right', 'left_right', 'right_left']
4204
+ # Which order of rows from the inputs to preserve. See `LazyFrame.join`
4205
+ # for details. Unlike `join` this function preserves the left order by
4206
+ # default.
4207
+ #
4208
+ # @return [LazyFrame]
4209
+ #
4210
+ # @note
4211
+ # This is syntactic sugar for a left/inner join that preserves the order
4212
+ # of the left `DataFrame` by default, with an optional coalesce when
4213
+ # `include_nulls: false`.
4214
+ #
4215
+ # @example Update `lf` values with the non-null values in `new_lf`, by row index:
4216
+ # lf = Polars::LazyFrame.new(
4217
+ # {
4218
+ # "A" => [1, 2, 3, 4],
4219
+ # "B" => [400, 500, 600, 700]
4220
+ # }
4221
+ # )
4222
+ # new_lf = Polars::LazyFrame.new(
4223
+ # {
4224
+ # "B" => [-66, nil, -99],
4225
+ # "C" => [5, 3, 1]
4226
+ # }
4227
+ # )
4228
+ # lf.update(new_lf).collect
4229
+ # # =>
4230
+ # # shape: (4, 2)
4231
+ # # ┌─────┬─────┐
4232
+ # # │ A ┆ B │
4233
+ # # │ --- ┆ --- │
4234
+ # # │ i64 ┆ i64 │
4235
+ # # ╞═════╪═════╡
4236
+ # # │ 1 ┆ -66 │
4237
+ # # │ 2 ┆ 500 │
4238
+ # # │ 3 ┆ -99 │
4239
+ # # │ 4 ┆ 700 │
4240
+ # # └─────┴─────┘
4241
+ #
4242
+ # @example Update `lf` values with the non-null values in `new_lf`, by row index, but only keeping those rows that are common to both frames:
4243
+ # lf.update(new_lf, how: "inner").collect
4244
+ # # =>
4245
+ # # shape: (3, 2)
4246
+ # # ┌─────┬─────┐
4247
+ # # │ A ┆ B │
4248
+ # # │ --- ┆ --- │
4249
+ # # │ i64 ┆ i64 │
4250
+ # # ╞═════╪═════╡
4251
+ # # │ 1 ┆ -66 │
4252
+ # # │ 2 ┆ 500 │
4253
+ # # │ 3 ┆ -99 │
4254
+ # # └─────┴─────┘
4255
+ #
4256
+ # @example Update `lf` values with the non-null values in `new_lf`, using a full outer join strategy that defines explicit join columns in each frame:
4257
+ # lf.update(new_lf, left_on: ["A"], right_on: ["C"], how: "full").collect
4258
+ # # =>
4259
+ # # shape: (5, 2)
4260
+ # # ┌─────┬─────┐
4261
+ # # │ A ┆ B │
4262
+ # # │ --- ┆ --- │
4263
+ # # │ i64 ┆ i64 │
4264
+ # # ╞═════╪═════╡
4265
+ # # │ 1 ┆ -99 │
4266
+ # # │ 2 ┆ 500 │
4267
+ # # │ 3 ┆ 600 │
4268
+ # # │ 4 ┆ 700 │
4269
+ # # │ 5 ┆ -66 │
4270
+ # # └─────┴─────┘
4271
+ #
4272
+ # @example Update `lf` values including null values in `new_lf`, using a full outer join strategy that defines explicit join columns in each frame:
4273
+ # lf.update(
4274
+ # new_lf, left_on: "A", right_on: "C", how: "full", include_nulls: true
4275
+ # ).collect
4276
+ # # =>
4277
+ # # shape: (5, 2)
4278
+ # # ┌─────┬──────┐
4279
+ # # │ A ┆ B │
4280
+ # # │ --- ┆ --- │
4281
+ # # │ i64 ┆ i64 │
4282
+ # # ╞═════╪══════╡
4283
+ # # │ 1 ┆ -99 │
4284
+ # # │ 2 ┆ 500 │
4285
+ # # │ 3 ┆ null │
4286
+ # # │ 4 ┆ 700 │
4287
+ # # │ 5 ┆ -66 │
4288
+ # # └─────┴──────┘
4289
+ def update(
4290
+ other,
4291
+ on: nil,
4292
+ how: "left",
4293
+ left_on: nil,
4294
+ right_on: nil,
4295
+ include_nulls: false,
4296
+ maintain_order: "left"
4297
+ )
4298
+ Utils.require_same_type(self, other)
4299
+ if ["outer", "outer_coalesce"].include?(how)
4300
+ how = "full"
4301
+ end
4302
+
4303
+ if !["left", "inner", "full"].include?(how)
4304
+ msg = "`how` must be one of {{'left', 'inner', 'full'}}; found #{how.inspect}"
4305
+ raise ArgumentError, msg
4306
+ end
4307
+
4308
+ slf = self
4309
+ row_index_used = false
4310
+ if on.nil?
4311
+ if left_on.nil? && right_on.nil?
4312
+ # no keys provided--use row index
4313
+ row_index_used = true
4314
+ row_index_name = "__POLARS_ROW_INDEX"
4315
+ slf = slf.with_row_index(name: row_index_name)
4316
+ other = other.with_row_index(name: row_index_name)
4317
+ left_on = right_on = [row_index_name]
4318
+ else
4319
+ # one of left or right is missing, raise error
4320
+ if left_on.nil?
4321
+ msg = "missing join columns for left frame"
4322
+ raise ArgumentError, msg
4323
+ end
4324
+ if right_on.nil?
4325
+ msg = "missing join columns for right frame"
4326
+ raise ArgumentError, msg
4327
+ end
4328
+ end
4329
+ else
4330
+ # move on into left/right_on to simplify logic
4331
+ left_on = right_on = on
4332
+ end
4333
+
4334
+ if left_on.is_a?(::String)
4335
+ left_on = [left_on]
4336
+ end
4337
+ if right_on.is_a?(::String)
4338
+ right_on = [right_on]
4339
+ end
4340
+
4341
+ left_schema = slf.collect_schema
4342
+ left_on.each do |name|
4343
+ if !left_schema.include?(name)
4344
+ msg = "left join column #{name.inspect} not found"
4345
+ raise ArgumentError, msg
4346
+ end
4347
+ end
4348
+ right_schema = other.collect_schema
4349
+ right_on.each do |name|
4350
+ if !right_schema.include?(name)
4351
+ msg = "right join column #{name.inspect} not found"
4352
+ raise ArgumentError, msg
4353
+ end
4354
+ end
4355
+
4356
+ # no need to join if *only* join columns are in other (inner/left update only)
4357
+ if how != "full" && right_schema.length == right_on.length
4358
+ if row_index_used
4359
+ return slf.drop(row_index_name)
4360
+ end
4361
+ return slf
4362
+ end
4363
+
4364
+ # only use non-idx right columns present in left frame
4365
+ right_other = Set.new(right_schema.to_h.keys).intersection(left_schema.to_h.keys) - Set.new(right_on)
4366
+
4367
+ # When include_nulls is True, we need to distinguish records after the join that
4368
+ # were originally null in the right frame, as opposed to records that were null
4369
+ # because the key was missing from the right frame.
4370
+ # Add a validity column to track whether row was matched or not.
4371
+ if include_nulls
4372
+ validity = ["__POLARS_VALIDITY"]
4373
+ other = other.with_columns(F.lit(true).alias(validity[0]))
4374
+ else
4375
+ validity = []
4376
+ end
4377
+
4378
+ tmp_name = "__POLARS_RIGHT"
4379
+ drop_columns = right_other.map { |name| "#{name}#{tmp_name}" } + validity
4380
+ result = (
4381
+ slf.join(
4382
+ other.select(*right_on, *right_other, *validity),
4383
+ left_on: left_on,
4384
+ right_on: right_on,
4385
+ how: how,
4386
+ suffix: tmp_name,
4387
+ coalesce: true,
4388
+ maintain_order: maintain_order
4389
+ )
4390
+ .with_columns(
4391
+ right_other.map do |name|
4392
+ (
4393
+ if include_nulls
4394
+ # use left value only when right value failed to join
4395
+ F.when(F.col(validity).is_null)
4396
+ .then(F.col(name))
4397
+ .otherwise(F.col("#{name}#{tmp_name}"))
4398
+ else
4399
+ F.coalesce(["#{name}#{tmp_name}", F.col(name)])
4400
+ end
4401
+ ).alias(name)
4402
+ end
4403
+ )
4404
+ .drop(drop_columns)
4405
+ )
4406
+ if row_index_used
4407
+ result = result.drop(row_index_name)
4408
+ end
4409
+
4410
+ _from_rbldf(result._ldf)
4411
+ end
4412
+
4413
+ # Return the number of non-null elements for each column.
4414
+ #
4415
+ # @return [LazyFrame]
4416
+ #
4417
+ # @example
4418
+ # lf = Polars::LazyFrame.new(
4419
+ # {"a" => [1, 2, 3, 4], "b" => [1, 2, 1, nil], "c" => [nil, nil, nil, nil]}
4420
+ # )
4421
+ # lf.count.collect
4422
+ # # =>
4423
+ # # shape: (1, 3)
4424
+ # # ┌─────┬─────┬─────┐
4425
+ # # │ a ┆ b ┆ c │
4426
+ # # │ --- ┆ --- ┆ --- │
4427
+ # # │ u32 ┆ u32 ┆ u32 │
4428
+ # # ╞═════╪═════╪═════╡
4429
+ # # │ 4 ┆ 3 ┆ 0 │
4430
+ # # └─────┴─────┴─────┘
4431
+ def count
4432
+ _from_rbldf(_ldf.count)
4433
+ end
3489
4434
 
3490
4435
  private
3491
4436
 
@@ -3497,5 +4442,64 @@ module Polars
3497
4442
  def _from_rbldf(rb_ldf)
3498
4443
  self.class._from_rbldf(rb_ldf)
3499
4444
  end
4445
+
4446
+ def _filter(
4447
+ predicates:,
4448
+ constraints:,
4449
+ invert: false
4450
+ )
4451
+ all_predicates = []
4452
+ boolean_masks = []
4453
+
4454
+ predicates.each do |p|
4455
+ # quick exit/skip conditions
4456
+ if (p.is_a?(FalseClass) && invert) || (p.is_a?(TrueClass) && !invert)
4457
+ next # ignore; doesn't filter/remove anything
4458
+ end
4459
+ if (p.is_a?(TrueClass) && invert) || (p.is_a?(FalseClass) && !invert)
4460
+ return clear # discard all rows
4461
+ end
4462
+
4463
+ # note: identify masks separately from predicates
4464
+ if Utils.is_bool_sequence(p, include_series: true)
4465
+ boolean_masks << Polars::Series.new(p, dtype: Boolean)
4466
+ elsif (
4467
+ (is_seq = Utils.is_sequence(p)) && p.any? { |x| !x.is_a?(Expr) }) ||
4468
+ (!is_seq && !p.is_a?(Expr) && !(p.is_a?(::String) && collect_schema.include?(p))
4469
+ )
4470
+ err = p.is_a?(Series) ? "Series(…, dtype: #{p.dtype})" : p.inspect
4471
+ msg = "invalid predicate for `filter`: #{err}"
4472
+ raise TypeError, msg
4473
+ else
4474
+ all_predicates.concat(
4475
+ Utils.parse_into_list_of_expressions(p).map { |x| Utils.wrap_expr(x) }
4476
+ )
4477
+ end
4478
+ end
4479
+
4480
+ # unpack equality constraints from kwargs
4481
+ all_predicates.concat(
4482
+ constraints.map { |name, value| F.col(name).eq(value) }
4483
+ )
4484
+ if !(all_predicates.any? || boolean_masks.any?)
4485
+ msg = "at least one predicate or constraint must be provided"
4486
+ raise TypeError, msg
4487
+ end
4488
+
4489
+ # if multiple predicates, combine as 'horizontal' expression
4490
+ combined_predicate = all_predicates ? (all_predicates.length > 1 ? F.all_horizontal(*all_predicates) : all_predicates[0]) : nil
4491
+
4492
+ # apply reduced boolean mask first, if applicable, then predicates
4493
+ if boolean_masks.any?
4494
+ raise Todo
4495
+ end
4496
+
4497
+ if combined_predicate.nil?
4498
+ return _from_rbldf(_ldf)
4499
+ end
4500
+
4501
+ filter_method = invert ? _ldf.method(:remove) : _ldf.method(:filter)
4502
+ _from_rbldf(filter_method.(combined_predicate._rbexpr))
4503
+ end
3500
4504
  end
3501
4505
  end