polars-df 0.21.0-arm64-darwin → 0.22.0-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +27 -0
  3. data/Cargo.lock +55 -48
  4. data/Cargo.toml +3 -0
  5. data/LICENSE-THIRD-PARTY.txt +23 -49
  6. data/README.md +12 -0
  7. data/lib/polars/3.2/polars.bundle +0 -0
  8. data/lib/polars/3.3/polars.bundle +0 -0
  9. data/lib/polars/3.4/polars.bundle +0 -0
  10. data/lib/polars/array_expr.rb +382 -3
  11. data/lib/polars/array_name_space.rb +281 -0
  12. data/lib/polars/binary_expr.rb +67 -0
  13. data/lib/polars/binary_name_space.rb +43 -0
  14. data/lib/polars/cat_expr.rb +224 -0
  15. data/lib/polars/cat_name_space.rb +138 -0
  16. data/lib/polars/config.rb +2 -2
  17. data/lib/polars/convert.rb +6 -6
  18. data/lib/polars/data_frame.rb +794 -27
  19. data/lib/polars/data_type_expr.rb +52 -0
  20. data/lib/polars/data_types.rb +26 -5
  21. data/lib/polars/date_time_expr.rb +252 -1
  22. data/lib/polars/date_time_name_space.rb +299 -0
  23. data/lib/polars/expr.rb +1248 -206
  24. data/lib/polars/functions/business.rb +95 -0
  25. data/lib/polars/functions/datatype.rb +21 -0
  26. data/lib/polars/functions/lazy.rb +14 -1
  27. data/lib/polars/io/csv.rb +1 -1
  28. data/lib/polars/io/iceberg.rb +27 -0
  29. data/lib/polars/io/json.rb +4 -4
  30. data/lib/polars/io/ndjson.rb +4 -4
  31. data/lib/polars/io/parquet.rb +32 -7
  32. data/lib/polars/io/scan_options.rb +4 -1
  33. data/lib/polars/lazy_frame.rb +1028 -28
  34. data/lib/polars/list_expr.rb +217 -17
  35. data/lib/polars/list_name_space.rb +231 -22
  36. data/lib/polars/meta_expr.rb +89 -0
  37. data/lib/polars/name_expr.rb +36 -0
  38. data/lib/polars/query_opt_flags.rb +50 -0
  39. data/lib/polars/scan_cast_options.rb +20 -1
  40. data/lib/polars/schema.rb +79 -3
  41. data/lib/polars/selector.rb +72 -0
  42. data/lib/polars/selectors.rb +3 -3
  43. data/lib/polars/series.rb +1053 -54
  44. data/lib/polars/string_expr.rb +436 -32
  45. data/lib/polars/string_name_space.rb +736 -50
  46. data/lib/polars/struct_expr.rb +103 -0
  47. data/lib/polars/struct_name_space.rb +19 -1
  48. data/lib/polars/utils/serde.rb +17 -0
  49. data/lib/polars/utils/various.rb +22 -1
  50. data/lib/polars/utils.rb +5 -1
  51. data/lib/polars/version.rb +1 -1
  52. data/lib/polars.rb +6 -0
  53. metadata +8 -2
@@ -27,9 +27,6 @@ module Polars
27
27
  ldf
28
28
  end
29
29
 
30
- # def self.from_json
31
- # end
32
-
33
30
  # Read a logical plan from a JSON file to construct a LazyFrame.
34
31
  #
35
32
  # @param file [String]
@@ -41,7 +38,49 @@ module Polars
41
38
  file = Utils.normalize_filepath(file)
42
39
  end
43
40
 
44
- Utils.wrap_ldf(RbLazyFrame.read_json(file))
41
+ Utils.wrap_ldf(RbLazyFrame.deserialize_json(file))
42
+ end
43
+
44
+ # Read a logical plan from a file to construct a LazyFrame.
45
+ #
46
+ # @param source [Object]
47
+ # Path to a file or a file-like object (by file-like object, we refer to
48
+ # objects that have a `read` method, such as a file handler or `StringIO`).
49
+ #
50
+ # @return [LazyFrame]
51
+ #
52
+ # @note
53
+ # This function uses marshaling if the logical plan contains Ruby UDFs,
54
+ # and as such inherits the security implications. Deserializing can execute
55
+ # arbitrary code, so it should only be attempted on trusted data.
56
+ #
57
+ # @note
58
+ # Serialization is not stable across Polars versions: a LazyFrame serialized
59
+ # in one Polars version may not be deserializable in another Polars version.
60
+ #
61
+ # @example
62
+ # lf = Polars::LazyFrame.new({"a" => [1, 2, 3]}).sum
63
+ # bytes = lf.serialize
64
+ # Polars::LazyFrame.deserialize(StringIO.new(bytes)).collect
65
+ # # =>
66
+ # # shape: (1, 1)
67
+ # # ┌─────┐
68
+ # # │ a │
69
+ # # │ --- │
70
+ # # │ i64 │
71
+ # # ╞═════╡
72
+ # # │ 6 │
73
+ # # └─────┘
74
+ def self.deserialize(source)
75
+ raise Todo unless RbLazyFrame.respond_to?(:deserialize_binary)
76
+
77
+ if Utils.pathlike?(source)
78
+ source = Utils.normalize_filepath(source)
79
+ end
80
+
81
+ deserializer = RbLazyFrame.method(:deserialize_binary)
82
+
83
+ _from_rbldf(deserializer.(source))
45
84
  end
46
85
 
47
86
  # Get or set column names.
@@ -151,6 +190,38 @@ module Polars
151
190
  nil
152
191
  end
153
192
 
193
+ # Serialize the logical plan of this LazyFrame to a file or string.
194
+ #
195
+ # @param file [Object]
196
+ # File path to which the result should be written. If set to `nil`
197
+ # (default), the output is returned as a string instead.
198
+ #
199
+ # @return [Object]
200
+ #
201
+ # @note
202
+ # Serialization is not stable across Polars versions: a LazyFrame serialized
203
+ # in one Polars version may not be deserializable in another Polars version.
204
+ #
205
+ # @example Serialize the logical plan into a binary representation.
206
+ # lf = Polars::LazyFrame.new({"a" => [1, 2, 3]}).sum
207
+ # bytes = lf.serialize
208
+ # Polars::LazyFrame.deserialize(StringIO.new(bytes)).collect
209
+ # # =>
210
+ # # shape: (1, 1)
211
+ # # ┌─────┐
212
+ # # │ a │
213
+ # # │ --- │
214
+ # # │ i64 │
215
+ # # ╞═════╡
216
+ # # │ 6 │
217
+ # # └─────┘
218
+ def serialize(file = nil)
219
+ raise Todo unless _ldf.respond_to?(:serialize_binary)
220
+
221
+ serializer = _ldf.method(:serialize_binary)
222
+ Utils.serialize_polars_object(serializer, file)
223
+ end
224
+
154
225
  # Offers a structured way to apply a sequence of user-defined functions (UDFs).
155
226
  #
156
227
  # @param func [Object]
@@ -288,6 +359,201 @@ module Polars
288
359
  )
289
360
  end
290
361
 
362
+ # Execute a SQL query against the LazyFrame.
363
+ #
364
+ # @note
365
+ # This functionality is considered **unstable**, although it is close to
366
+ # being considered stable. It may be changed at any point without it being
367
+ # considered a breaking change.
368
+ #
369
+ # @param query [String]
370
+ # SQL query to execute.
371
+ # @param table_name [String]
372
+ # Optionally provide an explicit name for the table that represents the
373
+ # calling frame (defaults to "self").
374
+ #
375
+ # @return [LazyFrame]
376
+ #
377
+ # @note
378
+ # * The calling frame is automatically registered as a table in the SQL context
379
+ # under the name "self". If you want access to the DataFrames and LazyFrames
380
+ # found in the current globals, use the top-level `Polars.sql`.
381
+ # * More control over registration and execution behaviour is available by
382
+ # using the `SQLContext` object.
383
+ #
384
+ # @example Query the LazyFrame using SQL:
385
+ # lf1 = Polars::LazyFrame.new({"a" => [1, 2, 3], "b" => [6, 7, 8], "c" => ["z", "y", "x"]})
386
+ # lf2 = Polars::LazyFrame.new({"a" => [3, 2, 1], "d" => [125, -654, 888]})
387
+ # lf1.sql("SELECT c, b FROM self WHERE a > 1").collect
388
+ # # =>
389
+ # # shape: (2, 2)
390
+ # # ┌─────┬─────┐
391
+ # # │ c ┆ b │
392
+ # # │ --- ┆ --- │
393
+ # # │ str ┆ i64 │
394
+ # # ╞═════╪═════╡
395
+ # # │ y ┆ 7 │
396
+ # # │ x ┆ 8 │
397
+ # # └─────┴─────┘
398
+ #
399
+ # @example Apply SQL transforms (aliasing "self" to "frame") then filter natively (you can freely mix SQL and native operations):
400
+ # lf1.sql(
401
+ # "
402
+ # SELECT
403
+ # a,
404
+ # (a % 2 == 0) AS a_is_even,
405
+ # (b::float4 / 2) AS \"b/2\",
406
+ # CONCAT_WS(':', c, c, c) AS c_c_c
407
+ # FROM frame
408
+ # ORDER BY a
409
+ # ",
410
+ # table_name: "frame",
411
+ # ).filter(~Polars.col("c_c_c").str.starts_with("x")).collect
412
+ # # =>
413
+ # # shape: (2, 4)
414
+ # # ┌─────┬───────────┬─────┬───────┐
415
+ # # │ a ┆ a_is_even ┆ b/2 ┆ c_c_c │
416
+ # # │ --- ┆ --- ┆ --- ┆ --- │
417
+ # # │ i64 ┆ bool ┆ f32 ┆ str │
418
+ # # ╞═════╪═══════════╪═════╪═══════╡
419
+ # # │ 1 ┆ false ┆ 3.0 ┆ z:z:z │
420
+ # # │ 2 ┆ true ┆ 3.5 ┆ y:y:y │
421
+ # # └─────┴───────────┴─────┴───────┘
422
+ def sql(query, table_name: "self")
423
+ ctx = Polars::SQLContext.new
424
+ name = table_name || "self"
425
+ ctx.register(name, self)
426
+ ctx.execute(query)
427
+ end
428
+
429
+ # Return the `k` largest rows.
430
+ #
431
+ # Non-null elements are always preferred over null elements, regardless of
432
+ # the value of `reverse`. The output is not guaranteed to be in any
433
+ # particular order, call `sort` after this function if you wish the
434
+ # output to be sorted.
435
+ #
436
+ # @param k [Integer]
437
+ # Number of rows to return.
438
+ # @param by [Object]
439
+ # Column(s) used to determine the top rows.
440
+ # Accepts expression input. Strings are parsed as column names.
441
+ # @param reverse [Object]
442
+ # Consider the `k` smallest elements of the `by` column(s) (instead of the `k`
443
+ # largest). This can be specified per column by passing a sequence of
444
+ # booleans.
445
+ #
446
+ # @return [LazyFrame]
447
+ #
448
+ # @example Get the rows which contain the 4 largest values in column b.
449
+ # lf = Polars::LazyFrame.new(
450
+ # {
451
+ # "a" => ["a", "b", "a", "b", "b", "c"],
452
+ # "b" => [2, 1, 1, 3, 2, 1]
453
+ # }
454
+ # )
455
+ # lf.top_k(4, by: "b").collect
456
+ # # =>
457
+ # # shape: (4, 2)
458
+ # # ┌─────┬─────┐
459
+ # # │ a ┆ b │
460
+ # # │ --- ┆ --- │
461
+ # # │ str ┆ i64 │
462
+ # # ╞═════╪═════╡
463
+ # # │ b ┆ 3 │
464
+ # # │ a ┆ 2 │
465
+ # # │ b ┆ 2 │
466
+ # # │ b ┆ 1 │
467
+ # # └─────┴─────┘
468
+ #
469
+ # @example Get the rows which contain the 4 largest values when sorting on column b and a.
470
+ # lf.top_k(4, by: ["b", "a"]).collect
471
+ # # =>
472
+ # # shape: (4, 2)
473
+ # # ┌─────┬─────┐
474
+ # # │ a ┆ b │
475
+ # # │ --- ┆ --- │
476
+ # # │ str ┆ i64 │
477
+ # # ╞═════╪═════╡
478
+ # # │ b ┆ 3 │
479
+ # # │ b ┆ 2 │
480
+ # # │ a ┆ 2 │
481
+ # # │ c ┆ 1 │
482
+ # # └─────┴─────┘
483
+ def top_k(
484
+ k,
485
+ by:,
486
+ reverse: false
487
+ )
488
+ by = Utils.parse_into_list_of_expressions(by)
489
+ reverse = Utils.extend_bool(reverse, by.length, "reverse", "by")
490
+ _from_rbldf(_ldf.top_k(k, by, reverse))
491
+ end
492
+
493
+ # Return the `k` smallest rows.
494
+ #
495
+ # Non-null elements are always preferred over null elements, regardless of
496
+ # the value of `reverse`. The output is not guaranteed to be in any
497
+ # particular order, call `sort` after this function if you wish the
498
+ # output to be sorted.
499
+ #
500
+ # @param k [Integer]
501
+ # Number of rows to return.
502
+ # @param by [Object]
503
+ # Column(s) used to determine the bottom rows.
504
+ # Accepts expression input. Strings are parsed as column names.
505
+ # @param reverse [Object]
506
+ # Consider the `k` largest elements of the `by` column(s) (instead of the `k`
507
+ # smallest). This can be specified per column by passing a sequence of
508
+ # booleans.
509
+ #
510
+ # @return [LazyFrame]
511
+ #
512
+ # @example Get the rows which contain the 4 smallest values in column b.
513
+ # lf = Polars::LazyFrame.new(
514
+ # {
515
+ # "a" => ["a", "b", "a", "b", "b", "c"],
516
+ # "b" => [2, 1, 1, 3, 2, 1]
517
+ # }
518
+ # )
519
+ # lf.bottom_k(4, by: "b").collect
520
+ # # =>
521
+ # # shape: (4, 2)
522
+ # # ┌─────┬─────┐
523
+ # # │ a ┆ b │
524
+ # # │ --- ┆ --- │
525
+ # # │ str ┆ i64 │
526
+ # # ╞═════╪═════╡
527
+ # # │ b ┆ 1 │
528
+ # # │ a ┆ 1 │
529
+ # # │ c ┆ 1 │
530
+ # # │ a ┆ 2 │
531
+ # # └─────┴─────┘
532
+ #
533
+ # @example Get the rows which contain the 4 smallest values when sorting on column a and b.
534
+ # lf.bottom_k(4, by: ["a", "b"]).collect
535
+ # # =>
536
+ # # shape: (4, 2)
537
+ # # ┌─────┬─────┐
538
+ # # │ a ┆ b │
539
+ # # │ --- ┆ --- │
540
+ # # │ str ┆ i64 │
541
+ # # ╞═════╪═════╡
542
+ # # │ a ┆ 1 │
543
+ # # │ a ┆ 2 │
544
+ # # │ b ┆ 1 │
545
+ # # │ b ┆ 2 │
546
+ # # └─────┴─────┘
547
+ def bottom_k(
548
+ k,
549
+ by:,
550
+ reverse: false
551
+ )
552
+ by = Utils.parse_into_list_of_expressions(by)
553
+ reverse = Utils.extend_bool(reverse, by.length, "reverse", "by")
554
+ _from_rbldf(_ldf.bottom_k(k, by, reverse))
555
+ end
556
+
291
557
  # def profile
292
558
  # end
293
559
 
@@ -379,6 +645,41 @@ module Polars
379
645
  Utils.wrap_df(ldf.collect)
380
646
  end
381
647
 
648
+ # Resolve the schema of this LazyFrame.
649
+ #
650
+ # @return [Schema]
651
+ #
652
+ # @example Determine the schema.
653
+ # lf = Polars::LazyFrame.new(
654
+ # {
655
+ # "foo" => [1, 2, 3],
656
+ # "bar" => [6.0, 7.0, 8.0],
657
+ # "ham" => ["a", "b", "c"]
658
+ # }
659
+ # )
660
+ # lf.collect_schema
661
+ # # => Polars::Schema({"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String})
662
+ #
663
+ # @example Access various properties of the schema.
664
+ # schema = lf.collect_schema
665
+ # schema["bar"]
666
+ # # => Polars::Float64
667
+ #
668
+ # @example
669
+ # schema.names
670
+ # # => ["foo", "bar", "ham"]
671
+ #
672
+ # @example
673
+ # schema.dtypes
674
+ # # => [Polars::Int64, Polars::Float64, Polars::String]
675
+ #
676
+ # @example
677
+ # schema.length
678
+ # # => 3
679
+ def collect_schema
680
+ Schema.new(_ldf.collect_schema, check_dtypes: false)
681
+ end
682
+
382
683
  # Persists a LazyFrame at the provided path.
383
684
  #
384
685
  # This allows streaming results that are larger than RAM to be written to disk.
@@ -544,6 +845,21 @@ module Polars
544
845
  # @param maintain_order [Boolean]
545
846
  # Maintain the order in which data is processed.
546
847
  # Setting this to `false` will be slightly faster.
848
+ # @param storage_options [Hash]
849
+ # Options that indicate how to connect to a cloud provider.
850
+ #
851
+ # The cloud providers currently supported are AWS, GCP, and Azure.
852
+ # See supported keys here:
853
+ #
854
+ # * [aws](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html)
855
+ # * [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html)
856
+ # * [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html)
857
+ # * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
858
+ #
859
+ # If `storage_options` is not provided, Polars will try to infer the
860
+ # information from environment variables.
861
+ # @param retries [Integer]
862
+ # Number of retries if accessing a cloud instance fails.
547
863
  # @param type_coercion [Boolean]
548
864
  # Do type coercion optimization.
549
865
  # @param predicate_pushdown [Boolean]
@@ -576,6 +892,8 @@ module Polars
576
892
  path,
577
893
  compression: "zstd",
578
894
  maintain_order: true,
895
+ storage_options: nil,
896
+ retries: 2,
579
897
  type_coercion: true,
580
898
  predicate_pushdown: true,
581
899
  projection_pushdown: true,
@@ -586,10 +904,6 @@ module Polars
586
904
  mkdir: false,
587
905
  lazy: false
588
906
  )
589
- # TODO support storage options in Rust
590
- storage_options = nil
591
- retries = 2
592
-
593
907
  lf = _set_sink_optimizations(
594
908
  type_coercion: type_coercion,
595
909
  predicate_pushdown: predicate_pushdown,
@@ -1147,6 +1461,140 @@ module Polars
1147
1461
  )
1148
1462
  end
1149
1463
 
1464
+ # Remove rows, dropping those that match the given predicate expression(s).
1465
+ #
1466
+ # The original order of the remaining rows is preserved.
1467
+ #
1468
+ # Rows where the filter predicate does not evaluate to true are retained
1469
+ # (this includes rows where the predicate evaluates as `null`).
1470
+ #
1471
+ # @param predicates [Array]
1472
+ # Expression that evaluates to a boolean Series.
1473
+ # @param constraints [Hash]
1474
+ # Column filters; use `name: value` to filter columns using the supplied
1475
+ # value. Each constraint behaves the same as `Polars.col(name).eq(value)`,
1476
+ # and is implicitly joined with the other filter conditions using `&`.
1477
+ #
1478
+ # @return [LazyFrame]
1479
+ #
1480
+ # @example Remove rows matching a condition:
1481
+ # lf = Polars::LazyFrame.new(
1482
+ # {
1483
+ # "foo" => [2, 3, nil, 4, 0],
1484
+ # "bar" => [5, 6, nil, nil, 0],
1485
+ # "ham" => ["a", "b", nil, "c", "d"]
1486
+ # }
1487
+ # )
1488
+ # lf.remove(
1489
+ # Polars.col("bar") >= 5
1490
+ # ).collect
1491
+ # # =>
1492
+ # # shape: (3, 3)
1493
+ # # ┌──────┬──────┬──────┐
1494
+ # # │ foo ┆ bar ┆ ham │
1495
+ # # │ --- ┆ --- ┆ --- │
1496
+ # # │ i64 ┆ i64 ┆ str │
1497
+ # # ╞══════╪══════╪══════╡
1498
+ # # │ null ┆ null ┆ null │
1499
+ # # │ 4 ┆ null ┆ c │
1500
+ # # │ 0 ┆ 0 ┆ d │
1501
+ # # └──────┴──────┴──────┘
1502
+ #
1503
+ # @example Discard rows based on multiple conditions, combined with and/or operators:
1504
+ # lf.remove(
1505
+ # (Polars.col("foo") >= 0) & (Polars.col("bar") >= 0)
1506
+ # ).collect
1507
+ # # =>
1508
+ # # shape: (2, 3)
1509
+ # # ┌──────┬──────┬──────┐
1510
+ # # │ foo ┆ bar ┆ ham │
1511
+ # # │ --- ┆ --- ┆ --- │
1512
+ # # │ i64 ┆ i64 ┆ str │
1513
+ # # ╞══════╪══════╪══════╡
1514
+ # # │ null ┆ null ┆ null │
1515
+ # # │ 4 ┆ null ┆ c │
1516
+ # # └──────┴──────┴──────┘
1517
+ #
1518
+ # @example
1519
+ # lf.remove(
1520
+ # (Polars.col("foo") >= 0) | (Polars.col("bar") >= 0)
1521
+ # ).collect
1522
+ # # =>
1523
+ # # shape: (1, 3)
1524
+ # # ┌──────┬──────┬──────┐
1525
+ # # │ foo ┆ bar ┆ ham │
1526
+ # # │ --- ┆ --- ┆ --- │
1527
+ # # │ i64 ┆ i64 ┆ str │
1528
+ # # ╞══════╪══════╪══════╡
1529
+ # # │ null ┆ null ┆ null │
1530
+ # # └──────┴──────┴──────┘
1531
+ #
1532
+ # @example Provide multiple constraints using `*args` syntax:
1533
+ # lf.remove(
1534
+ # Polars.col("ham").is_not_null,
1535
+ # Polars.col("bar") >= 0
1536
+ # ).collect
1537
+ # # =>
1538
+ # # shape: (2, 3)
1539
+ # # ┌──────┬──────┬──────┐
1540
+ # # │ foo ┆ bar ┆ ham │
1541
+ # # │ --- ┆ --- ┆ --- │
1542
+ # # │ i64 ┆ i64 ┆ str │
1543
+ # # ╞══════╪══════╪══════╡
1544
+ # # │ null ┆ null ┆ null │
1545
+ # # │ 4 ┆ null ┆ c │
1546
+ # # └──────┴──────┴──────┘
1547
+ #
1548
+ # @example Provide constraints(s) using `**kwargs` syntax:
1549
+ # lf.remove(foo: 0, bar: 0).collect
1550
+ # # =>
1551
+ # # shape: (4, 3)
1552
+ # # ┌──────┬──────┬──────┐
1553
+ # # │ foo ┆ bar ┆ ham │
1554
+ # # │ --- ┆ --- ┆ --- │
1555
+ # # │ i64 ┆ i64 ┆ str │
1556
+ # # ╞══════╪══════╪══════╡
1557
+ # # │ 2 ┆ 5 ┆ a │
1558
+ # # │ 3 ┆ 6 ┆ b │
1559
+ # # │ null ┆ null ┆ null │
1560
+ # # │ 4 ┆ null ┆ c │
1561
+ # # └──────┴──────┴──────┘
1562
+ #
1563
+ # @example Remove rows by comparing two columns against each other; in this case, we remove rows where the two columns are not equal (using `ne_missing` to ensure that null values compare equal):
1564
+ # lf.remove(
1565
+ # Polars.col("foo").ne_missing(Polars.col("bar"))
1566
+ # ).collect
1567
+ # # =>
1568
+ # # shape: (2, 3)
1569
+ # # ┌──────┬──────┬──────┐
1570
+ # # │ foo ┆ bar ┆ ham │
1571
+ # # │ --- ┆ --- ┆ --- │
1572
+ # # │ i64 ┆ i64 ┆ str │
1573
+ # # ╞══════╪══════╪══════╡
1574
+ # # │ null ┆ null ┆ null │
1575
+ # # │ 0 ┆ 0 ┆ d │
1576
+ # # └──────┴──────┴──────┘
1577
+ def remove(
1578
+ *predicates,
1579
+ **constraints
1580
+ )
1581
+ if constraints.empty?
1582
+ # early-exit conditions (exclude/include all rows)
1583
+ if predicates.empty? || (predicates.length == 1 && predicates[0].is_a?(TrueClass))
1584
+ return clear
1585
+ end
1586
+ if predicates.length == 1 && predicates[0].is_a?(FalseClass)
1587
+ return dup
1588
+ end
1589
+ end
1590
+
1591
+ _filter(
1592
+ predicates: predicates,
1593
+ constraints: constraints,
1594
+ invert: true
1595
+ )
1596
+ end
1597
+
1150
1598
  # Select columns from this DataFrame.
1151
1599
  #
1152
1600
  # @param exprs [Array]
@@ -1244,6 +1692,29 @@ module Polars
1244
1692
  _from_rbldf(_ldf.select(rbexprs))
1245
1693
  end
1246
1694
 
1695
+ # Select columns from this LazyFrame.
1696
+ #
1697
+ # This will run all expressions sequentially instead of in parallel.
1698
+ # Use this when the work per expression is cheap.
1699
+ #
1700
+ # @param exprs [Array]
1701
+ # Column(s) to select, specified as positional arguments.
1702
+ # Accepts expression input. Strings are parsed as column names,
1703
+ # other non-expression inputs are parsed as literals.
1704
+ # @param named_exprs [Hash]
1705
+ # Additional columns to select, specified as keyword arguments.
1706
+ # The columns will be renamed to the keyword used.
1707
+ #
1708
+ # @return [LazyFrame]
1709
+ def select_seq(*exprs, **named_exprs)
1710
+ structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", 0).to_i != 0
1711
+
1712
+ rbexprs = Utils.parse_into_list_of_expressions(
1713
+ *exprs, **named_exprs, __structify: structify
1714
+ )
1715
+ _from_rbldf(_ldf.select_seq(rbexprs))
1716
+ end
1717
+
1247
1718
  # Start a group by operation.
1248
1719
  #
1249
1720
  # @param by [Array]
@@ -1440,9 +1911,9 @@ module Polars
1440
1911
  # @param every [Object]
1441
1912
  # Interval of the window.
1442
1913
  # @param period [Object]
1443
- # Length of the window, if None it is equal to 'every'.
1914
+ # Length of the window, if nil it is equal to 'every'.
1444
1915
  # @param offset [Object]
1445
- # Offset of the window if None and period is None it will be equal to negative
1916
+ # Offset of the window if nil and period is nil it will be equal to negative
1446
1917
  # `every`.
1447
1918
  # @param truncate [Boolean]
1448
1919
  # Truncate the time value to the window lower bound.
@@ -1714,7 +2185,7 @@ module Polars
1714
2185
  # Join column of the right DataFrame.
1715
2186
  # @param on [String]
1716
2187
  # Join column of both DataFrames. If set, `left_on` and `right_on` should be
1717
- # None.
2188
+ # nil.
1718
2189
  # @param by_left [Object]
1719
2190
  # Join on these columns before doing asof join.
1720
2191
  # @param by_right [Object]
@@ -2039,7 +2510,7 @@ module Polars
2039
2510
  # Join column of the right DataFrame.
2040
2511
  # @param on [Object]
2041
2512
  # Join column of both DataFrames. If set, `left_on` and `right_on` should be
2042
- # None.
2513
+ # nil.
2043
2514
  # @param how ["inner", "left", "full", "semi", "anti", "cross"]
2044
2515
  # Join strategy.
2045
2516
  # @param suffix [String]
@@ -2234,6 +2705,103 @@ module Polars
2234
2705
  )
2235
2706
  end
2236
2707
 
2708
+ # Perform a join based on one or multiple (in)equality predicates.
2709
+ #
2710
+ # This performs an inner join, so only rows where all predicates are true
2711
+ # are included in the result, and a row from either DataFrame may be included
2712
+ # multiple times in the result.
2713
+ #
2714
+ # @note
2715
+ # The row order of the input DataFrames is not preserved.
2716
+ #
2717
+ # @note
2718
+ # This functionality is experimental. It may be
2719
+ # changed at any point without it being considered a breaking change.
2720
+ #
2721
+ # @param other [Object]
2722
+ # DataFrame to join with.
2723
+ # @param predicates [Object]
2724
+ # (In)Equality condition to join the two tables on.
2725
+ # When a column name occurs in both tables, the proper suffix must
2726
+ # be applied in the predicate.
2727
+ # @param suffix [String]
2728
+ # Suffix to append to columns with a duplicate name.
2729
+ #
2730
+ # @return [LazyFrame]
2731
+ #
2732
+ # @example Join two lazyframes together based on two predicates which get AND-ed together.
2733
+ # east = Polars::LazyFrame.new(
2734
+ # {
2735
+ # "id" => [100, 101, 102],
2736
+ # "dur" => [120, 140, 160],
2737
+ # "rev" => [12, 14, 16],
2738
+ # "cores" => [2, 8, 4]
2739
+ # }
2740
+ # )
2741
+ # west = Polars::LazyFrame.new(
2742
+ # {
2743
+ # "t_id" => [404, 498, 676, 742],
2744
+ # "time" => [90, 130, 150, 170],
2745
+ # "cost" => [9, 13, 15, 16],
2746
+ # "cores" => [4, 2, 1, 4]
2747
+ # }
2748
+ # )
2749
+ # east.join_where(
2750
+ # west,
2751
+ # Polars.col("dur") < Polars.col("time"),
2752
+ # Polars.col("rev") < Polars.col("cost")
2753
+ # ).collect
2754
+ # # =>
2755
+ # # shape: (5, 8)
2756
+ # # ┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐
2757
+ # # │ id ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │
2758
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2759
+ # # │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
2760
+ # # ╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡
2761
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 498 ┆ 130 ┆ 13 ┆ 2 │
2762
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
2763
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
2764
+ # # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
2765
+ # # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
2766
+ # # └─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
2767
+ #
2768
+ # @example To OR them together, use a single expression and the `|` operator.
2769
+ # east.join_where(
2770
+ # west,
2771
+ # (Polars.col("dur") < Polars.col("time")) | (Polars.col("rev") < Polars.col("cost"))
2772
+ # ).collect
2773
+ # # =>
2774
+ # # shape: (6, 8)
2775
+ # # ┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐
2776
+ # # │ id ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │
2777
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2778
+ # # │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
2779
+ # # ╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡
2780
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 498 ┆ 130 ┆ 13 ┆ 2 │
2781
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
2782
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
2783
+ # # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
2784
+ # # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
2785
+ # # │ 102 ┆ 160 ┆ 16 ┆ 4 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
2786
+ # # └─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
2787
+ def join_where(
2788
+ other,
2789
+ *predicates,
2790
+ suffix: "_right"
2791
+ )
2792
+ Utils.require_same_type(self, other)
2793
+
2794
+ rbexprs = Utils.parse_into_list_of_expressions(*predicates)
2795
+
2796
+ _from_rbldf(
2797
+ _ldf.join_where(
2798
+ other._ldf,
2799
+ rbexprs,
2800
+ suffix
2801
+ )
2802
+ )
2803
+ end
2804
+
2237
2805
  # Add or overwrite multiple columns in a DataFrame.
2238
2806
  #
2239
2807
  # @param exprs [Object]
@@ -2279,6 +2847,34 @@ module Polars
2279
2847
  _from_rbldf(_ldf.with_columns(rbexprs))
2280
2848
  end
2281
2849
 
2850
+ # Add columns to this LazyFrame.
2851
+ #
2852
+ # Added columns will replace existing columns with the same name.
2853
+ #
2854
+ # This will run all expressions sequentially instead of in parallel.
2855
+ # Use this when the work per expression is cheap.
2856
+ #
2857
+ # @param exprs [Array]
2858
+ # Column(s) to add, specified as positional arguments.
2859
+ # Accepts expression input. Strings are parsed as column names, other
2860
+ # non-expression inputs are parsed as literals.
2861
+ # @param named_exprs [Hash]
2862
+ # Additional columns to add, specified as keyword arguments.
2863
+ # The columns will be renamed to the keyword used.
2864
+ #
2865
+ # @return [LazyFrame]
2866
+ def with_columns_seq(
2867
+ *exprs,
2868
+ **named_exprs
2869
+ )
2870
+ structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", 0).to_i != 0
2871
+
2872
+ rbexprs = Utils.parse_into_list_of_expressions(
2873
+ *exprs, **named_exprs, __structify: structify
2874
+ )
2875
+ _from_rbldf(_ldf.with_columns_seq(rbexprs))
2876
+ end
2877
+
2282
2878
  # Add an external context to the computation graph.
2283
2879
  #
2284
2880
  # This allows expressions to also access columns from DataFrames
@@ -2887,7 +3483,7 @@ module Polars
2887
3483
  #
2888
3484
  # @example
2889
3485
  # s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]}).lazy
2890
- # s.take_every(2).collect
3486
+ # s.gather_every(2).collect
2891
3487
  # # =>
2892
3488
  # # shape: (2, 2)
2893
3489
  # # ┌─────┬─────┐
@@ -2898,9 +3494,10 @@ module Polars
2898
3494
  # # │ 1 ┆ 5 │
2899
3495
  # # │ 3 ┆ 7 │
2900
3496
  # # └─────┴─────┘
2901
- def take_every(n)
2902
- select(F.col("*").take_every(n))
3497
+ def gather_every(n)
3498
+ select(F.col("*").gather_every(n))
2903
3499
  end
3500
+ alias_method :take_every, :gather_every
2904
3501
 
2905
3502
  # Fill null values using the specified value or strategy.
2906
3503
  #
@@ -3177,6 +3774,32 @@ module Polars
3177
3774
  _from_rbldf(_ldf.median)
3178
3775
  end
3179
3776
 
3777
+ # Aggregate the columns in the LazyFrame as the sum of their null value count.
3778
+ #
3779
+ # @return [LazyFrame]
3780
+ #
3781
+ # @example
3782
+ # lf = Polars::LazyFrame.new(
3783
+ # {
3784
+ # "foo" => [1, nil, 3],
3785
+ # "bar" => [6, 7, nil],
3786
+ # "ham" => ["a", "b", "c"]
3787
+ # }
3788
+ # )
3789
+ # lf.null_count.collect
3790
+ # # =>
3791
+ # # shape: (1, 3)
3792
+ # # ┌─────┬─────┬─────┐
3793
+ # # │ foo ┆ bar ┆ ham │
3794
+ # # │ --- ┆ --- ┆ --- │
3795
+ # # │ u32 ┆ u32 ┆ u32 │
3796
+ # # ╞═════╪═════╪═════╡
3797
+ # # │ 1 ┆ 1 ┆ 0 │
3798
+ # # └─────┴─────┴─────┘
3799
+ def null_count
3800
+ _from_rbldf(_ldf.null_count)
3801
+ end
3802
+
3180
3803
  # Aggregate the columns in the DataFrame to their quantile value.
3181
3804
  #
3182
3805
  # @param quantile [Float]
@@ -3307,37 +3930,103 @@ module Polars
3307
3930
  _from_rbldf(_ldf.unique(maintain_order, selector_subset, keep))
3308
3931
  end
3309
3932
 
3310
- # Drop rows with null values from this LazyFrame.
3933
+ # Drop all rows that contain one or more NaN values.
3934
+ #
3935
+ # The original order of the remaining rows is preserved.
3311
3936
  #
3312
3937
  # @param subset [Object]
3313
- # Subset of column(s) on which `drop_nulls` will be applied.
3938
+ # Column name(s) for which NaN values are considered; if set to `nil`
3939
+ # (default), use all columns (note that only floating-point columns
3940
+ # can contain NaNs).
3314
3941
  #
3315
3942
  # @return [LazyFrame]
3316
3943
  #
3317
3944
  # @example
3318
- # df = Polars::DataFrame.new(
3945
+ # lf = Polars::LazyFrame.new(
3946
+ # {
3947
+ # "foo" => [-20.5, Float::NAN, 80.0],
3948
+ # "bar" => [Float::NAN, 110.0, 25.5],
3949
+ # "ham" => ["xxx", "yyy", nil]
3950
+ # }
3951
+ # )
3952
+ # lf.drop_nans.collect
3953
+ # # =>
3954
+ # # shape: (1, 3)
3955
+ # # ┌──────┬──────┬──────┐
3956
+ # # │ foo ┆ bar ┆ ham │
3957
+ # # │ --- ┆ --- ┆ --- │
3958
+ # # │ f64 ┆ f64 ┆ str │
3959
+ # # ╞══════╪══════╪══════╡
3960
+ # # │ 80.0 ┆ 25.5 ┆ null │
3961
+ # # └──────┴──────┴──────┘
3962
+ #
3963
+ # @example
3964
+ # lf.drop_nans(subset: ["bar"]).collect
3965
+ # # =>
3966
+ # # shape: (2, 3)
3967
+ # # ┌──────┬───────┬──────┐
3968
+ # # │ foo ┆ bar ┆ ham │
3969
+ # # │ --- ┆ --- ┆ --- │
3970
+ # # │ f64 ┆ f64 ┆ str │
3971
+ # # ╞══════╪═══════╪══════╡
3972
+ # # │ NaN ┆ 110.0 ┆ yyy │
3973
+ # # │ 80.0 ┆ 25.5 ┆ null │
3974
+ # # └──────┴───────┴──────┘
3975
+ def drop_nans(subset: nil)
3976
+ selector_subset = nil
3977
+ if !subset.nil?
3978
+ selector_subset = Utils.parse_list_into_selector(subset)._rbselector
3979
+ end
3980
+ _from_rbldf(_ldf.drop_nans(selector_subset))
3981
+ end
3982
+
3983
+ # Drop all rows that contain one or more null values.
3984
+ #
3985
+ # The original order of the remaining rows is preserved.
3986
+ #
3987
+ # @param subset [Object]
3988
+ # Column name(s) for which null values are considered.
3989
+ # If set to `nil` (default), use all columns.
3990
+ #
3991
+ # @return [LazyFrame]
3992
+ #
3993
+ # @example
3994
+ # lf = Polars::LazyFrame.new(
3319
3995
  # {
3320
3996
  # "foo" => [1, 2, 3],
3321
3997
  # "bar" => [6, nil, 8],
3322
- # "ham" => ["a", "b", "c"]
3998
+ # "ham" => ["a", "b", nil]
3323
3999
  # }
3324
4000
  # )
3325
- # df.lazy.drop_nulls.collect
4001
+ # lf.drop_nulls.collect
3326
4002
  # # =>
3327
- # # shape: (2, 3)
4003
+ # # shape: (1, 3)
3328
4004
  # # ┌─────┬─────┬─────┐
3329
4005
  # # │ foo ┆ bar ┆ ham │
3330
4006
  # # │ --- ┆ --- ┆ --- │
3331
4007
  # # │ i64 ┆ i64 ┆ str │
3332
4008
  # # ╞═════╪═════╪═════╡
3333
4009
  # # │ 1 ┆ 6 ┆ a │
3334
- # # │ 3 ┆ 8 ┆ c │
3335
4010
  # # └─────┴─────┴─────┘
4011
+ #
4012
+ # @example
4013
+ # lf.drop_nulls(subset: Polars.cs.integer).collect
4014
+ # # =>
4015
+ # # shape: (2, 3)
4016
+ # # ┌─────┬─────┬──────┐
4017
+ # # │ foo ┆ bar ┆ ham │
4018
+ # # │ --- ┆ --- ┆ --- │
4019
+ # # │ i64 ┆ i64 ┆ str │
4020
+ # # ╞═════╪═════╪══════╡
4021
+ # # │ 1 ┆ 6 ┆ a │
4022
+ # # │ 3 ┆ 8 ┆ null │
4023
+ # # └─────┴─────┴──────┘
3336
4024
  def drop_nulls(subset: nil)
3337
- if !subset.nil? && !subset.is_a?(::Array)
3338
- subset = [subset]
4025
+ selector_subset = nil
4026
+ if !subset.nil?
4027
+ selector_subset = Utils.parse_list_into_selector(subset)._rbselector
3339
4028
  end
3340
- _from_rbldf(_ldf.drop_nulls(subset))
4029
+ _from_rbldf(_ldf.drop_nulls(selector_subset))
3341
4030
  end
3342
4031
 
3343
4032
  # Unpivot a DataFrame from wide to long format.
@@ -3571,9 +4260,261 @@ module Polars
3571
4260
  with_columns(F.col(column).set_sorted(descending: descending))
3572
4261
  end
3573
4262
 
3574
- # TODO
3575
- # def update
3576
- # end
4263
+ # Update the values in this `LazyFrame` with the values in `other`.
4264
+ #
4265
+ # @note
4266
+ # This functionality is considered **unstable**. It may be changed
4267
+ # at any point without it being considered a breaking change.
4268
+ #
4269
+ # @param other [LazyFrame]
4270
+ # LazyFrame that will be used to update the values
4271
+ # @param on [Object]
4272
+ # Column names that will be joined on. If set to `nil` (default),
4273
+ # the implicit row index of each frame is used as a join key.
4274
+ # @param how ['left', 'inner', 'full']
4275
+ # * 'left' will keep all rows from the left table; rows may be duplicated
4276
+ # if multiple rows in the right frame match the left row's key.
4277
+ # * 'inner' keeps only those rows where the key exists in both frames.
4278
+ # * 'full' will update existing rows where the key matches while also
4279
+ # adding any new rows contained in the given frame.
4280
+ # @param left_on [Object]
4281
+ # Join column(s) of the left DataFrame.
4282
+ # @param right_on [Object]
4283
+ # Join column(s) of the right DataFrame.
4284
+ # @param include_nulls [Boolean]
4285
+ # Overwrite values in the left frame with null values from the right frame.
4286
+ # If set to `false` (default), null values in the right frame are ignored.
4287
+ # @param maintain_order ['none', 'left', 'right', 'left_right', 'right_left']
4288
+ # Which order of rows from the inputs to preserve. See `LazyFrame.join`
4289
+ # for details. Unlike `join` this function preserves the left order by
4290
+ # default.
4291
+ #
4292
+ # @return [LazyFrame]
4293
+ #
4294
+ # @note
4295
+ # This is syntactic sugar for a left/inner join that preserves the order
4296
+ # of the left `DataFrame` by default, with an optional coalesce when
4297
+ # `include_nulls: false`.
4298
+ #
4299
+ # @example Update `lf` values with the non-null values in `new_lf`, by row index:
4300
+ # lf = Polars::LazyFrame.new(
4301
+ # {
4302
+ # "A" => [1, 2, 3, 4],
4303
+ # "B" => [400, 500, 600, 700]
4304
+ # }
4305
+ # )
4306
+ # new_lf = Polars::LazyFrame.new(
4307
+ # {
4308
+ # "B" => [-66, nil, -99],
4309
+ # "C" => [5, 3, 1]
4310
+ # }
4311
+ # )
4312
+ # lf.update(new_lf).collect
4313
+ # # =>
4314
+ # # shape: (4, 2)
4315
+ # # ┌─────┬─────┐
4316
+ # # │ A ┆ B │
4317
+ # # │ --- ┆ --- │
4318
+ # # │ i64 ┆ i64 │
4319
+ # # ╞═════╪═════╡
4320
+ # # │ 1 ┆ -66 │
4321
+ # # │ 2 ┆ 500 │
4322
+ # # │ 3 ┆ -99 │
4323
+ # # │ 4 ┆ 700 │
4324
+ # # └─────┴─────┘
4325
+ #
4326
+ # @example Update `lf` values with the non-null values in `new_lf`, by row index, but only keeping those rows that are common to both frames:
4327
+ # lf.update(new_lf, how: "inner").collect
4328
+ # # =>
4329
+ # # shape: (3, 2)
4330
+ # # ┌─────┬─────┐
4331
+ # # │ A ┆ B │
4332
+ # # │ --- ┆ --- │
4333
+ # # │ i64 ┆ i64 │
4334
+ # # ╞═════╪═════╡
4335
+ # # │ 1 ┆ -66 │
4336
+ # # │ 2 ┆ 500 │
4337
+ # # │ 3 ┆ -99 │
4338
+ # # └─────┴─────┘
4339
+ #
4340
+ # @example Update `lf` values with the non-null values in `new_lf`, using a full outer join strategy that defines explicit join columns in each frame:
4341
+ # lf.update(new_lf, left_on: ["A"], right_on: ["C"], how: "full").collect
4342
+ # # =>
4343
+ # # shape: (5, 2)
4344
+ # # ┌─────┬─────┐
4345
+ # # │ A ┆ B │
4346
+ # # │ --- ┆ --- │
4347
+ # # │ i64 ┆ i64 │
4348
+ # # ╞═════╪═════╡
4349
+ # # │ 1 ┆ -99 │
4350
+ # # │ 2 ┆ 500 │
4351
+ # # │ 3 ┆ 600 │
4352
+ # # │ 4 ┆ 700 │
4353
+ # # │ 5 ┆ -66 │
4354
+ # # └─────┴─────┘
4355
+ #
4356
+ # @example Update `lf` values including null values in `new_lf`, using a full outer join strategy that defines explicit join columns in each frame:
4357
+ # lf.update(
4358
+ # new_lf, left_on: "A", right_on: "C", how: "full", include_nulls: true
4359
+ # ).collect
4360
+ # # =>
4361
+ # # shape: (5, 2)
4362
+ # # ┌─────┬──────┐
4363
+ # # │ A ┆ B │
4364
+ # # │ --- ┆ --- │
4365
+ # # │ i64 ┆ i64 │
4366
+ # # ╞═════╪══════╡
4367
+ # # │ 1 ┆ -99 │
4368
+ # # │ 2 ┆ 500 │
4369
+ # # │ 3 ┆ null │
4370
+ # # │ 4 ┆ 700 │
4371
+ # # │ 5 ┆ -66 │
4372
+ # # └─────┴──────┘
4373
def update(
  other,
  on: nil,
  how: "left",
  left_on: nil,
  right_on: nil,
  include_nulls: false,
  maintain_order: "left"
)
  # Joins `other` onto this frame and overwrites the shared non-key columns
  # with the joined values. Raises ArgumentError for an unknown `how`, for a
  # missing `left_on`/`right_on` counterpart, or for a join column that is
  # absent from the corresponding schema.
  Utils.require_same_type(self, other)

  # "outer" and "outer_coalesce" are accepted as aliases of "full"
  how = "full" if ["outer", "outer_coalesce"].include?(how)

  unless ["left", "inner", "full"].include?(how)
    # Single braces: Ruby prints `{{`/`}}` literally (no f-string style escaping).
    msg = "`how` must be one of {'left', 'inner', 'full'}; found #{how.inspect}"
    raise ArgumentError, msg
  end

  # Name of the temporary key column used when no join keys are supplied.
  row_index_name = "__POLARS_ROW_INDEX"
  slf = self
  row_index_used = false

  if !on.nil?
    # move `on` into left/right_on to simplify the logic below
    left_on = right_on = on
  elsif left_on.nil? && right_on.nil?
    # no keys provided--use row index
    row_index_used = true
    slf = slf.with_row_index(name: row_index_name)
    other = other.with_row_index(name: row_index_name)
    left_on = right_on = [row_index_name]
  elsif left_on.nil?
    # only one side of left_on/right_on was given
    msg = "missing join columns for left frame"
    raise ArgumentError, msg
  elsif right_on.nil?
    msg = "missing join columns for right frame"
    raise ArgumentError, msg
  end

  # a single column name is treated as a one-element list
  left_on = [left_on] if left_on.is_a?(::String)
  right_on = [right_on] if right_on.is_a?(::String)

  left_schema = slf.collect_schema
  left_on.each do |name|
    unless left_schema.include?(name)
      msg = "left join column #{name.inspect} not found"
      raise ArgumentError, msg
    end
  end

  right_schema = other.collect_schema
  right_on.each do |name|
    unless right_schema.include?(name)
      msg = "right join column #{name.inspect} not found"
      raise ArgumentError, msg
    end
  end

  # no need to join if *only* join columns are in other (inner/left update only)
  if how != "full" && right_schema.length == right_on.length
    return slf.drop(row_index_name) if row_index_used
    return slf
  end

  # only use non-idx right columns present in left frame
  right_other =
    Set.new(right_schema.to_h.keys).intersection(left_schema.to_h.keys) - Set.new(right_on)

  # When include_nulls is true, we need to distinguish records after the join that
  # were originally null in the right frame, as opposed to records that were null
  # because the key was missing from the right frame.
  # Add a validity column to track whether row was matched or not.
  if include_nulls
    validity = ["__POLARS_VALIDITY"]
    other = other.with_columns(F.lit(true).alias(validity[0]))
  else
    validity = []
  end

  tmp_name = "__POLARS_RIGHT"
  drop_columns = right_other.map { |name| "#{name}#{tmp_name}" } + validity

  joined = slf.join(
    other.select(*right_on, *right_other, *validity),
    left_on: left_on,
    right_on: right_on,
    how: how,
    suffix: tmp_name,
    coalesce: true,
    maintain_order: maintain_order
  )

  updated_columns = right_other.map do |name|
    right_name = "#{name}#{tmp_name}"
    replacement =
      if include_nulls
        # use left value only when right value failed to join
        F.when(F.col(validity).is_null)
          .then(F.col(name))
          .otherwise(F.col(right_name))
      else
        F.coalesce([right_name, F.col(name)])
      end
    replacement.alias(name)
  end

  result = joined.with_columns(updated_columns).drop(drop_columns)
  result = result.drop(row_index_name) if row_index_used

  _from_rbldf(result._ldf)
end
4496
+
4497
+ # Return the number of non-null elements for each column.
4498
+ #
4499
+ # @return [LazyFrame]
4500
+ #
4501
+ # @example
4502
+ # lf = Polars::LazyFrame.new(
4503
+ # {"a" => [1, 2, 3, 4], "b" => [1, 2, 1, nil], "c" => [nil, nil, nil, nil]}
4504
+ # )
4505
+ # lf.count.collect
4506
+ # # =>
4507
+ # # shape: (1, 3)
4508
+ # # ┌─────┬─────┬─────┐
4509
+ # # │ a ┆ b ┆ c │
4510
+ # # │ --- ┆ --- ┆ --- │
4511
+ # # │ u32 ┆ u32 ┆ u32 │
4512
+ # # ╞═════╪═════╪═════╡
4513
+ # # │ 4 ┆ 3 ┆ 0 │
4514
+ # # └─────┴─────┴─────┘
4515
def count
  # Delegate to the underlying lazy frame's `count`, then re-wrap the result.
  counted = _ldf.count
  _from_rbldf(counted)
end
3577
4518
 
3578
4519
  private
3579
4520
 
@@ -3585,5 +4526,64 @@ module Polars
3585
4526
  def _from_rbldf(rb_ldf)
3586
4527
  self.class._from_rbldf(rb_ldf)
3587
4528
  end
4529
+
4530
def _filter(
  predicates:,
  constraints:,
  invert: false
)
  # Shared implementation behind row filtering: combines positional
  # `predicates` and keyword equality `constraints` into one expression and
  # applies it with `_ldf.filter`, or `_ldf.remove` when `invert` is set.
  # Raises TypeError for an invalid predicate or when nothing is supplied.
  all_predicates = []
  boolean_masks = []

  predicates.each do |predicate|
    # quick exit/skip conditions for literal booleans
    # (is_a? rather than ==, so the check never dispatches to the predicate's own ==)
    keeps_all_rows = invert ? predicate.is_a?(FalseClass) : predicate.is_a?(TrueClass)
    drops_all_rows = invert ? predicate.is_a?(TrueClass) : predicate.is_a?(FalseClass)
    next if keeps_all_rows # ignore; doesn't filter/remove anything
    return clear if drops_all_rows # discard all rows

    # note: identify masks separately from predicates
    if Utils.is_bool_sequence(predicate, include_series: true)
      boolean_masks << Polars::Series.new(predicate, dtype: Boolean)
      next
    end

    invalid =
      if Utils.is_sequence(predicate)
        # a sequence is only valid when every element is an Expr
        predicate.any? { |x| !x.is_a?(Expr) }
      else
        # a scalar must be an Expr or the name of an existing column
        !predicate.is_a?(Expr) &&
          !(predicate.is_a?(::String) && collect_schema.include?(predicate))
      end

    if invalid
      err = predicate.is_a?(Series) ? "Series(…, dtype: #{predicate.dtype})" : predicate.inspect
      msg = "invalid predicate for `filter`: #{err}"
      raise TypeError, msg
    end

    all_predicates.concat(
      Utils.parse_into_list_of_expressions(predicate).map { |x| Utils.wrap_expr(x) }
    )
  end

  # unpack equality constraints from kwargs
  all_predicates.concat(
    constraints.map { |name, value| F.col(name).eq(value) }
  )
  if all_predicates.empty? && boolean_masks.empty?
    msg = "at least one predicate or constraint must be provided"
    raise TypeError, msg
  end

  # if multiple predicates, combine as 'horizontal' expression.
  # The empty case is explicit: an Array is always truthy in Ruby, so it
  # cannot be used as the "no predicates" test.
  combined_predicate =
    if all_predicates.empty?
      nil
    elsif all_predicates.length > 1
      F.all_horizontal(*all_predicates)
    else
      all_predicates[0]
    end

  # apply reduced boolean mask first, if applicable, then predicates
  # (mask support is not implemented yet)
  raise Todo if boolean_masks.any?

  return _from_rbldf(_ldf) if combined_predicate.nil?

  rbexpr = combined_predicate._rbexpr
  _from_rbldf(invert ? _ldf.remove(rbexpr) : _ldf.filter(rbexpr))
end
3588
4588
  end
3589
4589
  end