polars-df 0.21.0 → 0.21.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/Cargo.lock +1 -1
  4. data/ext/polars/Cargo.toml +7 -1
  5. data/ext/polars/src/conversion/mod.rs +92 -4
  6. data/ext/polars/src/exceptions.rs +1 -0
  7. data/ext/polars/src/expr/array.rs +73 -4
  8. data/ext/polars/src/expr/binary.rs +26 -1
  9. data/ext/polars/src/expr/bitwise.rs +39 -0
  10. data/ext/polars/src/expr/categorical.rs +20 -0
  11. data/ext/polars/src/expr/datatype.rs +24 -1
  12. data/ext/polars/src/expr/datetime.rs +58 -0
  13. data/ext/polars/src/expr/general.rs +84 -5
  14. data/ext/polars/src/expr/list.rs +24 -0
  15. data/ext/polars/src/expr/meta.rs +11 -0
  16. data/ext/polars/src/expr/mod.rs +1 -0
  17. data/ext/polars/src/expr/name.rs +8 -0
  18. data/ext/polars/src/expr/rolling.rs +20 -0
  19. data/ext/polars/src/expr/string.rs +59 -0
  20. data/ext/polars/src/expr/struct.rs +9 -1
  21. data/ext/polars/src/functions/io.rs +19 -0
  22. data/ext/polars/src/functions/lazy.rs +4 -0
  23. data/ext/polars/src/lazyframe/general.rs +51 -0
  24. data/ext/polars/src/lib.rs +119 -10
  25. data/ext/polars/src/map/dataframe.rs +2 -2
  26. data/ext/polars/src/map/series.rs +1 -1
  27. data/ext/polars/src/series/aggregation.rs +44 -0
  28. data/ext/polars/src/series/general.rs +64 -4
  29. data/lib/polars/array_expr.rb +382 -3
  30. data/lib/polars/array_name_space.rb +281 -0
  31. data/lib/polars/binary_expr.rb +67 -0
  32. data/lib/polars/binary_name_space.rb +43 -0
  33. data/lib/polars/cat_expr.rb +224 -0
  34. data/lib/polars/cat_name_space.rb +138 -0
  35. data/lib/polars/config.rb +2 -2
  36. data/lib/polars/convert.rb +6 -6
  37. data/lib/polars/data_frame.rb +684 -19
  38. data/lib/polars/data_type_expr.rb +52 -0
  39. data/lib/polars/data_types.rb +14 -2
  40. data/lib/polars/date_time_expr.rb +251 -0
  41. data/lib/polars/date_time_name_space.rb +299 -0
  42. data/lib/polars/expr.rb +1213 -180
  43. data/lib/polars/functions/datatype.rb +21 -0
  44. data/lib/polars/functions/lazy.rb +13 -0
  45. data/lib/polars/io/csv.rb +1 -1
  46. data/lib/polars/io/json.rb +4 -4
  47. data/lib/polars/io/ndjson.rb +4 -4
  48. data/lib/polars/io/parquet.rb +27 -5
  49. data/lib/polars/lazy_frame.rb +936 -20
  50. data/lib/polars/list_expr.rb +196 -4
  51. data/lib/polars/list_name_space.rb +201 -4
  52. data/lib/polars/meta_expr.rb +64 -0
  53. data/lib/polars/name_expr.rb +36 -0
  54. data/lib/polars/schema.rb +79 -3
  55. data/lib/polars/selector.rb +72 -0
  56. data/lib/polars/selectors.rb +3 -3
  57. data/lib/polars/series.rb +1051 -54
  58. data/lib/polars/string_expr.rb +411 -6
  59. data/lib/polars/string_name_space.rb +722 -49
  60. data/lib/polars/struct_expr.rb +103 -0
  61. data/lib/polars/struct_name_space.rb +19 -1
  62. data/lib/polars/utils/various.rb +18 -1
  63. data/lib/polars/utils.rb +5 -1
  64. data/lib/polars/version.rb +1 -1
  65. data/lib/polars.rb +2 -0
  66. metadata +4 -1
@@ -288,6 +288,201 @@ module Polars
288
288
  )
289
289
  end
290
290
 
291
+ # Execute a SQL query against the LazyFrame.
292
+ #
293
+ # @note
294
+ # This functionality is considered **unstable**, although it is close to
295
+ # being considered stable. It may be changed at any point without it being
296
+ # considered a breaking change.
297
+ #
298
+ # @param query [String]
299
+ # SQL query to execute.
300
+ # @param table_name [String]
301
+ # Optionally provide an explicit name for the table that represents the
302
+ # calling frame (defaults to "self").
303
+ #
304
+ # @return [LazyFrame]
305
+ #
306
+ # @note
307
+ # * The calling frame is automatically registered as a table in the SQL context
308
+ # under the name "self". If you want access to the DataFrames and LazyFrames
309
+ # found in the current globals, use the top-level `Polars.sql`.
310
+ # * More control over registration and execution behaviour is available by
311
+ # using the `SQLContext` object.
312
+ #
313
+ # @example Query the LazyFrame using SQL:
314
+ # lf1 = Polars::LazyFrame.new({"a" => [1, 2, 3], "b" => [6, 7, 8], "c" => ["z", "y", "x"]})
315
+ # lf2 = Polars::LazyFrame.new({"a" => [3, 2, 1], "d" => [125, -654, 888]})
316
+ # lf1.sql("SELECT c, b FROM self WHERE a > 1").collect
317
+ # # =>
318
+ # # shape: (2, 2)
319
+ # # ┌─────┬─────┐
320
+ # # │ c ┆ b │
321
+ # # │ --- ┆ --- │
322
+ # # │ str ┆ i64 │
323
+ # # ╞═════╪═════╡
324
+ # # │ y ┆ 7 │
325
+ # # │ x ┆ 8 │
326
+ # # └─────┴─────┘
327
+ #
328
+ # @example Apply SQL transforms (aliasing "self" to "frame") then filter natively (you can freely mix SQL and native operations):
329
+ # lf1.sql(
330
+ # "
331
+ # SELECT
332
+ # a,
333
+ # (a % 2 == 0) AS a_is_even,
334
+ # (b::float4 / 2) AS \"b/2\",
335
+ # CONCAT_WS(':', c, c, c) AS c_c_c
336
+ # FROM frame
337
+ # ORDER BY a
338
+ # ",
339
+ # table_name: "frame",
340
+ # ).filter(~Polars.col("c_c_c").str.starts_with("x")).collect
341
+ # # =>
342
+ # # shape: (2, 4)
343
+ # # ┌─────┬───────────┬─────┬───────┐
344
+ # # │ a ┆ a_is_even ┆ b/2 ┆ c_c_c │
345
+ # # │ --- ┆ --- ┆ --- ┆ --- │
346
+ # # │ i64 ┆ bool ┆ f32 ┆ str │
347
+ # # ╞═════╪═══════════╪═════╪═══════╡
348
+ # # │ 1 ┆ false ┆ 3.0 ┆ z:z:z │
349
+ # # │ 2 ┆ true ┆ 3.5 ┆ y:y:y │
350
+ # # └─────┴───────────┴─────┴───────┘
351
+ def sql(query, table_name: "self")
352
+ ctx = Polars::SQLContext.new
353
+ name = table_name || "self"
354
+ ctx.register(name, self)
355
+ ctx.execute(query)
356
+ end
357
+
358
+ # Return the `k` largest rows.
359
+ #
360
+ # Non-null elements are always preferred over null elements, regardless of
361
+ # the value of `reverse`. The output is not guaranteed to be in any
362
+ # particular order; call `sort` after this function if you wish the
363
+ # output to be sorted.
364
+ #
365
+ # @param k [Integer]
366
+ # Number of rows to return.
367
+ # @param by [Object]
368
+ # Column(s) used to determine the top rows.
369
+ # Accepts expression input. Strings are parsed as column names.
370
+ # @param reverse [Object]
371
+ # Consider the `k` smallest elements of the `by` column(s) (instead of the `k`
372
+ # largest). This can be specified per column by passing a sequence of
373
+ # booleans.
374
+ #
375
+ # @return [LazyFrame]
376
+ #
377
+ # @example Get the rows which contain the 4 largest values in column b.
378
+ # lf = Polars::LazyFrame.new(
379
+ # {
380
+ # "a" => ["a", "b", "a", "b", "b", "c"],
381
+ # "b" => [2, 1, 1, 3, 2, 1]
382
+ # }
383
+ # )
384
+ # lf.top_k(4, by: "b").collect
385
+ # # =>
386
+ # # shape: (4, 2)
387
+ # # ┌─────┬─────┐
388
+ # # │ a ┆ b │
389
+ # # │ --- ┆ --- │
390
+ # # │ str ┆ i64 │
391
+ # # ╞═════╪═════╡
392
+ # # │ b ┆ 3 │
393
+ # # │ a ┆ 2 │
394
+ # # │ b ┆ 2 │
395
+ # # │ b ┆ 1 │
396
+ # # └─────┴─────┘
397
+ #
398
+ # @example Get the rows which contain the 4 largest values when sorting on column b and a.
399
+ # lf.top_k(4, by: ["b", "a"]).collect
400
+ # # =>
401
+ # # shape: (4, 2)
402
+ # # ┌─────┬─────┐
403
+ # # │ a ┆ b │
404
+ # # │ --- ┆ --- │
405
+ # # │ str ┆ i64 │
406
+ # # ╞═════╪═════╡
407
+ # # │ b ┆ 3 │
408
+ # # │ b ┆ 2 │
409
+ # # │ a ┆ 2 │
410
+ # # │ c ┆ 1 │
411
+ # # └─────┴─────┘
412
+ def top_k(
413
+ k,
414
+ by:,
415
+ reverse: false
416
+ )
417
+ by = Utils.parse_into_list_of_expressions(by)
418
+ reverse = Utils.extend_bool(reverse, by.length, "reverse", "by")
419
+ _from_rbldf(_ldf.top_k(k, by, reverse))
420
+ end
421
+
422
+ # Return the `k` smallest rows.
423
+ #
424
+ # Non-null elements are always preferred over null elements, regardless of
425
+ # the value of `reverse`. The output is not guaranteed to be in any
426
+ # particular order; call `sort` after this function if you wish the
427
+ # output to be sorted.
428
+ #
429
+ # @param k [Integer]
430
+ # Number of rows to return.
431
+ # @param by [Object]
432
+ # Column(s) used to determine the bottom rows.
433
+ # Accepts expression input. Strings are parsed as column names.
434
+ # @param reverse [Object]
435
+ # Consider the `k` largest elements of the `by` column(s) (instead of the `k`
436
+ # smallest). This can be specified per column by passing a sequence of
437
+ # booleans.
438
+ #
439
+ # @return [LazyFrame]
440
+ #
441
+ # @example Get the rows which contain the 4 smallest values in column b.
442
+ # lf = Polars::LazyFrame.new(
443
+ # {
444
+ # "a" => ["a", "b", "a", "b", "b", "c"],
445
+ # "b" => [2, 1, 1, 3, 2, 1]
446
+ # }
447
+ # )
448
+ # lf.bottom_k(4, by: "b").collect
449
+ # # =>
450
+ # # shape: (4, 2)
451
+ # # ┌─────┬─────┐
452
+ # # │ a ┆ b │
453
+ # # │ --- ┆ --- │
454
+ # # │ str ┆ i64 │
455
+ # # ╞═════╪═════╡
456
+ # # │ b ┆ 1 │
457
+ # # │ a ┆ 1 │
458
+ # # │ c ┆ 1 │
459
+ # # │ a ┆ 2 │
460
+ # # └─────┴─────┘
461
+ #
462
+ # @example Get the rows which contain the 4 smallest values when sorting on column a and b.
463
+ # lf.bottom_k(4, by: ["a", "b"]).collect
464
+ # # =>
465
+ # # shape: (4, 2)
466
+ # # ┌─────┬─────┐
467
+ # # │ a ┆ b │
468
+ # # │ --- ┆ --- │
469
+ # # │ str ┆ i64 │
470
+ # # ╞═════╪═════╡
471
+ # # │ a ┆ 1 │
472
+ # # │ a ┆ 2 │
473
+ # # │ b ┆ 1 │
474
+ # # │ b ┆ 2 │
475
+ # # └─────┴─────┘
476
+ def bottom_k(
477
+ k,
478
+ by:,
479
+ reverse: false
480
+ )
481
+ by = Utils.parse_into_list_of_expressions(by)
482
+ reverse = Utils.extend_bool(reverse, by.length, "reverse", "by")
483
+ _from_rbldf(_ldf.bottom_k(k, by, reverse))
484
+ end
485
+
291
486
  # def profile
292
487
  # end
293
488
 
@@ -379,6 +574,41 @@ module Polars
379
574
  Utils.wrap_df(ldf.collect)
380
575
  end
381
576
 
577
+ # Resolve the schema of this LazyFrame.
578
+ #
579
+ # @return [Schema]
580
+ #
581
+ # @example Determine the schema.
582
+ # lf = Polars::LazyFrame.new(
583
+ # {
584
+ # "foo" => [1, 2, 3],
585
+ # "bar" => [6.0, 7.0, 8.0],
586
+ # "ham" => ["a", "b", "c"]
587
+ # }
588
+ # )
589
+ # lf.collect_schema
590
+ # # => Polars::Schema({"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String})
591
+ #
592
+ # @example Access various properties of the schema.
593
+ # schema = lf.collect_schema
594
+ # schema["bar"]
595
+ # # => Polars::Float64
596
+ #
597
+ # @example
598
+ # schema.names
599
+ # # => ["foo", "bar", "ham"]
600
+ #
601
+ # @example
602
+ # schema.dtypes
603
+ # # => [Polars::Int64, Polars::Float64, Polars::String]
604
+ #
605
+ # @example
606
+ # schema.length
607
+ # # => 3
608
+ def collect_schema
609
+ Schema.new(_ldf.collect_schema, check_dtypes: false)
610
+ end
611
+
382
612
  # Persists a LazyFrame at the provided path.
383
613
  #
384
614
  # This allows streaming results that are larger than RAM to be written to disk.
@@ -1147,6 +1377,140 @@ module Polars
1147
1377
  )
1148
1378
  end
1149
1379
 
1380
+ # Remove rows, dropping those that match the given predicate expression(s).
1381
+ #
1382
+ # The original order of the remaining rows is preserved.
1383
+ #
1384
+ # Rows where the filter predicate does not evaluate to true are retained
1385
+ # (this includes rows where the predicate evaluates as `null`).
1386
+ #
1387
+ # @param predicates [Array]
1388
+ # Expression that evaluates to a boolean Series.
1389
+ # @param constraints [Hash]
1390
+ # Column filters; use `name: value` to filter columns using the supplied
1391
+ # value. Each constraint behaves the same as `Polars.col(name).eq(value)`,
1392
+ # and is implicitly joined with the other filter conditions using `&`.
1393
+ #
1394
+ # @return [LazyFrame]
1395
+ #
1396
+ # @example Remove rows matching a condition:
1397
+ # lf = Polars::LazyFrame.new(
1398
+ # {
1399
+ # "foo" => [2, 3, nil, 4, 0],
1400
+ # "bar" => [5, 6, nil, nil, 0],
1401
+ # "ham" => ["a", "b", nil, "c", "d"]
1402
+ # }
1403
+ # )
1404
+ # lf.remove(
1405
+ # Polars.col("bar") >= 5
1406
+ # ).collect
1407
+ # # =>
1408
+ # # shape: (3, 3)
1409
+ # # ┌──────┬──────┬──────┐
1410
+ # # │ foo ┆ bar ┆ ham │
1411
+ # # │ --- ┆ --- ┆ --- │
1412
+ # # │ i64 ┆ i64 ┆ str │
1413
+ # # ╞══════╪══════╪══════╡
1414
+ # # │ null ┆ null ┆ null │
1415
+ # # │ 4 ┆ null ┆ c │
1416
+ # # │ 0 ┆ 0 ┆ d │
1417
+ # # └──────┴──────┴──────┘
1418
+ #
1419
+ # @example Discard rows based on multiple conditions, combined with and/or operators:
1420
+ # lf.remove(
1421
+ # (Polars.col("foo") >= 0) & (Polars.col("bar") >= 0)
1422
+ # ).collect
1423
+ # # =>
1424
+ # # shape: (2, 3)
1425
+ # # ┌──────┬──────┬──────┐
1426
+ # # │ foo ┆ bar ┆ ham │
1427
+ # # │ --- ┆ --- ┆ --- │
1428
+ # # │ i64 ┆ i64 ┆ str │
1429
+ # # ╞══════╪══════╪══════╡
1430
+ # # │ null ┆ null ┆ null │
1431
+ # # │ 4 ┆ null ┆ c │
1432
+ # # └──────┴──────┴──────┘
1433
+ #
1434
+ # @example
1435
+ # lf.remove(
1436
+ # (Polars.col("foo") >= 0) | (Polars.col("bar") >= 0)
1437
+ # ).collect
1438
+ # # =>
1439
+ # # shape: (1, 3)
1440
+ # # ┌──────┬──────┬──────┐
1441
+ # # │ foo ┆ bar ┆ ham │
1442
+ # # │ --- ┆ --- ┆ --- │
1443
+ # # │ i64 ┆ i64 ┆ str │
1444
+ # # ╞══════╪══════╪══════╡
1445
+ # # │ null ┆ null ┆ null │
1446
+ # # └──────┴──────┴──────┘
1447
+ #
1448
+ # @example Provide multiple constraints using `*args` syntax:
1449
+ # lf.remove(
1450
+ # Polars.col("ham").is_not_null,
1451
+ # Polars.col("bar") >= 0
1452
+ # ).collect
1453
+ # # =>
1454
+ # # shape: (2, 3)
1455
+ # # ┌──────┬──────┬──────┐
1456
+ # # │ foo ┆ bar ┆ ham │
1457
+ # # │ --- ┆ --- ┆ --- │
1458
+ # # │ i64 ┆ i64 ┆ str │
1459
+ # # ╞══════╪══════╪══════╡
1460
+ # # │ null ┆ null ┆ null │
1461
+ # # │ 4 ┆ null ┆ c │
1462
+ # # └──────┴──────┴──────┘
1463
+ #
1464
+ # @example Provide constraint(s) using `**kwargs` syntax:
1465
+ # lf.remove(foo: 0, bar: 0).collect
1466
+ # # =>
1467
+ # # shape: (4, 3)
1468
+ # # ┌──────┬──────┬──────┐
1469
+ # # │ foo ┆ bar ┆ ham │
1470
+ # # │ --- ┆ --- ┆ --- │
1471
+ # # │ i64 ┆ i64 ┆ str │
1472
+ # # ╞══════╪══════╪══════╡
1473
+ # # │ 2 ┆ 5 ┆ a │
1474
+ # # │ 3 ┆ 6 ┆ b │
1475
+ # # │ null ┆ null ┆ null │
1476
+ # # │ 4 ┆ null ┆ c │
1477
+ # # └──────┴──────┴──────┘
1478
+ #
1479
+ # @example Remove rows by comparing two columns against each other; in this case, we remove rows where the two columns are not equal (using `ne_missing` to ensure that null values compare equal):
1480
+ # lf.remove(
1481
+ # Polars.col("foo").ne_missing(Polars.col("bar"))
1482
+ # ).collect
1483
+ # # =>
1484
+ # # shape: (2, 3)
1485
+ # # ┌──────┬──────┬──────┐
1486
+ # # │ foo ┆ bar ┆ ham │
1487
+ # # │ --- ┆ --- ┆ --- │
1488
+ # # │ i64 ┆ i64 ┆ str │
1489
+ # # ╞══════╪══════╪══════╡
1490
+ # # │ null ┆ null ┆ null │
1491
+ # # │ 0 ┆ 0 ┆ d │
1492
+ # # └──────┴──────┴──────┘
1493
+ def remove(
1494
+ *predicates,
1495
+ **constraints
1496
+ )
1497
+ if constraints.empty?
1498
+ # early-exit conditions (exclude/include all rows)
1499
+ if predicates.empty? || (predicates.length == 1 && predicates[0].is_a?(TrueClass))
1500
+ return clear
1501
+ end
1502
+ if predicates.length == 1 && predicates[0].is_a?(FalseClass)
1503
+ return dup
1504
+ end
1505
+ end
1506
+
1507
+ _filter(
1508
+ predicates: predicates,
1509
+ constraints: constraints,
1510
+ invert: true
1511
+ )
1512
+ end
1513
+
1150
1514
  # Select columns from this DataFrame.
1151
1515
  #
1152
1516
  # @param exprs [Array]
@@ -1244,6 +1608,29 @@ module Polars
1244
1608
  _from_rbldf(_ldf.select(rbexprs))
1245
1609
  end
1246
1610
 
1611
+ # Select columns from this LazyFrame.
1612
+ #
1613
+ # This will run all expressions sequentially instead of in parallel.
1614
+ # Use this when the work per expression is cheap.
1615
+ #
1616
+ # @param exprs [Array]
1617
+ # Column(s) to select, specified as positional arguments.
1618
+ # Accepts expression input. Strings are parsed as column names,
1619
+ # other non-expression inputs are parsed as literals.
1620
+ # @param named_exprs [Hash]
1621
+ # Additional columns to select, specified as keyword arguments.
1622
+ # The columns will be renamed to the keyword used.
1623
+ #
1624
+ # @return [LazyFrame]
1625
+ def select_seq(*exprs, **named_exprs)
1626
+ structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", 0).to_i != 0
1627
+
1628
+ rbexprs = Utils.parse_into_list_of_expressions(
1629
+ *exprs, **named_exprs, __structify: structify
1630
+ )
1631
+ _from_rbldf(_ldf.select_seq(rbexprs))
1632
+ end
1633
+
1247
1634
  # Start a group by operation.
1248
1635
  #
1249
1636
  # @param by [Array]
@@ -1440,9 +1827,9 @@ module Polars
1440
1827
  # @param every [Object]
1441
1828
  # Interval of the window.
1442
1829
  # @param period [Object]
1443
- # Length of the window, if None it is equal to 'every'.
1830
+ # Length of the window; if nil, it is equal to 'every'.
1444
1831
  # @param offset [Object]
1445
- # Offset of the window if None and period is None it will be equal to negative
1832
+ # Offset of the window; if nil and period is nil, it will be equal to negative
1446
1833
  # `every`.
1447
1834
  # @param truncate [Boolean]
1448
1835
  # Truncate the time value to the window lower bound.
@@ -1714,7 +2101,7 @@ module Polars
1714
2101
  # Join column of the right DataFrame.
1715
2102
  # @param on [String]
1716
2103
  # Join column of both DataFrames. If set, `left_on` and `right_on` should be
1717
- # None.
2104
+ # nil.
1718
2105
  # @param by_left [Object]
1719
2106
  # Join on these columns before doing asof join.
1720
2107
  # @param by_right [Object]
@@ -2039,7 +2426,7 @@ module Polars
2039
2426
  # Join column of the right DataFrame.
2040
2427
  # @param on [Object]
2041
2428
  # Join column of both DataFrames. If set, `left_on` and `right_on` should be
2042
- # None.
2429
+ # nil.
2043
2430
  # @param how ["inner", "left", "full", "semi", "anti", "cross"]
2044
2431
  # Join strategy.
2045
2432
  # @param suffix [String]
@@ -2234,6 +2621,103 @@ module Polars
2234
2621
  )
2235
2622
  end
2236
2623
 
2624
+ # Perform a join based on one or multiple (in)equality predicates.
2625
+ #
2626
+ # This performs an inner join, so only rows where all predicates are true
2627
+ # are included in the result, and a row from either DataFrame may be included
2628
+ # multiple times in the result.
2629
+ #
2630
+ # @note
2631
+ # The row order of the input DataFrames is not preserved.
2632
+ #
2633
+ # @note
2634
+ # This functionality is experimental. It may be
2635
+ # changed at any point without it being considered a breaking change.
2636
+ #
2637
+ # @param other [Object]
2638
+ # DataFrame to join with.
2639
+ # @param predicates [Object]
2640
+ # (In)Equality condition to join the two tables on.
2641
+ # When a column name occurs in both tables, the proper suffix must
2642
+ # be applied in the predicate.
2643
+ # @param suffix [String]
2644
+ # Suffix to append to columns with a duplicate name.
2645
+ #
2646
+ # @return [LazyFrame]
2647
+ #
2648
+ # @example Join two lazyframes together based on two predicates which get AND-ed together.
2649
+ # east = Polars::LazyFrame.new(
2650
+ # {
2651
+ # "id" => [100, 101, 102],
2652
+ # "dur" => [120, 140, 160],
2653
+ # "rev" => [12, 14, 16],
2654
+ # "cores" => [2, 8, 4]
2655
+ # }
2656
+ # )
2657
+ # west = Polars::LazyFrame.new(
2658
+ # {
2659
+ # "t_id" => [404, 498, 676, 742],
2660
+ # "time" => [90, 130, 150, 170],
2661
+ # "cost" => [9, 13, 15, 16],
2662
+ # "cores" => [4, 2, 1, 4]
2663
+ # }
2664
+ # )
2665
+ # east.join_where(
2666
+ # west,
2667
+ # Polars.col("dur") < Polars.col("time"),
2668
+ # Polars.col("rev") < Polars.col("cost")
2669
+ # ).collect
2670
+ # # =>
2671
+ # # shape: (5, 8)
2672
+ # # ┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐
2673
+ # # │ id ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │
2674
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2675
+ # # │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
2676
+ # # ╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡
2677
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 498 ┆ 130 ┆ 13 ┆ 2 │
2678
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
2679
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
2680
+ # # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
2681
+ # # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
2682
+ # # └─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
2683
+ #
2684
+ # @example To OR them together, use a single expression and the `|` operator.
2685
+ # east.join_where(
2686
+ # west,
2687
+ # (Polars.col("dur") < Polars.col("time")) | (Polars.col("rev") < Polars.col("cost"))
2688
+ # ).collect
2689
+ # # =>
2690
+ # # shape: (6, 8)
2691
+ # # ┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐
2692
+ # # │ id ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │
2693
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2694
+ # # │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
2695
+ # # ╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡
2696
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 498 ┆ 130 ┆ 13 ┆ 2 │
2697
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
2698
+ # # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
2699
+ # # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
2700
+ # # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
2701
+ # # │ 102 ┆ 160 ┆ 16 ┆ 4 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
2702
+ # # └─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
2703
+ def join_where(
2704
+ other,
2705
+ *predicates,
2706
+ suffix: "_right"
2707
+ )
2708
+ Utils.require_same_type(self, other)
2709
+
2710
+ rbexprs = Utils.parse_into_list_of_expressions(*predicates)
2711
+
2712
+ _from_rbldf(
2713
+ _ldf.join_where(
2714
+ other._ldf,
2715
+ rbexprs,
2716
+ suffix
2717
+ )
2718
+ )
2719
+ end
2720
+
2237
2721
  # Add or overwrite multiple columns in a DataFrame.
2238
2722
  #
2239
2723
  # @param exprs [Object]
@@ -2279,6 +2763,34 @@ module Polars
2279
2763
  _from_rbldf(_ldf.with_columns(rbexprs))
2280
2764
  end
2281
2765
 
2766
+ # Add columns to this LazyFrame.
2767
+ #
2768
+ # Added columns will replace existing columns with the same name.
2769
+ #
2770
+ # This will run all expressions sequentially instead of in parallel.
2771
+ # Use this when the work per expression is cheap.
2772
+ #
2773
+ # @param exprs [Array]
2774
+ # Column(s) to add, specified as positional arguments.
2775
+ # Accepts expression input. Strings are parsed as column names, other
2776
+ # non-expression inputs are parsed as literals.
2777
+ # @param named_exprs [Hash]
2778
+ # Additional columns to add, specified as keyword arguments.
2779
+ # The columns will be renamed to the keyword used.
2780
+ #
2781
+ # @return [LazyFrame]
2782
+ def with_columns_seq(
2783
+ *exprs,
2784
+ **named_exprs
2785
+ )
2786
+ structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", 0).to_i != 0
2787
+
2788
+ rbexprs = Utils.parse_into_list_of_expressions(
2789
+ *exprs, **named_exprs, __structify: structify
2790
+ )
2791
+ _from_rbldf(_ldf.with_columns_seq(rbexprs))
2792
+ end
2793
+
2282
2794
  # Add an external context to the computation graph.
2283
2795
  #
2284
2796
  # This allows expressions to also access columns from DataFrames
@@ -2887,7 +3399,7 @@ module Polars
2887
3399
  #
2888
3400
  # @example
2889
3401
  # s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]}).lazy
2890
- # s.take_every(2).collect
3402
+ # s.gather_every(2).collect
2891
3403
  # # =>
2892
3404
  # # shape: (2, 2)
2893
3405
  # # ┌─────┬─────┐
@@ -2898,9 +3410,10 @@ module Polars
2898
3410
  # # │ 1 ┆ 5 │
2899
3411
  # # │ 3 ┆ 7 │
2900
3412
  # # └─────┴─────┘
2901
- def take_every(n)
2902
- select(F.col("*").take_every(n))
3413
+ def gather_every(n)
3414
+ select(F.col("*").gather_every(n))
2903
3415
  end
3416
+ alias_method :take_every, :gather_every
2904
3417
 
2905
3418
  # Fill null values using the specified value or strategy.
2906
3419
  #
@@ -3177,6 +3690,32 @@ module Polars
3177
3690
  _from_rbldf(_ldf.median)
3178
3691
  end
3179
3692
 
3693
+ # Aggregate the columns in the LazyFrame as the sum of their null value count.
3694
+ #
3695
+ # @return [LazyFrame]
3696
+ #
3697
+ # @example
3698
+ # lf = Polars::LazyFrame.new(
3699
+ # {
3700
+ # "foo" => [1, nil, 3],
3701
+ # "bar" => [6, 7, nil],
3702
+ # "ham" => ["a", "b", "c"]
3703
+ # }
3704
+ # )
3705
+ # lf.null_count.collect
3706
+ # # =>
3707
+ # # shape: (1, 3)
3708
+ # # ┌─────┬─────┬─────┐
3709
+ # # │ foo ┆ bar ┆ ham │
3710
+ # # │ --- ┆ --- ┆ --- │
3711
+ # # │ u32 ┆ u32 ┆ u32 │
3712
+ # # ╞═════╪═════╪═════╡
3713
+ # # │ 1 ┆ 1 ┆ 0 │
3714
+ # # └─────┴─────┴─────┘
3715
+ def null_count
3716
+ _from_rbldf(_ldf.null_count)
3717
+ end
3718
+
3180
3719
  # Aggregate the columns in the DataFrame to their quantile value.
3181
3720
  #
3182
3721
  # @param quantile [Float]
@@ -3307,37 +3846,103 @@ module Polars
3307
3846
  _from_rbldf(_ldf.unique(maintain_order, selector_subset, keep))
3308
3847
  end
3309
3848
 
3310
- # Drop rows with null values from this LazyFrame.
3849
+ # Drop all rows that contain one or more NaN values.
3850
+ #
3851
+ # The original order of the remaining rows is preserved.
3311
3852
  #
3312
3853
  # @param subset [Object]
3313
- # Subset of column(s) on which `drop_nulls` will be applied.
3854
+ # Column name(s) for which NaN values are considered; if set to `nil`
3855
+ # (default), use all columns (note that only floating-point columns
3856
+ # can contain NaNs).
3314
3857
  #
3315
3858
  # @return [LazyFrame]
3316
3859
  #
3317
3860
  # @example
3318
- # df = Polars::DataFrame.new(
3861
+ # lf = Polars::LazyFrame.new(
3862
+ # {
3863
+ # "foo" => [-20.5, Float::NAN, 80.0],
3864
+ # "bar" => [Float::NAN, 110.0, 25.5],
3865
+ # "ham" => ["xxx", "yyy", nil]
3866
+ # }
3867
+ # )
3868
+ # lf.drop_nans.collect
3869
+ # # =>
3870
+ # # shape: (1, 3)
3871
+ # # ┌──────┬──────┬──────┐
3872
+ # # │ foo ┆ bar ┆ ham │
3873
+ # # │ --- ┆ --- ┆ --- │
3874
+ # # │ f64 ┆ f64 ┆ str │
3875
+ # # ╞══════╪══════╪══════╡
3876
+ # # │ 80.0 ┆ 25.5 ┆ null │
3877
+ # # └──────┴──────┴──────┘
3878
+ #
3879
+ # @example
3880
+ # lf.drop_nans(subset: ["bar"]).collect
3881
+ # # =>
3882
+ # # shape: (2, 3)
3883
+ # # ┌──────┬───────┬──────┐
3884
+ # # │ foo ┆ bar ┆ ham │
3885
+ # # │ --- ┆ --- ┆ --- │
3886
+ # # │ f64 ┆ f64 ┆ str │
3887
+ # # ╞══════╪═══════╪══════╡
3888
+ # # │ NaN ┆ 110.0 ┆ yyy │
3889
+ # # │ 80.0 ┆ 25.5 ┆ null │
3890
+ # # └──────┴───────┴──────┘
3891
+ def drop_nans(subset: nil)
3892
+ selector_subset = nil
3893
+ if !subset.nil?
3894
+ selector_subset = Utils.parse_list_into_selector(subset)._rbselector
3895
+ end
3896
+ _from_rbldf(_ldf.drop_nans(selector_subset))
3897
+ end
3898
+
3899
+ # Drop all rows that contain one or more null values.
3900
+ #
3901
+ # The original order of the remaining rows is preserved.
3902
+ #
3903
+ # @param subset [Object]
3904
+ # Column name(s) for which null values are considered.
3905
+ # If set to `nil` (default), use all columns.
3906
+ #
3907
+ # @return [LazyFrame]
3908
+ #
3909
+ # @example
3910
+ # lf = Polars::LazyFrame.new(
3319
3911
  # {
3320
3912
  # "foo" => [1, 2, 3],
3321
3913
  # "bar" => [6, nil, 8],
3322
- # "ham" => ["a", "b", "c"]
3914
+ # "ham" => ["a", "b", nil]
3323
3915
  # }
3324
3916
  # )
3325
- # df.lazy.drop_nulls.collect
3917
+ # lf.drop_nulls.collect
3326
3918
  # # =>
3327
- # # shape: (2, 3)
3919
+ # # shape: (1, 3)
3328
3920
  # # ┌─────┬─────┬─────┐
3329
3921
  # # │ foo ┆ bar ┆ ham │
3330
3922
  # # │ --- ┆ --- ┆ --- │
3331
3923
  # # │ i64 ┆ i64 ┆ str │
3332
3924
  # # ╞═════╪═════╪═════╡
3333
3925
  # # │ 1 ┆ 6 ┆ a │
3334
- # # │ 3 ┆ 8 ┆ c │
3335
3926
  # # └─────┴─────┴─────┘
3927
+ #
3928
+ # @example
3929
+ # lf.drop_nulls(subset: Polars.cs.integer).collect
3930
+ # # =>
3931
+ # # shape: (2, 3)
3932
+ # # ┌─────┬─────┬──────┐
3933
+ # # │ foo ┆ bar ┆ ham │
3934
+ # # │ --- ┆ --- ┆ --- │
3935
+ # # │ i64 ┆ i64 ┆ str │
3936
+ # # ╞═════╪═════╪══════╡
3937
+ # # │ 1 ┆ 6 ┆ a │
3938
+ # # │ 3 ┆ 8 ┆ null │
3939
+ # # └─────┴─────┴──────┘
3336
3940
  def drop_nulls(subset: nil)
3337
- if !subset.nil? && !subset.is_a?(::Array)
3338
- subset = [subset]
3941
+ selector_subset = nil
3942
+ if !subset.nil?
3943
+ selector_subset = Utils.parse_list_into_selector(subset)._rbselector
3339
3944
  end
3340
- _from_rbldf(_ldf.drop_nulls(subset))
3945
+ _from_rbldf(_ldf.drop_nulls(selector_subset))
3341
3946
  end
3342
3947
 
3343
3948
  # Unpivot a DataFrame from wide to long format.
@@ -3571,9 +4176,261 @@ module Polars
3571
4176
  with_columns(F.col(column).set_sorted(descending: descending))
3572
4177
  end
3573
4178
 
3574
- # TODO
3575
- # def update
3576
- # end
4179
+ # Update the values in this `LazyFrame` with the values in `other`.
4180
+ #
4181
+ # @note
4182
+ # This functionality is considered **unstable**. It may be changed
4183
+ # at any point without it being considered a breaking change.
4184
+ #
4185
+ # @param other [LazyFrame]
4186
+ # LazyFrame that will be used to update the values
4187
+ # @param on [Object]
4188
+ # Column names that will be joined on. If set to `nil` (default),
4189
+ # the implicit row index of each frame is used as a join key.
4190
+ # @param how ['left', 'inner', 'full']
4191
+ # * 'left' will keep all rows from the left table; rows may be duplicated
4192
+ # if multiple rows in the right frame match the left row's key.
4193
+ # * 'inner' keeps only those rows where the key exists in both frames.
4194
+ # * 'full' will update existing rows where the key matches while also
4195
+ # adding any new rows contained in the given frame.
4196
+ # @param left_on [Object]
4197
+ # Join column(s) of the left frame.
4198
+ # @param right_on [Object]
4199
+ # Join column(s) of the right frame.
4200
+ # @param include_nulls [Boolean]
4201
+ # Overwrite values in the left frame with null values from the right frame.
4202
+ # If set to `false` (default), null values in the right frame are ignored.
4203
+ # @param maintain_order ['none', 'left', 'right', 'left_right', 'right_left']
4204
+ # Which order of rows from the inputs to preserve. See `LazyFrame.join`
4205
+ # for details. Unlike `join` this function preserves the left order by
4206
+ # default.
4207
+ #
4208
+ # @return [LazyFrame]
4209
+ #
4210
+ # @note
4211
+ # This is syntactic sugar for a left/inner join that preserves the order
4212
+ # of the left `LazyFrame` by default, with an optional coalesce when
4213
+ # `include_nulls: false`.
4214
+ #
4215
+ # @example Update `lf` values with the non-null values in `new_lf`, by row index:
4216
+ # lf = Polars::LazyFrame.new(
4217
+ # {
4218
+ # "A" => [1, 2, 3, 4],
4219
+ # "B" => [400, 500, 600, 700]
4220
+ # }
4221
+ # )
4222
+ # new_lf = Polars::LazyFrame.new(
4223
+ # {
4224
+ # "B" => [-66, nil, -99],
4225
+ # "C" => [5, 3, 1]
4226
+ # }
4227
+ # )
4228
+ # lf.update(new_lf).collect
4229
+ # # =>
4230
+ # # shape: (4, 2)
4231
+ # # ┌─────┬─────┐
4232
+ # # │ A ┆ B │
4233
+ # # │ --- ┆ --- │
4234
+ # # │ i64 ┆ i64 │
4235
+ # # ╞═════╪═════╡
4236
+ # # │ 1 ┆ -66 │
4237
+ # # │ 2 ┆ 500 │
4238
+ # # │ 3 ┆ -99 │
4239
+ # # │ 4 ┆ 700 │
4240
+ # # └─────┴─────┘
4241
+ #
4242
+ # @example Update `lf` values with the non-null values in `new_lf`, by row index, but only keeping those rows that are common to both frames:
4243
+ # lf.update(new_lf, how: "inner").collect
4244
+ # # =>
4245
+ # # shape: (3, 2)
4246
+ # # ┌─────┬─────┐
4247
+ # # │ A ┆ B │
4248
+ # # │ --- ┆ --- │
4249
+ # # │ i64 ┆ i64 │
4250
+ # # ╞═════╪═════╡
4251
+ # # │ 1 ┆ -66 │
4252
+ # # │ 2 ┆ 500 │
4253
+ # # │ 3 ┆ -99 │
4254
+ # # └─────┴─────┘
4255
+ #
4256
+ # @example Update `lf` values with the non-null values in `new_lf`, using a full outer join strategy that defines explicit join columns in each frame:
4257
+ # lf.update(new_lf, left_on: ["A"], right_on: ["C"], how: "full").collect
4258
+ # # =>
4259
+ # # shape: (5, 2)
4260
+ # # ┌─────┬─────┐
4261
+ # # │ A ┆ B │
4262
+ # # │ --- ┆ --- │
4263
+ # # │ i64 ┆ i64 │
4264
+ # # ╞═════╪═════╡
4265
+ # # │ 1 ┆ -99 │
4266
+ # # │ 2 ┆ 500 │
4267
+ # # │ 3 ┆ 600 │
4268
+ # # │ 4 ┆ 700 │
4269
+ # # │ 5 ┆ -66 │
4270
+ # # └─────┴─────┘
4271
+ #
4272
+ # @example Update `lf` values including null values in `new_lf`, using a full outer join strategy that defines explicit join columns in each frame:
4273
+ # lf.update(
4274
+ # new_lf, left_on: "A", right_on: "C", how: "full", include_nulls: true
4275
+ # ).collect
4276
+ # # =>
4277
+ # # shape: (5, 2)
4278
+ # # ┌─────┬──────┐
4279
+ # # │ A ┆ B │
4280
+ # # │ --- ┆ --- │
4281
+ # # │ i64 ┆ i64 │
4282
+ # # ╞═════╪══════╡
4283
+ # # │ 1 ┆ -99 │
4284
+ # # │ 2 ┆ 500 │
4285
+ # # │ 3 ┆ null │
4286
+ # # │ 4 ┆ 700 │
4287
+ # # │ 5 ┆ -66 │
4288
+ # # └─────┴──────┘
4289
+ def update(
4290
+ other,
4291
+ on: nil,
4292
+ how: "left",
4293
+ left_on: nil,
4294
+ right_on: nil,
4295
+ include_nulls: false,
4296
+ maintain_order: "left"
4297
+ )
4298
+ Utils.require_same_type(self, other)
4299
+ if ["outer", "outer_coalesce"].include?(how)
4300
+ how = "full"
4301
+ end
4302
+
4303
+ if !["left", "inner", "full"].include?(how)
4304
+ msg = "`how` must be one of {{'left', 'inner', 'full'}}; found #{how.inspect}"
4305
+ raise ArgumentError, msg
4306
+ end
4307
+
4308
+ slf = self
4309
+ row_index_used = false
4310
+ if on.nil?
4311
+ if left_on.nil? && right_on.nil?
4312
+ # no keys provided--use row index
4313
+ row_index_used = true
4314
+ row_index_name = "__POLARS_ROW_INDEX"
4315
+ slf = slf.with_row_index(name: row_index_name)
4316
+ other = other.with_row_index(name: row_index_name)
4317
+ left_on = right_on = [row_index_name]
4318
+ else
4319
+ # one of left or right is missing, raise error
4320
+ if left_on.nil?
4321
+ msg = "missing join columns for left frame"
4322
+ raise ArgumentError, msg
4323
+ end
4324
+ if right_on.nil?
4325
+ msg = "missing join columns for right frame"
4326
+ raise ArgumentError, msg
4327
+ end
4328
+ end
4329
+ else
4330
+ # move on into left/right_on to simplify logic
4331
+ left_on = right_on = on
4332
+ end
4333
+
4334
+ if left_on.is_a?(::String)
4335
+ left_on = [left_on]
4336
+ end
4337
+ if right_on.is_a?(::String)
4338
+ right_on = [right_on]
4339
+ end
4340
+
4341
+ left_schema = slf.collect_schema
4342
+ left_on.each do |name|
4343
+ if !left_schema.include?(name)
4344
+ msg = "left join column #{name.inspect} not found"
4345
+ raise ArgumentError, msg
4346
+ end
4347
+ end
4348
+ right_schema = other.collect_schema
4349
+ right_on.each do |name|
4350
+ if !right_schema.include?(name)
4351
+ msg = "right join column #{name.inspect} not found"
4352
+ raise ArgumentError, msg
4353
+ end
4354
+ end
4355
+
4356
+ # no need to join if *only* join columns are in other (inner/left update only)
4357
+ if how != "full" && right_schema.length == right_on.length
4358
+ if row_index_used
4359
+ return slf.drop(row_index_name)
4360
+ end
4361
+ return slf
4362
+ end
4363
+
4364
+ # only use non-idx right columns present in left frame
4365
+ right_other = Set.new(right_schema.to_h.keys).intersection(left_schema.to_h.keys) - Set.new(right_on)
4366
+
4367
+ # When include_nulls is true, we need to distinguish records after the join that
4368
+ # were originally null in the right frame, as opposed to records that were null
4369
+ # because the key was missing from the right frame.
4370
+ # Add a validity column to track whether row was matched or not.
4371
+ if include_nulls
4372
+ validity = ["__POLARS_VALIDITY"]
4373
+ other = other.with_columns(F.lit(true).alias(validity[0]))
4374
+ else
4375
+ validity = []
4376
+ end
4377
+
4378
+ tmp_name = "__POLARS_RIGHT"
4379
+ drop_columns = right_other.map { |name| "#{name}#{tmp_name}" } + validity
4380
+ result = (
4381
+ slf.join(
4382
+ other.select(*right_on, *right_other, *validity),
4383
+ left_on: left_on,
4384
+ right_on: right_on,
4385
+ how: how,
4386
+ suffix: tmp_name,
4387
+ coalesce: true,
4388
+ maintain_order: maintain_order
4389
+ )
4390
+ .with_columns(
4391
+ right_other.map do |name|
4392
+ (
4393
+ if include_nulls
4394
+ # use left value only when right value failed to join
4395
+ F.when(F.col(validity).is_null)
4396
+ .then(F.col(name))
4397
+ .otherwise(F.col("#{name}#{tmp_name}"))
4398
+ else
4399
+ F.coalesce(["#{name}#{tmp_name}", F.col(name)])
4400
+ end
4401
+ ).alias(name)
4402
+ end
4403
+ )
4404
+ .drop(drop_columns)
4405
+ )
4406
+ if row_index_used
4407
+ result = result.drop(row_index_name)
4408
+ end
4409
+
4410
+ _from_rbldf(result._ldf)
4411
+ end
4412
+
4413
+ # Return the number of non-null elements for each column.
4414
+ #
4415
+ # @return [LazyFrame]
4416
+ #
4417
+ # @example
4418
+ # lf = Polars::LazyFrame.new(
4419
+ # {"a" => [1, 2, 3, 4], "b" => [1, 2, 1, nil], "c" => [nil, nil, nil, nil]}
4420
+ # )
4421
+ # lf.count.collect
4422
+ # # =>
4423
+ # # shape: (1, 3)
4424
+ # # ┌─────┬─────┬─────┐
4425
+ # # │ a ┆ b ┆ c │
4426
+ # # │ --- ┆ --- ┆ --- │
4427
+ # # │ u32 ┆ u32 ┆ u32 │
4428
+ # # ╞═════╪═════╪═════╡
4429
+ # # │ 4 ┆ 3 ┆ 0 │
4430
+ # # └─────┴─────┴─────┘
4431
+ def count
4432
+ _from_rbldf(_ldf.count)
4433
+ end
3577
4434
 
3578
4435
  private
3579
4436
 
@@ -3585,5 +4442,64 @@ module Polars
3585
4442
  def _from_rbldf(rb_ldf)
3586
4443
  self.class._from_rbldf(rb_ldf)
3587
4444
  end
4445
+
4446
+ def _filter(
4447
+ predicates:,
4448
+ constraints:,
4449
+ invert: false
4450
+ )
4451
+ all_predicates = []
4452
+ boolean_masks = []
4453
+
4454
+ predicates.each do |p|
4455
+ # quick exit/skip conditions
4456
+ if (p.is_a?(FalseClass) && invert) || (p.is_a?(TrueClass) && !invert)
4457
+ next # ignore; doesn't filter/remove anything
4458
+ end
4459
+ if (p.is_a?(TrueClass) && invert) || (p.is_a?(FalseClass) && !invert)
4460
+ return clear # discard all rows
4461
+ end
4462
+
4463
+ # note: identify masks separately from predicates
4464
+ if Utils.is_bool_sequence(p, include_series: true)
4465
+ boolean_masks << Polars::Series.new(p, dtype: Boolean)
4466
+ elsif (
4467
+ (is_seq = Utils.is_sequence(p)) && p.any? { |x| !x.is_a?(Expr) }) ||
4468
+ (!is_seq && !p.is_a?(Expr) && !(p.is_a?(::String) && collect_schema.include?(p))
4469
+ )
4470
+ err = p.is_a?(Series) ? "Series(…, dtype: #{p.dtype})" : p.inspect
4471
+ msg = "invalid predicate for `filter`: #{err}"
4472
+ raise TypeError, msg
4473
+ else
4474
+ all_predicates.concat(
4475
+ Utils.parse_into_list_of_expressions(p).map { |x| Utils.wrap_expr(x) }
4476
+ )
4477
+ end
4478
+ end
4479
+
4480
+ # unpack equality constraints from kwargs
4481
+ all_predicates.concat(
4482
+ constraints.map { |name, value| F.col(name).eq(value) }
4483
+ )
4484
+ if !(all_predicates.any? || boolean_masks.any?)
4485
+ msg = "at least one predicate or constraint must be provided"
4486
+ raise TypeError, msg
4487
+ end
4488
+
4489
+ # if multiple predicates, combine as 'horizontal' expression
4490
+ combined_predicate = all_predicates ? (all_predicates.length > 1 ? F.all_horizontal(*all_predicates) : all_predicates[0]) : nil
4491
+
4492
+ # apply reduced boolean mask first, if applicable, then predicates
4493
+ if boolean_masks.any?
4494
+ raise Todo
4495
+ end
4496
+
4497
+ if combined_predicate.nil?
4498
+ return _from_rbldf(_ldf)
4499
+ end
4500
+
4501
+ filter_method = invert ? _ldf.method(:remove) : _ldf.method(:filter)
4502
+ _from_rbldf(filter_method.(combined_predicate._rbexpr))
4503
+ end
3588
4504
  end
3589
4505
  end