polars-df 0.21.0 → 0.21.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/Cargo.lock +1 -1
- data/ext/polars/Cargo.toml +7 -1
- data/ext/polars/src/conversion/mod.rs +92 -4
- data/ext/polars/src/exceptions.rs +1 -0
- data/ext/polars/src/expr/array.rs +73 -4
- data/ext/polars/src/expr/binary.rs +26 -1
- data/ext/polars/src/expr/bitwise.rs +39 -0
- data/ext/polars/src/expr/categorical.rs +20 -0
- data/ext/polars/src/expr/datatype.rs +24 -1
- data/ext/polars/src/expr/datetime.rs +58 -0
- data/ext/polars/src/expr/general.rs +84 -5
- data/ext/polars/src/expr/list.rs +24 -0
- data/ext/polars/src/expr/meta.rs +11 -0
- data/ext/polars/src/expr/mod.rs +1 -0
- data/ext/polars/src/expr/name.rs +8 -0
- data/ext/polars/src/expr/rolling.rs +20 -0
- data/ext/polars/src/expr/string.rs +59 -0
- data/ext/polars/src/expr/struct.rs +9 -1
- data/ext/polars/src/functions/io.rs +19 -0
- data/ext/polars/src/functions/lazy.rs +4 -0
- data/ext/polars/src/lazyframe/general.rs +51 -0
- data/ext/polars/src/lib.rs +119 -10
- data/ext/polars/src/map/dataframe.rs +2 -2
- data/ext/polars/src/map/series.rs +1 -1
- data/ext/polars/src/series/aggregation.rs +44 -0
- data/ext/polars/src/series/general.rs +64 -4
- data/lib/polars/array_expr.rb +382 -3
- data/lib/polars/array_name_space.rb +281 -0
- data/lib/polars/binary_expr.rb +67 -0
- data/lib/polars/binary_name_space.rb +43 -0
- data/lib/polars/cat_expr.rb +224 -0
- data/lib/polars/cat_name_space.rb +138 -0
- data/lib/polars/config.rb +2 -2
- data/lib/polars/convert.rb +6 -6
- data/lib/polars/data_frame.rb +684 -19
- data/lib/polars/data_type_expr.rb +52 -0
- data/lib/polars/data_types.rb +14 -2
- data/lib/polars/date_time_expr.rb +251 -0
- data/lib/polars/date_time_name_space.rb +299 -0
- data/lib/polars/expr.rb +1213 -180
- data/lib/polars/functions/datatype.rb +21 -0
- data/lib/polars/functions/lazy.rb +13 -0
- data/lib/polars/io/csv.rb +1 -1
- data/lib/polars/io/json.rb +4 -4
- data/lib/polars/io/ndjson.rb +4 -4
- data/lib/polars/io/parquet.rb +27 -5
- data/lib/polars/lazy_frame.rb +936 -20
- data/lib/polars/list_expr.rb +196 -4
- data/lib/polars/list_name_space.rb +201 -4
- data/lib/polars/meta_expr.rb +64 -0
- data/lib/polars/name_expr.rb +36 -0
- data/lib/polars/schema.rb +79 -3
- data/lib/polars/selector.rb +72 -0
- data/lib/polars/selectors.rb +3 -3
- data/lib/polars/series.rb +1051 -54
- data/lib/polars/string_expr.rb +411 -6
- data/lib/polars/string_name_space.rb +722 -49
- data/lib/polars/struct_expr.rb +103 -0
- data/lib/polars/struct_name_space.rb +19 -1
- data/lib/polars/utils/various.rb +18 -1
- data/lib/polars/utils.rb +5 -1
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -0
- metadata +4 -1
data/lib/polars/lazy_frame.rb
CHANGED
@@ -288,6 +288,201 @@ module Polars
|
|
288
288
|
)
|
289
289
|
end
|
290
290
|
|
291
|
+
# Execute a SQL query against the LazyFrame.
|
292
|
+
#
|
293
|
+
# @note
|
294
|
+
# This functionality is considered **unstable**, although it is close to
|
295
|
+
# being considered stable. It may be changed at any point without it being
|
296
|
+
# considered a breaking change.
|
297
|
+
#
|
298
|
+
# @param query [String]
|
299
|
+
# SQL query to execute.
|
300
|
+
# @param table_name [String]
|
301
|
+
# Optionally provide an explicit name for the table that represents the
|
302
|
+
# calling frame (defaults to "self").
|
303
|
+
#
|
304
|
+
# @return [LazyFrame]
|
305
|
+
#
|
306
|
+
# @note
|
307
|
+
# * The calling frame is automatically registered as a table in the SQL context
|
308
|
+
# under the name "self". If you want access to the DataFrames and LazyFrames
|
309
|
+
# found in the current globals, use the top-level `Polars.sql`.
|
310
|
+
# * More control over registration and execution behaviour is available by
|
311
|
+
# using the `SQLContext` object.
|
312
|
+
#
|
313
|
+
# @example Query the LazyFrame using SQL:
|
314
|
+
# lf1 = Polars::LazyFrame.new({"a" => [1, 2, 3], "b" => [6, 7, 8], "c" => ["z", "y", "x"]})
|
315
|
+
# lf2 = Polars::LazyFrame.new({"a" => [3, 2, 1], "d" => [125, -654, 888]})
|
316
|
+
# lf1.sql("SELECT c, b FROM self WHERE a > 1").collect
|
317
|
+
# # =>
|
318
|
+
# # shape: (2, 2)
|
319
|
+
# # ┌─────┬─────┐
|
320
|
+
# # │ c ┆ b │
|
321
|
+
# # │ --- ┆ --- │
|
322
|
+
# # │ str ┆ i64 │
|
323
|
+
# # ╞═════╪═════╡
|
324
|
+
# # │ y ┆ 7 │
|
325
|
+
# # │ x ┆ 8 │
|
326
|
+
# # └─────┴─────┘
|
327
|
+
#
|
328
|
+
# @example Apply SQL transforms (aliasing "self" to "frame") then filter natively (you can freely mix SQL and native operations):
|
329
|
+
# lf1.sql(
|
330
|
+
# "
|
331
|
+
# SELECT
|
332
|
+
# a,
|
333
|
+
# (a % 2 == 0) AS a_is_even,
|
334
|
+
# (b::float4 / 2) AS \"b/2\",
|
335
|
+
# CONCAT_WS(':', c, c, c) AS c_c_c
|
336
|
+
# FROM frame
|
337
|
+
# ORDER BY a
|
338
|
+
# ",
|
339
|
+
# table_name: "frame",
|
340
|
+
# ).filter(~Polars.col("c_c_c").str.starts_with("x")).collect
|
341
|
+
# # =>
|
342
|
+
# # shape: (2, 4)
|
343
|
+
# # ┌─────┬───────────┬─────┬───────┐
|
344
|
+
# # │ a ┆ a_is_even ┆ b/2 ┆ c_c_c │
|
345
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
346
|
+
# # │ i64 ┆ bool ┆ f32 ┆ str │
|
347
|
+
# # ╞═════╪═══════════╪═════╪═══════╡
|
348
|
+
# # │ 1 ┆ false ┆ 3.0 ┆ z:z:z │
|
349
|
+
# # │ 2 ┆ true ┆ 3.5 ┆ y:y:y │
|
350
|
+
# # └─────┴───────────┴─────┴───────┘
|
351
|
+
def sql(query, table_name: "self")
|
352
|
+
ctx = Polars::SQLContext.new
|
353
|
+
name = table_name || "self"
|
354
|
+
ctx.register(name, self)
|
355
|
+
ctx.execute(query)
|
356
|
+
end
|
357
|
+
|
358
|
+
# Return the `k` largest rows.
|
359
|
+
#
|
360
|
+
# Non-null elements are always preferred over null elements, regardless of
|
361
|
+
# the value of `reverse`. The output is not guaranteed to be in any
|
362
|
+
# particular order, call `sort` after this function if you wish the
|
363
|
+
# output to be sorted.
|
364
|
+
#
|
365
|
+
# @param k [Integer]
|
366
|
+
# Number of rows to return.
|
367
|
+
# @param by [Object]
|
368
|
+
# Column(s) used to determine the top rows.
|
369
|
+
# Accepts expression input. Strings are parsed as column names.
|
370
|
+
# @param reverse [Object]
|
371
|
+
# Consider the `k` smallest elements of the `by` column(s) (instead of the `k`
|
372
|
+
# largest). This can be specified per column by passing a sequence of
|
373
|
+
# booleans.
|
374
|
+
#
|
375
|
+
# @return [LazyFrame]
|
376
|
+
#
|
377
|
+
# @example Get the rows which contain the 4 largest values in column b.
|
378
|
+
# lf = Polars::LazyFrame.new(
|
379
|
+
# {
|
380
|
+
# "a" => ["a", "b", "a", "b", "b", "c"],
|
381
|
+
# "b" => [2, 1, 1, 3, 2, 1]
|
382
|
+
# }
|
383
|
+
# )
|
384
|
+
# lf.top_k(4, by: "b").collect
|
385
|
+
# # =>
|
386
|
+
# # shape: (4, 2)
|
387
|
+
# # ┌─────┬─────┐
|
388
|
+
# # │ a ┆ b │
|
389
|
+
# # │ --- ┆ --- │
|
390
|
+
# # │ str ┆ i64 │
|
391
|
+
# # ╞═════╪═════╡
|
392
|
+
# # │ b ┆ 3 │
|
393
|
+
# # │ a ┆ 2 │
|
394
|
+
# # │ b ┆ 2 │
|
395
|
+
# # │ b ┆ 1 │
|
396
|
+
# # └─────┴─────┘
|
397
|
+
#
|
398
|
+
# @example Get the rows which contain the 4 largest values when sorting on column b and a.
|
399
|
+
# lf.top_k(4, by: ["b", "a"]).collect
|
400
|
+
# # =>
|
401
|
+
# # shape: (4, 2)
|
402
|
+
# # ┌─────┬─────┐
|
403
|
+
# # │ a ┆ b │
|
404
|
+
# # │ --- ┆ --- │
|
405
|
+
# # │ str ┆ i64 │
|
406
|
+
# # ╞═════╪═════╡
|
407
|
+
# # │ b ┆ 3 │
|
408
|
+
# # │ b ┆ 2 │
|
409
|
+
# # │ a ┆ 2 │
|
410
|
+
# # │ c ┆ 1 │
|
411
|
+
# # └─────┴─────┘
|
412
|
+
def top_k(
|
413
|
+
k,
|
414
|
+
by:,
|
415
|
+
reverse: false
|
416
|
+
)
|
417
|
+
by = Utils.parse_into_list_of_expressions(by)
|
418
|
+
reverse = Utils.extend_bool(reverse, by.length, "reverse", "by")
|
419
|
+
_from_rbldf(_ldf.top_k(k, by, reverse))
|
420
|
+
end
|
421
|
+
|
422
|
+
# Return the `k` smallest rows.
|
423
|
+
#
|
424
|
+
# Non-null elements are always preferred over null elements, regardless of
|
425
|
+
# the value of `reverse`. The output is not guaranteed to be in any
|
426
|
+
# particular order, call `sort` after this function if you wish the
|
427
|
+
# output to be sorted.
|
428
|
+
#
|
429
|
+
# @param k [Integer]
|
430
|
+
# Number of rows to return.
|
431
|
+
# @param by [Object]
|
432
|
+
# Column(s) used to determine the bottom rows.
|
433
|
+
# Accepts expression input. Strings are parsed as column names.
|
434
|
+
# @param reverse [Object]
|
435
|
+
# Consider the `k` largest elements of the `by` column(s) (instead of the `k`
|
436
|
+
# smallest). This can be specified per column by passing a sequence of
|
437
|
+
# booleans.
|
438
|
+
#
|
439
|
+
# @return [LazyFrame]
|
440
|
+
#
|
441
|
+
# @example Get the rows which contain the 4 smallest values in column b.
|
442
|
+
# lf = Polars::LazyFrame.new(
|
443
|
+
# {
|
444
|
+
# "a" => ["a", "b", "a", "b", "b", "c"],
|
445
|
+
# "b" => [2, 1, 1, 3, 2, 1]
|
446
|
+
# }
|
447
|
+
# )
|
448
|
+
# lf.bottom_k(4, by: "b").collect
|
449
|
+
# # =>
|
450
|
+
# # shape: (4, 2)
|
451
|
+
# # ┌─────┬─────┐
|
452
|
+
# # │ a ┆ b │
|
453
|
+
# # │ --- ┆ --- │
|
454
|
+
# # │ str ┆ i64 │
|
455
|
+
# # ╞═════╪═════╡
|
456
|
+
# # │ b ┆ 1 │
|
457
|
+
# # │ a ┆ 1 │
|
458
|
+
# # │ c ┆ 1 │
|
459
|
+
# # │ a ┆ 2 │
|
460
|
+
# # └─────┴─────┘
|
461
|
+
#
|
462
|
+
# @example Get the rows which contain the 4 smallest values when sorting on column a and b.
|
463
|
+
# lf.bottom_k(4, by: ["a", "b"]).collect
|
464
|
+
# # =>
|
465
|
+
# # shape: (4, 2)
|
466
|
+
# # ┌─────┬─────┐
|
467
|
+
# # │ a ┆ b │
|
468
|
+
# # │ --- ┆ --- │
|
469
|
+
# # │ str ┆ i64 │
|
470
|
+
# # ╞═════╪═════╡
|
471
|
+
# # │ a ┆ 1 │
|
472
|
+
# # │ a ┆ 2 │
|
473
|
+
# # │ b ┆ 1 │
|
474
|
+
# # │ b ┆ 2 │
|
475
|
+
# # └─────┴─────┘
|
476
|
+
def bottom_k(
|
477
|
+
k,
|
478
|
+
by:,
|
479
|
+
reverse: false
|
480
|
+
)
|
481
|
+
by = Utils.parse_into_list_of_expressions(by)
|
482
|
+
reverse = Utils.extend_bool(reverse, by.length, "reverse", "by")
|
483
|
+
_from_rbldf(_ldf.bottom_k(k, by, reverse))
|
484
|
+
end
|
485
|
+
|
291
486
|
# def profile
|
292
487
|
# end
|
293
488
|
|
@@ -379,6 +574,41 @@ module Polars
|
|
379
574
|
Utils.wrap_df(ldf.collect)
|
380
575
|
end
|
381
576
|
|
577
|
+
# Resolve the schema of this LazyFrame.
|
578
|
+
#
|
579
|
+
# @return [Schema]
|
580
|
+
#
|
581
|
+
# @example Determine the schema.
|
582
|
+
# lf = Polars::LazyFrame.new(
|
583
|
+
# {
|
584
|
+
# "foo" => [1, 2, 3],
|
585
|
+
# "bar" => [6.0, 7.0, 8.0],
|
586
|
+
# "ham" => ["a", "b", "c"]
|
587
|
+
# }
|
588
|
+
# )
|
589
|
+
# lf.collect_schema
|
590
|
+
# # => Polars::Schema({"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String})
|
591
|
+
#
|
592
|
+
# @example Access various properties of the schema.
|
593
|
+
# schema = lf.collect_schema
|
594
|
+
# schema["bar"]
|
595
|
+
# # => Polars::Float64
|
596
|
+
#
|
597
|
+
# @example
|
598
|
+
# schema.names
|
599
|
+
# # => ["foo", "bar", "ham"]
|
600
|
+
#
|
601
|
+
# @example
|
602
|
+
# schema.dtypes
|
603
|
+
# # => [Polars::Int64, Polars::Float64, Polars::String]
|
604
|
+
#
|
605
|
+
# @example
|
606
|
+
# schema.length
|
607
|
+
# # => 3
|
608
|
+
def collect_schema
|
609
|
+
Schema.new(_ldf.collect_schema, check_dtypes: false)
|
610
|
+
end
|
611
|
+
|
382
612
|
# Persists a LazyFrame at the provided path.
|
383
613
|
#
|
384
614
|
# This allows streaming results that are larger than RAM to be written to disk.
|
@@ -1147,6 +1377,140 @@ module Polars
|
|
1147
1377
|
)
|
1148
1378
|
end
|
1149
1379
|
|
1380
|
+
# Remove rows, dropping those that match the given predicate expression(s).
|
1381
|
+
#
|
1382
|
+
# The original order of the remaining rows is preserved.
|
1383
|
+
#
|
1384
|
+
# Rows where the filter predicate does not evaluate to true are retained
|
1385
|
+
# (this includes rows where the predicate evaluates as `null`).
|
1386
|
+
#
|
1387
|
+
# @param predicates [Array]
|
1388
|
+
# Expression that evaluates to a boolean Series.
|
1389
|
+
# @param constraints [Hash]
|
1390
|
+
# Column filters; use `name: value` to filter columns using the supplied
|
1391
|
+
# value. Each constraint behaves the same as `Polars.col(name).eq(value)`,
|
1392
|
+
# and is implicitly joined with the other filter conditions using `&`.
|
1393
|
+
#
|
1394
|
+
# @return [LazyFrame]
|
1395
|
+
#
|
1396
|
+
# @example Remove rows matching a condition:
|
1397
|
+
# lf = Polars::LazyFrame.new(
|
1398
|
+
# {
|
1399
|
+
# "foo" => [2, 3, nil, 4, 0],
|
1400
|
+
# "bar" => [5, 6, nil, nil, 0],
|
1401
|
+
# "ham" => ["a", "b", nil, "c", "d"]
|
1402
|
+
# }
|
1403
|
+
# )
|
1404
|
+
# lf.remove(
|
1405
|
+
# Polars.col("bar") >= 5
|
1406
|
+
# ).collect
|
1407
|
+
# # =>
|
1408
|
+
# # shape: (3, 3)
|
1409
|
+
# # ┌──────┬──────┬──────┐
|
1410
|
+
# # │ foo ┆ bar ┆ ham │
|
1411
|
+
# # │ --- ┆ --- ┆ --- │
|
1412
|
+
# # │ i64 ┆ i64 ┆ str │
|
1413
|
+
# # ╞══════╪══════╪══════╡
|
1414
|
+
# # │ null ┆ null ┆ null │
|
1415
|
+
# # │ 4 ┆ null ┆ c │
|
1416
|
+
# # │ 0 ┆ 0 ┆ d │
|
1417
|
+
# # └──────┴──────┴──────┘
|
1418
|
+
#
|
1419
|
+
# @example Discard rows based on multiple conditions, combined with and/or operators:
|
1420
|
+
# lf.remove(
|
1421
|
+
# (Polars.col("foo") >= 0) & (Polars.col("bar") >= 0)
|
1422
|
+
# ).collect
|
1423
|
+
# # =>
|
1424
|
+
# # shape: (2, 3)
|
1425
|
+
# # ┌──────┬──────┬──────┐
|
1426
|
+
# # │ foo ┆ bar ┆ ham │
|
1427
|
+
# # │ --- ┆ --- ┆ --- │
|
1428
|
+
# # │ i64 ┆ i64 ┆ str │
|
1429
|
+
# # ╞══════╪══════╪══════╡
|
1430
|
+
# # │ null ┆ null ┆ null │
|
1431
|
+
# # │ 4 ┆ null ┆ c │
|
1432
|
+
# # └──────┴──────┴──────┘
|
1433
|
+
#
|
1434
|
+
# @example
|
1435
|
+
# lf.remove(
|
1436
|
+
# (Polars.col("foo") >= 0) | (Polars.col("bar") >= 0)
|
1437
|
+
# ).collect
|
1438
|
+
# # =>
|
1439
|
+
# # shape: (1, 3)
|
1440
|
+
# # ┌──────┬──────┬──────┐
|
1441
|
+
# # │ foo ┆ bar ┆ ham │
|
1442
|
+
# # │ --- ┆ --- ┆ --- │
|
1443
|
+
# # │ i64 ┆ i64 ┆ str │
|
1444
|
+
# # ╞══════╪══════╪══════╡
|
1445
|
+
# # │ null ┆ null ┆ null │
|
1446
|
+
# # └──────┴──────┴──────┘
|
1447
|
+
#
|
1448
|
+
# @example Provide multiple constraints using `*args` syntax:
|
1449
|
+
# lf.remove(
|
1450
|
+
# Polars.col("ham").is_not_null,
|
1451
|
+
# Polars.col("bar") >= 0
|
1452
|
+
# ).collect
|
1453
|
+
# # =>
|
1454
|
+
# # shape: (2, 3)
|
1455
|
+
# # ┌──────┬──────┬──────┐
|
1456
|
+
# # │ foo ┆ bar ┆ ham │
|
1457
|
+
# # │ --- ┆ --- ┆ --- │
|
1458
|
+
# # │ i64 ┆ i64 ┆ str │
|
1459
|
+
# # ╞══════╪══════╪══════╡
|
1460
|
+
# # │ null ┆ null ┆ null │
|
1461
|
+
# # │ 4 ┆ null ┆ c │
|
1462
|
+
# # └──────┴──────┴──────┘
|
1463
|
+
#
|
1464
|
+
# @example Provide constraint(s) using `**kwargs` syntax:
|
1465
|
+
# lf.remove(foo: 0, bar: 0).collect
|
1466
|
+
# # =>
|
1467
|
+
# # shape: (4, 3)
|
1468
|
+
# # ┌──────┬──────┬──────┐
|
1469
|
+
# # │ foo ┆ bar ┆ ham │
|
1470
|
+
# # │ --- ┆ --- ┆ --- │
|
1471
|
+
# # │ i64 ┆ i64 ┆ str │
|
1472
|
+
# # ╞══════╪══════╪══════╡
|
1473
|
+
# # │ 2 ┆ 5 ┆ a │
|
1474
|
+
# # │ 3 ┆ 6 ┆ b │
|
1475
|
+
# # │ null ┆ null ┆ null │
|
1476
|
+
# # │ 4 ┆ null ┆ c │
|
1477
|
+
# # └──────┴──────┴──────┘
|
1478
|
+
#
|
1479
|
+
# @example Remove rows by comparing two columns against each other; in this case, we remove rows where the two columns are not equal (using `ne_missing` to ensure that null values compare equal):
|
1480
|
+
# lf.remove(
|
1481
|
+
# Polars.col("foo").ne_missing(Polars.col("bar"))
|
1482
|
+
# ).collect
|
1483
|
+
# # =>
|
1484
|
+
# # shape: (2, 3)
|
1485
|
+
# # ┌──────┬──────┬──────┐
|
1486
|
+
# # │ foo ┆ bar ┆ ham │
|
1487
|
+
# # │ --- ┆ --- ┆ --- │
|
1488
|
+
# # │ i64 ┆ i64 ┆ str │
|
1489
|
+
# # ╞══════╪══════╪══════╡
|
1490
|
+
# # │ null ┆ null ┆ null │
|
1491
|
+
# # │ 0 ┆ 0 ┆ d │
|
1492
|
+
# # └──────┴──────┴──────┘
|
1493
|
+
def remove(
|
1494
|
+
*predicates,
|
1495
|
+
**constraints
|
1496
|
+
)
|
1497
|
+
if constraints.empty?
|
1498
|
+
# early-exit conditions (exclude/include all rows)
|
1499
|
+
if predicates.empty? || (predicates.length == 1 && predicates[0].is_a?(TrueClass))
|
1500
|
+
return clear
|
1501
|
+
end
|
1502
|
+
if predicates.length == 1 && predicates[0].is_a?(FalseClass)
|
1503
|
+
return dup
|
1504
|
+
end
|
1505
|
+
end
|
1506
|
+
|
1507
|
+
_filter(
|
1508
|
+
predicates: predicates,
|
1509
|
+
constraints: constraints,
|
1510
|
+
invert: true
|
1511
|
+
)
|
1512
|
+
end
|
1513
|
+
|
1150
1514
|
# Select columns from this DataFrame.
|
1151
1515
|
#
|
1152
1516
|
# @param exprs [Array]
|
@@ -1244,6 +1608,29 @@ module Polars
|
|
1244
1608
|
_from_rbldf(_ldf.select(rbexprs))
|
1245
1609
|
end
|
1246
1610
|
|
1611
|
+
# Select columns from this LazyFrame.
|
1612
|
+
#
|
1613
|
+
# This will run all expressions sequentially instead of in parallel.
|
1614
|
+
# Use this when the work per expression is cheap.
|
1615
|
+
#
|
1616
|
+
# @param exprs [Array]
|
1617
|
+
# Column(s) to select, specified as positional arguments.
|
1618
|
+
# Accepts expression input. Strings are parsed as column names,
|
1619
|
+
# other non-expression inputs are parsed as literals.
|
1620
|
+
# @param named_exprs [Hash]
|
1621
|
+
# Additional columns to select, specified as keyword arguments.
|
1622
|
+
# The columns will be renamed to the keyword used.
|
1623
|
+
#
|
1624
|
+
# @return [LazyFrame]
|
1625
|
+
def select_seq(*exprs, **named_exprs)
|
1626
|
+
structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", 0).to_i != 0
|
1627
|
+
|
1628
|
+
rbexprs = Utils.parse_into_list_of_expressions(
|
1629
|
+
*exprs, **named_exprs, __structify: structify
|
1630
|
+
)
|
1631
|
+
_from_rbldf(_ldf.select_seq(rbexprs))
|
1632
|
+
end
|
1633
|
+
|
1247
1634
|
# Start a group by operation.
|
1248
1635
|
#
|
1249
1636
|
# @param by [Array]
|
@@ -1440,9 +1827,9 @@ module Polars
|
|
1440
1827
|
# @param every [Object]
|
1441
1828
|
# Interval of the window.
|
1442
1829
|
# @param period [Object]
|
1443
|
-
# Length of the window, if
|
1830
|
+
# Length of the window, if nil it is equal to 'every'.
|
1444
1831
|
# @param offset [Object]
|
1445
|
-
# Offset of the window if
|
1832
|
+
# Offset of the window; if nil and period is nil, it will be equal to negative
|
1446
1833
|
# `every`.
|
1447
1834
|
# @param truncate [Boolean]
|
1448
1835
|
# Truncate the time value to the window lower bound.
|
@@ -1714,7 +2101,7 @@ module Polars
|
|
1714
2101
|
# Join column of the right DataFrame.
|
1715
2102
|
# @param on [String]
|
1716
2103
|
# Join column of both DataFrames. If set, `left_on` and `right_on` should be
|
1717
|
-
#
|
2104
|
+
# nil.
|
1718
2105
|
# @param by_left [Object]
|
1719
2106
|
# Join on these columns before doing asof join.
|
1720
2107
|
# @param by_right [Object]
|
@@ -2039,7 +2426,7 @@ module Polars
|
|
2039
2426
|
# Join column of the right DataFrame.
|
2040
2427
|
# @param on [Object]
|
2041
2428
|
# Join column of both DataFrames. If set, `left_on` and `right_on` should be
|
2042
|
-
#
|
2429
|
+
# nil.
|
2043
2430
|
# @param how ["inner", "left", "full", "semi", "anti", "cross"]
|
2044
2431
|
# Join strategy.
|
2045
2432
|
# @param suffix [String]
|
@@ -2234,6 +2621,103 @@ module Polars
|
|
2234
2621
|
)
|
2235
2622
|
end
|
2236
2623
|
|
2624
|
+
# Perform a join based on one or multiple (in)equality predicates.
|
2625
|
+
#
|
2626
|
+
# This performs an inner join, so only rows where all predicates are true
|
2627
|
+
# are included in the result, and a row from either DataFrame may be included
|
2628
|
+
# multiple times in the result.
|
2629
|
+
#
|
2630
|
+
# @note
|
2631
|
+
# The row order of the input DataFrames is not preserved.
|
2632
|
+
#
|
2633
|
+
# @note
|
2634
|
+
# This functionality is experimental. It may be
|
2635
|
+
# changed at any point without it being considered a breaking change.
|
2636
|
+
#
|
2637
|
+
# @param other [Object]
|
2638
|
+
# DataFrame to join with.
|
2639
|
+
# @param predicates [Object]
|
2640
|
+
# (In)Equality condition to join the two tables on.
|
2641
|
+
# When a column name occurs in both tables, the proper suffix must
|
2642
|
+
# be applied in the predicate.
|
2643
|
+
# @param suffix [String]
|
2644
|
+
# Suffix to append to columns with a duplicate name.
|
2645
|
+
#
|
2646
|
+
# @return [LazyFrame]
|
2647
|
+
#
|
2648
|
+
# @example Join two lazyframes together based on two predicates which get AND-ed together.
|
2649
|
+
# east = Polars::LazyFrame.new(
|
2650
|
+
# {
|
2651
|
+
# "id" => [100, 101, 102],
|
2652
|
+
# "dur" => [120, 140, 160],
|
2653
|
+
# "rev" => [12, 14, 16],
|
2654
|
+
# "cores" => [2, 8, 4]
|
2655
|
+
# }
|
2656
|
+
# )
|
2657
|
+
# west = Polars::LazyFrame.new(
|
2658
|
+
# {
|
2659
|
+
# "t_id" => [404, 498, 676, 742],
|
2660
|
+
# "time" => [90, 130, 150, 170],
|
2661
|
+
# "cost" => [9, 13, 15, 16],
|
2662
|
+
# "cores" => [4, 2, 1, 4]
|
2663
|
+
# }
|
2664
|
+
# )
|
2665
|
+
# east.join_where(
|
2666
|
+
# west,
|
2667
|
+
# Polars.col("dur") < Polars.col("time"),
|
2668
|
+
# Polars.col("rev") < Polars.col("cost")
|
2669
|
+
# ).collect
|
2670
|
+
# # =>
|
2671
|
+
# # shape: (5, 8)
|
2672
|
+
# # ┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐
|
2673
|
+
# # │ id ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │
|
2674
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
2675
|
+
# # │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
|
2676
|
+
# # ╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡
|
2677
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 498 ┆ 130 ┆ 13 ┆ 2 │
|
2678
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
|
2679
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
|
2680
|
+
# # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
|
2681
|
+
# # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
|
2682
|
+
# # └─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
|
2683
|
+
#
|
2684
|
+
# @example To OR them together, use a single expression and the `|` operator.
|
2685
|
+
# east.join_where(
|
2686
|
+
# west,
|
2687
|
+
# (Polars.col("dur") < Polars.col("time")) | (Polars.col("rev") < Polars.col("cost"))
|
2688
|
+
# ).collect
|
2689
|
+
# # =>
|
2690
|
+
# # shape: (6, 8)
|
2691
|
+
# # ┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐
|
2692
|
+
# # │ id ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │
|
2693
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
2694
|
+
# # │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
|
2695
|
+
# # ╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡
|
2696
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 498 ┆ 130 ┆ 13 ┆ 2 │
|
2697
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
|
2698
|
+
# # │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
|
2699
|
+
# # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
|
2700
|
+
# # │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
|
2701
|
+
# # │ 102 ┆ 160 ┆ 16 ┆ 4 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
|
2702
|
+
# # └─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
|
2703
|
+
def join_where(
|
2704
|
+
other,
|
2705
|
+
*predicates,
|
2706
|
+
suffix: "_right"
|
2707
|
+
)
|
2708
|
+
Utils.require_same_type(self, other)
|
2709
|
+
|
2710
|
+
rbexprs = Utils.parse_into_list_of_expressions(*predicates)
|
2711
|
+
|
2712
|
+
_from_rbldf(
|
2713
|
+
_ldf.join_where(
|
2714
|
+
other._ldf,
|
2715
|
+
rbexprs,
|
2716
|
+
suffix
|
2717
|
+
)
|
2718
|
+
)
|
2719
|
+
end
|
2720
|
+
|
2237
2721
|
# Add or overwrite multiple columns in a DataFrame.
|
2238
2722
|
#
|
2239
2723
|
# @param exprs [Object]
|
@@ -2279,6 +2763,34 @@ module Polars
|
|
2279
2763
|
_from_rbldf(_ldf.with_columns(rbexprs))
|
2280
2764
|
end
|
2281
2765
|
|
2766
|
+
# Add columns to this LazyFrame.
|
2767
|
+
#
|
2768
|
+
# Added columns will replace existing columns with the same name.
|
2769
|
+
#
|
2770
|
+
# This will run all expressions sequentially instead of in parallel.
|
2771
|
+
# Use this when the work per expression is cheap.
|
2772
|
+
#
|
2773
|
+
# @param exprs [Array]
|
2774
|
+
# Column(s) to add, specified as positional arguments.
|
2775
|
+
# Accepts expression input. Strings are parsed as column names, other
|
2776
|
+
# non-expression inputs are parsed as literals.
|
2777
|
+
# @param named_exprs [Hash]
|
2778
|
+
# Additional columns to add, specified as keyword arguments.
|
2779
|
+
# The columns will be renamed to the keyword used.
|
2780
|
+
#
|
2781
|
+
# @return [LazyFrame]
|
2782
|
+
def with_columns_seq(
|
2783
|
+
*exprs,
|
2784
|
+
**named_exprs
|
2785
|
+
)
|
2786
|
+
structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", 0).to_i != 0
|
2787
|
+
|
2788
|
+
rbexprs = Utils.parse_into_list_of_expressions(
|
2789
|
+
*exprs, **named_exprs, __structify: structify
|
2790
|
+
)
|
2791
|
+
_from_rbldf(_ldf.with_columns_seq(rbexprs))
|
2792
|
+
end
|
2793
|
+
|
2282
2794
|
# Add an external context to the computation graph.
|
2283
2795
|
#
|
2284
2796
|
# This allows expressions to also access columns from DataFrames
|
@@ -2887,7 +3399,7 @@ module Polars
|
|
2887
3399
|
#
|
2888
3400
|
# @example
|
2889
3401
|
# s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]}).lazy
|
2890
|
-
# s.
|
3402
|
+
# s.gather_every(2).collect
|
2891
3403
|
# # =>
|
2892
3404
|
# # shape: (2, 2)
|
2893
3405
|
# # ┌─────┬─────┐
|
@@ -2898,9 +3410,10 @@ module Polars
|
|
2898
3410
|
# # │ 1 ┆ 5 │
|
2899
3411
|
# # │ 3 ┆ 7 │
|
2900
3412
|
# # └─────┴─────┘
|
2901
|
-
def
|
2902
|
-
select(F.col("*").
|
3413
|
+
def gather_every(n)
|
3414
|
+
select(F.col("*").gather_every(n))
|
2903
3415
|
end
|
3416
|
+
alias_method :take_every, :gather_every
|
2904
3417
|
|
2905
3418
|
# Fill null values using the specified value or strategy.
|
2906
3419
|
#
|
@@ -3177,6 +3690,32 @@ module Polars
|
|
3177
3690
|
_from_rbldf(_ldf.median)
|
3178
3691
|
end
|
3179
3692
|
|
3693
|
+
# Aggregate the columns in the LazyFrame as the sum of their null value count.
|
3694
|
+
#
|
3695
|
+
# @return [LazyFrame]
|
3696
|
+
#
|
3697
|
+
# @example
|
3698
|
+
# lf = Polars::LazyFrame.new(
|
3699
|
+
# {
|
3700
|
+
# "foo" => [1, nil, 3],
|
3701
|
+
# "bar" => [6, 7, nil],
|
3702
|
+
# "ham" => ["a", "b", "c"]
|
3703
|
+
# }
|
3704
|
+
# )
|
3705
|
+
# lf.null_count.collect
|
3706
|
+
# # =>
|
3707
|
+
# # shape: (1, 3)
|
3708
|
+
# # ┌─────┬─────┬─────┐
|
3709
|
+
# # │ foo ┆ bar ┆ ham │
|
3710
|
+
# # │ --- ┆ --- ┆ --- │
|
3711
|
+
# # │ u32 ┆ u32 ┆ u32 │
|
3712
|
+
# # ╞═════╪═════╪═════╡
|
3713
|
+
# # │ 1 ┆ 1 ┆ 0 │
|
3714
|
+
# # └─────┴─────┴─────┘
|
3715
|
+
def null_count
|
3716
|
+
_from_rbldf(_ldf.null_count)
|
3717
|
+
end
|
3718
|
+
|
3180
3719
|
# Aggregate the columns in the DataFrame to their quantile value.
|
3181
3720
|
#
|
3182
3721
|
# @param quantile [Float]
|
@@ -3307,37 +3846,103 @@ module Polars
|
|
3307
3846
|
_from_rbldf(_ldf.unique(maintain_order, selector_subset, keep))
|
3308
3847
|
end
|
3309
3848
|
|
3310
|
-
# Drop rows
|
3849
|
+
# Drop all rows that contain one or more NaN values.
|
3850
|
+
#
|
3851
|
+
# The original order of the remaining rows is preserved.
|
3311
3852
|
#
|
3312
3853
|
# @param subset [Object]
|
3313
|
-
#
|
3854
|
+
# Column name(s) for which NaN values are considered; if set to `nil`
|
3855
|
+
# (default), use all columns (note that only floating-point columns
|
3856
|
+
# can contain NaNs).
|
3314
3857
|
#
|
3315
3858
|
# @return [LazyFrame]
|
3316
3859
|
#
|
3317
3860
|
# @example
|
3318
|
-
#
|
3861
|
+
# lf = Polars::LazyFrame.new(
|
3862
|
+
# {
|
3863
|
+
# "foo" => [-20.5, Float::NAN, 80.0],
|
3864
|
+
# "bar" => [Float::NAN, 110.0, 25.5],
|
3865
|
+
# "ham" => ["xxx", "yyy", nil]
|
3866
|
+
# }
|
3867
|
+
# )
|
3868
|
+
# lf.drop_nans.collect
|
3869
|
+
# # =>
|
3870
|
+
# # shape: (1, 3)
|
3871
|
+
# # ┌──────┬──────┬──────┐
|
3872
|
+
# # │ foo ┆ bar ┆ ham │
|
3873
|
+
# # │ --- ┆ --- ┆ --- │
|
3874
|
+
# # │ f64 ┆ f64 ┆ str │
|
3875
|
+
# # ╞══════╪══════╪══════╡
|
3876
|
+
# # │ 80.0 ┆ 25.5 ┆ null │
|
3877
|
+
# # └──────┴──────┴──────┘
|
3878
|
+
#
|
3879
|
+
# @example
|
3880
|
+
# lf.drop_nans(subset: ["bar"]).collect
|
3881
|
+
# # =>
|
3882
|
+
# # shape: (2, 3)
|
3883
|
+
# # ┌──────┬───────┬──────┐
|
3884
|
+
# # │ foo ┆ bar ┆ ham │
|
3885
|
+
# # │ --- ┆ --- ┆ --- │
|
3886
|
+
# # │ f64 ┆ f64 ┆ str │
|
3887
|
+
# # ╞══════╪═══════╪══════╡
|
3888
|
+
# # │ NaN ┆ 110.0 ┆ yyy │
|
3889
|
+
# # │ 80.0 ┆ 25.5 ┆ null │
|
3890
|
+
# # └──────┴───────┴──────┘
|
3891
|
+
def drop_nans(subset: nil)
|
3892
|
+
selector_subset = nil
|
3893
|
+
if !subset.nil?
|
3894
|
+
selector_subset = Utils.parse_list_into_selector(subset)._rbselector
|
3895
|
+
end
|
3896
|
+
_from_rbldf(_ldf.drop_nans(selector_subset))
|
3897
|
+
end
|
3898
|
+
|
3899
|
+
# Drop all rows that contain one or more null values.
|
3900
|
+
#
|
3901
|
+
# The original order of the remaining rows is preserved.
|
3902
|
+
#
|
3903
|
+
# @param subset [Object]
|
3904
|
+
# Column name(s) for which null values are considered.
|
3905
|
+
# If set to `nil` (default), use all columns.
|
3906
|
+
#
|
3907
|
+
# @return [LazyFrame]
|
3908
|
+
#
|
3909
|
+
# @example
|
3910
|
+
# lf = Polars::LazyFrame.new(
|
3319
3911
|
# {
|
3320
3912
|
# "foo" => [1, 2, 3],
|
3321
3913
|
# "bar" => [6, nil, 8],
|
3322
|
-
# "ham" => ["a", "b",
|
3914
|
+
# "ham" => ["a", "b", nil]
|
3323
3915
|
# }
|
3324
3916
|
# )
|
3325
|
-
#
|
3917
|
+
# lf.drop_nulls.collect
|
3326
3918
|
# # =>
|
3327
|
-
# # shape: (
|
3919
|
+
# # shape: (1, 3)
|
3328
3920
|
# # ┌─────┬─────┬─────┐
|
3329
3921
|
# # │ foo ┆ bar ┆ ham │
|
3330
3922
|
# # │ --- ┆ --- ┆ --- │
|
3331
3923
|
# # │ i64 ┆ i64 ┆ str │
|
3332
3924
|
# # ╞═════╪═════╪═════╡
|
3333
3925
|
# # │ 1 ┆ 6 ┆ a │
|
3334
|
-
# # │ 3 ┆ 8 ┆ c │
|
3335
3926
|
# # └─────┴─────┴─────┘
|
3927
|
+
#
|
3928
|
+
# @example
|
3929
|
+
# lf.drop_nulls(subset: Polars.cs.integer).collect
|
3930
|
+
# # =>
|
3931
|
+
# # shape: (2, 3)
|
3932
|
+
# # ┌─────┬─────┬──────┐
|
3933
|
+
# # │ foo ┆ bar ┆ ham │
|
3934
|
+
# # │ --- ┆ --- ┆ --- │
|
3935
|
+
# # │ i64 ┆ i64 ┆ str │
|
3936
|
+
# # ╞═════╪═════╪══════╡
|
3937
|
+
# # │ 1 ┆ 6 ┆ a │
|
3938
|
+
# # │ 3 ┆ 8 ┆ null │
|
3939
|
+
# # └─────┴─────┴──────┘
|
3336
3940
|
def drop_nulls(subset: nil)
|
3337
|
-
|
3338
|
-
|
3941
|
+
selector_subset = nil
|
3942
|
+
if !subset.nil?
|
3943
|
+
selector_subset = Utils.parse_list_into_selector(subset)._rbselector
|
3339
3944
|
end
|
3340
|
-
_from_rbldf(_ldf.drop_nulls(
|
3945
|
+
_from_rbldf(_ldf.drop_nulls(selector_subset))
|
3341
3946
|
end
|
3342
3947
|
|
3343
3948
|
# Unpivot a DataFrame from wide to long format.
|
@@ -3571,9 +4176,261 @@ module Polars
|
|
3571
4176
|
with_columns(F.col(column).set_sorted(descending: descending))
|
3572
4177
|
end
|
3573
4178
|
|
3574
|
-
#
|
3575
|
-
#
|
3576
|
-
#
|
4179
|
+
# Update the values in this `LazyFrame` with the values in `other`.
|
4180
|
+
#
|
4181
|
+
# @note
|
4182
|
+
# This functionality is considered **unstable**. It may be changed
|
4183
|
+
# at any point without it being considered a breaking change.
|
4184
|
+
#
|
4185
|
+
# @param other [LazyFrame]
|
4186
|
+
# LazyFrame that will be used to update the values
|
4187
|
+
# @param on [Object]
|
4188
|
+
# Column names that will be joined on. If set to `nil` (default),
|
4189
|
+
# the implicit row index of each frame is used as a join key.
|
4190
|
+
# @param how ['left', 'inner', 'full']
|
4191
|
+
# * 'left' will keep all rows from the left table; rows may be duplicated
|
4192
|
+
# if multiple rows in the right frame match the left row's key.
|
4193
|
+
# * 'inner' keeps only those rows where the key exists in both frames.
|
4194
|
+
# * 'full' will update existing rows where the key matches while also
|
4195
|
+
# adding any new rows contained in the given frame.
|
4196
|
+
# @param left_on [Object]
|
4197
|
+
# Join column(s) of the left DataFrame.
|
4198
|
+
# @param right_on [Object]
|
4199
|
+
# Join column(s) of the right DataFrame.
|
4200
|
+
# @param include_nulls [Boolean]
|
4201
|
+
# Overwrite values in the left frame with null values from the right frame.
|
4202
|
+
# If set to `false` (default), null values in the right frame are ignored.
|
4203
|
+
# @param maintain_order ['none', 'left', 'right', 'left_right', 'right_left']
|
4204
|
+
# Which order of rows from the inputs to preserve. See `LazyFrame.join`
|
4205
|
+
# for details. Unlike `join` this function preserves the left order by
|
4206
|
+
# default.
|
4207
|
+
#
|
4208
|
+
# @return [LazyFrame]
|
4209
|
+
#
|
4210
|
+
# @note
|
4211
|
+
# This is syntactic sugar for a left/inner join that preserves the order
|
4212
|
+
# of the left `DataFrame` by default, with an optional coalesce when
|
4213
|
+
# `include_nulls: false`.
|
4214
|
+
#
|
4215
|
+
# @example Update `lf` values with the non-null values in `new_lf`, by row index:
|
4216
|
+
# lf = Polars::LazyFrame.new(
|
4217
|
+
# {
|
4218
|
+
# "A" => [1, 2, 3, 4],
|
4219
|
+
# "B" => [400, 500, 600, 700]
|
4220
|
+
# }
|
4221
|
+
# )
|
4222
|
+
# new_lf = Polars::LazyFrame.new(
|
4223
|
+
# {
|
4224
|
+
# "B" => [-66, nil, -99],
|
4225
|
+
# "C" => [5, 3, 1]
|
4226
|
+
# }
|
4227
|
+
# )
|
4228
|
+
# lf.update(new_lf).collect
|
4229
|
+
# # =>
|
4230
|
+
# # shape: (4, 2)
|
4231
|
+
# # ┌─────┬─────┐
|
4232
|
+
# # │ A ┆ B │
|
4233
|
+
# # │ --- ┆ --- │
|
4234
|
+
# # │ i64 ┆ i64 │
|
4235
|
+
# # ╞═════╪═════╡
|
4236
|
+
# # │ 1 ┆ -66 │
|
4237
|
+
# # │ 2 ┆ 500 │
|
4238
|
+
# # │ 3 ┆ -99 │
|
4239
|
+
# # │ 4 ┆ 700 │
|
4240
|
+
# # └─────┴─────┘
|
4241
|
+
#
|
4242
|
+
# @example Update `lf` values with the non-null values in `new_lf`, by row index, but only keeping those rows that are common to both frames:
|
4243
|
+
# lf.update(new_lf, how: "inner").collect
|
4244
|
+
# # =>
|
4245
|
+
# # shape: (3, 2)
|
4246
|
+
# # ┌─────┬─────┐
|
4247
|
+
# # │ A ┆ B │
|
4248
|
+
# # │ --- ┆ --- │
|
4249
|
+
# # │ i64 ┆ i64 │
|
4250
|
+
# # ╞═════╪═════╡
|
4251
|
+
# # │ 1 ┆ -66 │
|
4252
|
+
# # │ 2 ┆ 500 │
|
4253
|
+
# # │ 3 ┆ -99 │
|
4254
|
+
# # └─────┴─────┘
|
4255
|
+
#
|
4256
|
+
# @example Update `lf` values with the non-null values in `new_lf`, using a full outer join strategy that defines explicit join columns in each frame:
|
4257
|
+
# lf.update(new_lf, left_on: ["A"], right_on: ["C"], how: "full").collect
|
4258
|
+
# # =>
|
4259
|
+
# # shape: (5, 2)
|
4260
|
+
# # ┌─────┬─────┐
|
4261
|
+
# # │ A ┆ B │
|
4262
|
+
# # │ --- ┆ --- │
|
4263
|
+
# # │ i64 ┆ i64 │
|
4264
|
+
# # ╞═════╪═════╡
|
4265
|
+
# # │ 1 ┆ -99 │
|
4266
|
+
# # │ 2 ┆ 500 │
|
4267
|
+
# # │ 3 ┆ 600 │
|
4268
|
+
# # │ 4 ┆ 700 │
|
4269
|
+
# # │ 5 ┆ -66 │
|
4270
|
+
# # └─────┴─────┘
|
4271
|
+
#
|
4272
|
+
# @example Update `lf` values including null values in `new_lf`, using a full outer join strategy that defines explicit join columns in each frame:
|
4273
|
+
# lf.update(
|
4274
|
+
# new_lf, left_on: "A", right_on: "C", how: "full", include_nulls: true
|
4275
|
+
# ).collect
|
4276
|
+
# # =>
|
4277
|
+
# # shape: (5, 2)
|
4278
|
+
# # ┌─────┬──────┐
|
4279
|
+
# # │ A ┆ B │
|
4280
|
+
# # │ --- ┆ --- │
|
4281
|
+
# # │ i64 ┆ i64 │
|
4282
|
+
# # ╞═════╪══════╡
|
4283
|
+
# # │ 1 ┆ -99 │
|
4284
|
+
# # │ 2 ┆ 500 │
|
4285
|
+
# # │ 3 ┆ null │
|
4286
|
+
# # │ 4 ┆ 700 │
|
4287
|
+
# # │ 5 ┆ -66 │
|
4288
|
+
# # └─────┴──────┘
|
4289
|
+
def update(
  other,
  on: nil,
  how: "left",
  left_on: nil,
  right_on: nil,
  include_nulls: false,
  maintain_order: "left"
)
  Utils.require_same_type(self, other)

  # "outer" and "outer_coalesce" are accepted as aliases of "full"
  how = "full" if ["outer", "outer_coalesce"].include?(how)

  unless ["left", "inner", "full"].include?(how)
    msg = "`how` must be one of {'left', 'inner', 'full'}; found #{how.inspect}"
    raise ArgumentError, msg
  end

  slf = self
  row_index_used = false
  if on.nil?
    if left_on.nil? && right_on.nil?
      # no keys provided--use row index
      row_index_used = true
      row_index_name = "__POLARS_ROW_INDEX"
      slf = slf.with_row_index(name: row_index_name)
      other = other.with_row_index(name: row_index_name)
      left_on = right_on = [row_index_name]
    else
      # exactly one of left_on/right_on was given; report the missing side
      raise ArgumentError, "missing join columns for left frame" if left_on.nil?
      raise ArgumentError, "missing join columns for right frame" if right_on.nil?
    end
  else
    # move `on` into left_on/right_on to simplify the logic below
    left_on = right_on = on
  end

  # a single column name is treated as a one-element list
  left_on = [left_on] if left_on.is_a?(::String)
  right_on = [right_on] if right_on.is_a?(::String)

  # every join key must exist in the schema of its frame
  left_schema = slf.collect_schema
  left_on.each do |name|
    unless left_schema.include?(name)
      msg = "left join column #{name.inspect} not found"
      raise ArgumentError, msg
    end
  end
  right_schema = other.collect_schema
  right_on.each do |name|
    unless right_schema.include?(name)
      msg = "right join column #{name.inspect} not found"
      raise ArgumentError, msg
    end
  end

  # no need to join if *only* join columns are in other (inner/left update only)
  if how != "full" && right_schema.length == right_on.length
    return slf.drop(row_index_name) if row_index_used

    return slf
  end

  # only use non-idx right columns present in left frame
  right_other = Set.new(right_schema.to_h.keys).intersection(left_schema.to_h.keys) - Set.new(right_on)

  # When include_nulls is true, we need to distinguish records after the join that
  # were originally null in the right frame, as opposed to records that were null
  # because the key was missing from the right frame.
  # Add a validity column to track whether row was matched or not.
  if include_nulls
    validity = ["__POLARS_VALIDITY"]
    other = other.with_columns(F.lit(true).alias(validity[0]))
  else
    validity = []
  end

  # right-hand columns that clash with left-hand names get this suffix from the join
  tmp_name = "__POLARS_RIGHT"
  drop_columns = right_other.map { |name| "#{name}#{tmp_name}" } + validity
  result = (
    slf.join(
      other.select(*right_on, *right_other, *validity),
      left_on: left_on,
      right_on: right_on,
      how: how,
      suffix: tmp_name,
      coalesce: true,
      maintain_order: maintain_order
    )
    .with_columns(
      right_other.map do |name|
        (
          if include_nulls
            # use left value only when right value failed to join
            F.when(F.col(validity).is_null)
              .then(F.col(name))
              .otherwise(F.col("#{name}#{tmp_name}"))
          else
            # prefer the right value, falling back to the left one when it is null
            F.coalesce(["#{name}#{tmp_name}", F.col(name)])
          end
        ).alias(name)
      end
    )
    .drop(drop_columns)
  )
  # the synthetic row index was only needed as a join key
  result = result.drop(row_index_name) if row_index_used

  _from_rbldf(result._ldf)
end
|
4412
|
+
|
4413
|
+
# Return the number of non-null elements for each column.
|
4414
|
+
#
|
4415
|
+
# @return [LazyFrame]
|
4416
|
+
#
|
4417
|
+
# @example
|
4418
|
+
# lf = Polars::LazyFrame.new(
|
4419
|
+
# {"a" => [1, 2, 3, 4], "b" => [1, 2, 1, nil], "c" => [nil, nil, nil, nil]}
|
4420
|
+
# )
|
4421
|
+
# lf.count.collect
|
4422
|
+
# # =>
|
4423
|
+
# # shape: (1, 3)
|
4424
|
+
# # ┌─────┬─────┬─────┐
|
4425
|
+
# # │ a ┆ b ┆ c │
|
4426
|
+
# # │ --- ┆ --- ┆ --- │
|
4427
|
+
# # │ u32 ┆ u32 ┆ u32 │
|
4428
|
+
# # ╞═════╪═════╪═════╡
|
4429
|
+
# # │ 4 ┆ 3 ┆ 0 │
|
4430
|
+
# # └─────┴─────┴─────┘
|
4431
|
+
def count
  # Delegate to the native lazy frame, then wrap the result again.
  counted = _ldf.count
  _from_rbldf(counted)
end
|
3577
4434
|
|
3578
4435
|
private
|
3579
4436
|
|
@@ -3585,5 +4442,64 @@ module Polars
|
|
3585
4442
|
def _from_rbldf(rb_ldf)
  # Instance-level shortcut for the class-level constructor of the same name.
  klass = self.class
  klass._from_rbldf(rb_ldf)
end
|
4445
|
+
|
4446
|
+
# Shared implementation for keeping or removing rows by predicate.
#
# @param predicates [Array]
#   Candidate predicates. Literal `true`/`false` values are short-circuited,
#   boolean sequences are collected as masks, and everything else is parsed
#   into expressions.
# @param constraints [Hash]
#   Column name => value pairs; each becomes a `col(name).eq(value)` predicate.
# @param invert [Boolean]
#   When true the native `remove` is called instead of `filter`.
#
# @return [LazyFrame]
#
# @raise [TypeError]
#   If a predicate is invalid, or no predicate/constraint remains.
def _filter(
  predicates:,
  constraints:,
  invert: false
)
  all_predicates = []
  boolean_masks = []

  predicates.each do |p|
    # quick exit/skip conditions
    if (p.is_a?(FalseClass) && invert) || (p.is_a?(TrueClass) && !invert)
      next # ignore; doesn't filter/remove anything
    end
    if (p.is_a?(TrueClass) && invert) || (p.is_a?(FalseClass) && !invert)
      return clear # discard all rows
    end

    # note: identify masks separately from predicates
    if Utils.is_bool_sequence(p, include_series: true)
      boolean_masks << Polars::Series.new(p, dtype: Boolean)
      next
    end

    # a sequence must contain only expressions; a scalar must be an
    # expression or the name of an existing column
    is_seq = Utils.is_sequence(p)
    invalid =
      if is_seq
        p.any? { |x| !x.is_a?(Expr) }
      else
        !p.is_a?(Expr) && !(p.is_a?(::String) && collect_schema.include?(p))
      end
    if invalid
      err = p.is_a?(Series) ? "Series(…, dtype: #{p.dtype})" : p.inspect
      msg = "invalid predicate for `filter`: #{err}"
      raise TypeError, msg
    end

    all_predicates.concat(
      Utils.parse_into_list_of_expressions(p).map { |x| Utils.wrap_expr(x) }
    )
  end

  # unpack equality constraints from kwargs
  all_predicates.concat(
    constraints.map { |name, value| F.col(name).eq(value) }
  )
  if !(all_predicates.any? || boolean_masks.any?)
    msg = "at least one predicate or constraint must be provided"
    raise TypeError, msg
  end

  # if multiple predicates, combine as 'horizontal' expression
  # (an empty Array is truthy in Ruby, so test the length explicitly;
  # `first` is nil when only boolean masks were supplied)
  combined_predicate =
    if all_predicates.length > 1
      F.all_horizontal(*all_predicates)
    else
      all_predicates.first
    end

  # apply reduced boolean mask first, if applicable, then predicates
  if boolean_masks.any?
    # boolean masks are collected above but not applied yet
    raise Todo
  end

  if combined_predicate.nil?
    return _from_rbldf(_ldf)
  end

  filter_method = invert ? _ldf.method(:remove) : _ldf.method(:filter)
  _from_rbldf(filter_method.(combined_predicate._rbexpr))
end
|
3588
4504
|
end
|
3589
4505
|
end
|