polars-df 0.20.0-x64-mingw-ucrt → 0.21.1-x64-mingw-ucrt
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/Cargo.lock +192 -186
- data/LICENSE-THIRD-PARTY.txt +2153 -2532
- data/LICENSE.txt +1 -1
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/3.4/polars.so +0 -0
- data/lib/polars/array_expr.rb +382 -3
- data/lib/polars/array_name_space.rb +281 -0
- data/lib/polars/binary_expr.rb +67 -0
- data/lib/polars/binary_name_space.rb +43 -0
- data/lib/polars/cat_expr.rb +224 -0
- data/lib/polars/cat_name_space.rb +130 -32
- data/lib/polars/catalog/unity/catalog_info.rb +20 -0
- data/lib/polars/catalog/unity/column_info.rb +31 -0
- data/lib/polars/catalog/unity/namespace_info.rb +21 -0
- data/lib/polars/catalog/unity/table_info.rb +50 -0
- data/lib/polars/catalog.rb +448 -0
- data/lib/polars/config.rb +2 -2
- data/lib/polars/convert.rb +12 -2
- data/lib/polars/data_frame.rb +834 -48
- data/lib/polars/data_type_expr.rb +52 -0
- data/lib/polars/data_types.rb +61 -5
- data/lib/polars/date_time_expr.rb +251 -0
- data/lib/polars/date_time_name_space.rb +299 -0
- data/lib/polars/exceptions.rb +7 -2
- data/lib/polars/expr.rb +1247 -211
- data/lib/polars/functions/col.rb +6 -5
- data/lib/polars/functions/datatype.rb +21 -0
- data/lib/polars/functions/lazy.rb +127 -15
- data/lib/polars/functions/repeat.rb +4 -0
- data/lib/polars/io/csv.rb +19 -1
- data/lib/polars/io/json.rb +16 -0
- data/lib/polars/io/ndjson.rb +13 -0
- data/lib/polars/io/parquet.rb +70 -66
- data/lib/polars/io/scan_options.rb +47 -0
- data/lib/polars/lazy_frame.rb +1099 -95
- data/lib/polars/list_expr.rb +400 -11
- data/lib/polars/list_name_space.rb +321 -5
- data/lib/polars/meta_expr.rb +71 -22
- data/lib/polars/name_expr.rb +36 -0
- data/lib/polars/scan_cast_options.rb +64 -0
- data/lib/polars/schema.rb +84 -3
- data/lib/polars/selector.rb +210 -0
- data/lib/polars/selectors.rb +932 -203
- data/lib/polars/series.rb +1083 -63
- data/lib/polars/string_expr.rb +435 -9
- data/lib/polars/string_name_space.rb +729 -45
- data/lib/polars/struct_expr.rb +103 -0
- data/lib/polars/struct_name_space.rb +19 -1
- data/lib/polars/utils/parse.rb +40 -0
- data/lib/polars/utils/various.rb +18 -1
- data/lib/polars/utils.rb +9 -1
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +10 -0
- metadata +12 -2
data/lib/polars/string_expr.rb
CHANGED
@@ -63,6 +63,13 @@ module Polars
|
|
63
63
|
# in the target string.
|
64
64
|
# @param cache [Boolean]
|
65
65
|
# Use a cache of unique, converted datetimes to apply the conversion.
|
66
|
+
# @param ambiguous ['raise', 'earliest', 'latest', 'null']
|
67
|
+
# Determine how to deal with ambiguous datetimes:
|
68
|
+
#
|
69
|
+
# - `'raise'` (default): raise an error
|
70
|
+
# - `'earliest'`: use the earliest datetime
|
71
|
+
# - `'latest'`: use the latest datetime
|
72
|
+
# - `'null'`: set to null
|
66
73
|
#
|
67
74
|
# @return [Expr]
|
68
75
|
#
|
@@ -145,6 +152,8 @@ module Polars
|
|
145
152
|
# @param exact [Boolean]
|
146
153
|
# - If true, require an exact format match.
|
147
154
|
# - If false, allow the format to match anywhere in the target string.
|
155
|
+
# @param cache [Boolean]
|
156
|
+
# Use a cache of unique, converted dates to apply the datetime conversion.
|
148
157
|
# @param utc [Boolean]
|
149
158
|
# Parse timezone aware datetimes as UTC. This may be useful if you have data
|
150
159
|
# with mixed offsets.
|
@@ -359,6 +368,71 @@ module Polars
|
|
359
368
|
end
|
360
369
|
alias_method :concat, :join
|
361
370
|
|
371
|
+
# Returns string values with all regular expression meta characters escaped.
|
372
|
+
#
|
373
|
+
# @return [Expr]
|
374
|
+
#
|
375
|
+
# @example
|
376
|
+
# df = Polars::DataFrame.new({"text" => ["abc", "def", nil, "abc(\\w+)"]})
|
377
|
+
# df.with_columns(Polars.col("text").str.escape_regex.alias("escaped"))
|
378
|
+
# # =>
|
379
|
+
# # shape: (4, 2)
|
380
|
+
# # ┌──────────┬──────────────┐
|
381
|
+
# # │ text ┆ escaped │
|
382
|
+
# # │ --- ┆ --- │
|
383
|
+
# # │ str ┆ str │
|
384
|
+
# # ╞══════════╪══════════════╡
|
385
|
+
# # │ abc ┆ abc │
|
386
|
+
# # │ def ┆ def │
|
387
|
+
# # │ null ┆ null │
|
388
|
+
# # │ abc(\w+) ┆ abc\(\\w\+\) │
|
389
|
+
# # └──────────┴──────────────┘
|
390
|
+
def escape_regex
|
391
|
+
Utils.wrap_expr(_rbexpr.str_escape_regex)
|
392
|
+
end
|
393
|
+
|
394
|
+
# Returns the Unicode normal form of the string values.
|
395
|
+
#
|
396
|
+
# This uses the forms described in Unicode Standard Annex 15: <https://www.unicode.org/reports/tr15/>.
|
397
|
+
#
|
398
|
+
# @param form ['NFC', 'NFKC', 'NFD', 'NFKD']
|
399
|
+
# Unicode form to use.
|
400
|
+
#
|
401
|
+
# @return [Expr]
|
402
|
+
#
|
403
|
+
# @example
|
404
|
+
# df = Polars::DataFrame.new({"text" => ["01²", "KADOKAWA"]})
|
405
|
+
# new = df.with_columns(
|
406
|
+
# nfc: Polars.col("text").str.normalize("NFC"),
|
407
|
+
# nfkc: Polars.col("text").str.normalize("NFKC")
|
408
|
+
# )
|
409
|
+
# # =>
|
410
|
+
# # shape: (2, 3)
|
411
|
+
# # ┌──────────────────┬──────────────────┬──────────┐
|
412
|
+
# # │ text ┆ nfc ┆ nfkc │
|
413
|
+
# # │ --- ┆ --- ┆ --- │
|
414
|
+
# # │ str ┆ str ┆ str │
|
415
|
+
# # ╞══════════════════╪══════════════════╪══════════╡
|
416
|
+
# # │ 01² ┆ 01² ┆ 012 │
|
417
|
+
# # │ KADOKAWA ┆ KADOKAWA ┆ KADOKAWA │
|
418
|
+
# # └──────────────────┴──────────────────┴──────────┘
|
419
|
+
#
|
420
|
+
# @example
|
421
|
+
# new.select(Polars.all.str.len_bytes)
|
422
|
+
# # =>
|
423
|
+
# # shape: (2, 3)
|
424
|
+
# # ┌──────┬─────┬──────┐
|
425
|
+
# # │ text ┆ nfc ┆ nfkc │
|
426
|
+
# # │ --- ┆ --- ┆ --- │
|
427
|
+
# # │ u32 ┆ u32 ┆ u32 │
|
428
|
+
# # ╞══════╪═════╪══════╡
|
429
|
+
# # │ 4 ┆ 4 ┆ 3 │
|
430
|
+
# # │ 24 ┆ 24 ┆ 8 │
|
431
|
+
# # └──────┴─────┴──────┘
|
432
|
+
def normalize(form = "NFC")
|
433
|
+
Utils.wrap_expr(_rbexpr.str_normalize(form))
|
434
|
+
end
|
435
|
+
|
362
436
|
# Transform to uppercase variant.
|
363
437
|
#
|
364
438
|
# @return [Expr]
|
@@ -590,6 +664,7 @@ module Polars
|
|
590
664
|
# # │ null ┆ null │
|
591
665
|
# # └──────────────┴──────────────┘
|
592
666
|
def pad_start(length, fill_char = " ")
|
667
|
+
length = Utils.parse_into_expression(length)
|
593
668
|
Utils.wrap_expr(_rbexpr.str_pad_start(length, fill_char))
|
594
669
|
end
|
595
670
|
alias_method :rjust, :pad_start
|
@@ -620,6 +695,7 @@ module Polars
|
|
620
695
|
# # │ null ┆ null │
|
621
696
|
# # └──────────────┴──────────────┘
|
622
697
|
def pad_end(length, fill_char = " ")
|
698
|
+
length = Utils.parse_into_expression(length)
|
623
699
|
Utils.wrap_expr(_rbexpr.str_pad_end(length, fill_char))
|
624
700
|
end
|
625
701
|
alias_method :ljust, :pad_end
|
@@ -664,6 +740,9 @@ module Polars
|
|
664
740
|
# A valid regex pattern.
|
665
741
|
# @param literal [Boolean]
|
666
742
|
# Treat pattern as a literal string.
|
743
|
+
# @param strict [Boolean]
|
744
|
+
# Raise an error if the underlying pattern is not a valid regex,
|
745
|
+
# otherwise mask out with a null value.
|
667
746
|
#
|
668
747
|
# @return [Expr]
|
669
748
|
#
|
@@ -693,6 +772,68 @@ module Polars
|
|
693
772
|
Utils.wrap_expr(_rbexpr.str_contains(pattern, literal, strict))
|
694
773
|
end
|
695
774
|
|
775
|
+
# Return the bytes offset of the first substring matching a pattern.
|
776
|
+
#
|
777
|
+
# If the pattern is not found, returns null.
|
778
|
+
#
|
779
|
+
# @param pattern [String]
|
780
|
+
# A valid regular expression pattern, compatible with the [regex crate](https://docs.rs/regex/latest/regex/).
|
781
|
+
# @param literal [Boolean]
|
782
|
+
# Treat `pattern` as a literal string, not as a regular expression.
|
783
|
+
# @param strict [Boolean]
|
784
|
+
# Raise an error if the underlying pattern is not a valid regex,
|
785
|
+
# otherwise mask out with a null value.
|
786
|
+
#
|
787
|
+
# @return [Expr]
|
788
|
+
#
|
789
|
+
# @note
|
790
|
+
# To modify regular expression behaviour (such as case-sensitivity) with
|
791
|
+
# flags, use the inline `(?iLmsuxU)` syntax.
|
792
|
+
#
|
793
|
+
# @example Find the index of the first substring matching a regex or literal pattern:
|
794
|
+
# df = Polars::DataFrame.new(
|
795
|
+
# {
|
796
|
+
# "txt" => ["Crab", "Lobster", nil, "Crustacean"],
|
797
|
+
# "pat" => ["a[bc]", "b.t", "[aeiuo]", "(?i)A[BC]"]
|
798
|
+
# }
|
799
|
+
# )
|
800
|
+
# df.select(
|
801
|
+
# Polars.col("txt"),
|
802
|
+
# Polars.col("txt").str.find("a|e").alias("a|e (regex)"),
|
803
|
+
# Polars.col("txt").str.find("e", literal: true).alias("e (lit)"),
|
804
|
+
# )
|
805
|
+
# # =>
|
806
|
+
# # shape: (4, 3)
|
807
|
+
# # ┌────────────┬─────────────┬─────────┐
|
808
|
+
# # │ txt ┆ a|e (regex) ┆ e (lit) │
|
809
|
+
# # │ --- ┆ --- ┆ --- │
|
810
|
+
# # │ str ┆ u32 ┆ u32 │
|
811
|
+
# # ╞════════════╪═════════════╪═════════╡
|
812
|
+
# # │ Crab ┆ 2 ┆ null │
|
813
|
+
# # │ Lobster ┆ 5 ┆ 5 │
|
814
|
+
# # │ null ┆ null ┆ null │
|
815
|
+
# # │ Crustacean ┆ 5 ┆ 7 │
|
816
|
+
# # └────────────┴─────────────┴─────────┘
|
817
|
+
#
|
818
|
+
# @example Match against a pattern found in another column (or expression):
|
819
|
+
# df.with_columns(Polars.col("txt").str.find(Polars.col("pat")).alias("find_pat"))
|
820
|
+
# # =>
|
821
|
+
# # shape: (4, 3)
|
822
|
+
# # ┌────────────┬───────────┬──────────┐
|
823
|
+
# # │ txt ┆ pat ┆ find_pat │
|
824
|
+
# # │ --- ┆ --- ┆ --- │
|
825
|
+
# # │ str ┆ str ┆ u32 │
|
826
|
+
# # ╞════════════╪═══════════╪══════════╡
|
827
|
+
# # │ Crab ┆ a[bc] ┆ 2 │
|
828
|
+
# # │ Lobster ┆ b.t ┆ 2 │
|
829
|
+
# # │ null ┆ [aeiuo] ┆ null │
|
830
|
+
# # │ Crustacean ┆ (?i)A[BC] ┆ 5 │
|
831
|
+
# # └────────────┴───────────┴──────────┘
|
832
|
+
def find(pattern, literal: false, strict: true)
|
833
|
+
pattern = Utils.parse_into_expression(pattern, str_as_lit: true)
|
834
|
+
Utils.wrap_expr(_rbexpr.str_find(pattern, literal, strict))
|
835
|
+
end
|
836
|
+
|
696
837
|
# Check if string values end with a substring.
|
697
838
|
#
|
698
839
|
# @param sub [String]
|
@@ -780,6 +921,9 @@ module Polars
|
|
780
921
|
# @param dtype [Object]
|
781
922
|
# The dtype to cast the extracted value to. If nil, the dtype will be
|
782
923
|
# inferred from the JSON value.
|
924
|
+
# @param infer_schema_length [Integer]
|
925
|
+
# The maximum number of rows to scan for schema inference.
|
926
|
+
# If set to `nil`, the full data may be scanned *(this is slow)*.
|
783
927
|
#
|
784
928
|
# @return [Expr]
|
785
929
|
#
|
@@ -1036,6 +1180,8 @@ module Polars
|
|
1036
1180
|
#
|
1037
1181
|
# @param pattern [String]
|
1038
1182
|
# A valid regex pattern
|
1183
|
+
# @param literal [Boolean]
|
1184
|
+
# Treat `pattern` as a literal string, not as a regular expression.
|
1039
1185
|
#
|
1040
1186
|
# @return [Expr]
|
1041
1187
|
#
|
@@ -1177,6 +1323,8 @@ module Polars
|
|
1177
1323
|
# Replacement string.
|
1178
1324
|
# @param literal [Boolean]
|
1179
1325
|
# Treat pattern as a literal string.
|
1326
|
+
# @param n [Integer]
|
1327
|
+
# Number of matches to replace.
|
1180
1328
|
#
|
1181
1329
|
# @return [Expr]
|
1182
1330
|
#
|
@@ -1286,6 +1434,130 @@ module Polars
|
|
1286
1434
|
Utils.wrap_expr(_rbexpr.str_slice(offset, length))
|
1287
1435
|
end
|
1288
1436
|
|
1437
|
+
# Return the first n characters of each string in a String Series.
|
1438
|
+
#
|
1439
|
+
# @param n [Integer]
|
1440
|
+
# Length of the slice (integer or expression). Negative indexing is supported;
|
1441
|
+
# see note (2) below.
|
1442
|
+
#
|
1443
|
+
# @return [Expr]
|
1444
|
+
#
|
1445
|
+
# @note
|
1446
|
+
# 1) The `n` input is defined in terms of the number of characters in the (UTF8)
|
1447
|
+
# string. A character is defined as a [Unicode scalar value](https://www.unicode.org/glossary/#unicode_scalar_value). A single
|
1448
|
+
# character is represented by a single byte when working with ASCII text, and a
|
1449
|
+
# maximum of 4 bytes otherwise.
|
1450
|
+
#
|
1451
|
+
# 2) When the `n` input is negative, `head` returns characters up to the `n`th
|
1452
|
+
# from the end of the string. For example, if `n = -3`, then all characters
|
1453
|
+
# except the last three are returned.
|
1454
|
+
#
|
1455
|
+
# 3) If the string has fewer than `n` characters, the full string is
|
1456
|
+
# returned.
|
1457
|
+
#
|
1458
|
+
# @example Return up to the first 5 characters:
|
1459
|
+
# df = Polars::DataFrame.new({"s" => ["pear", nil, "papaya", "dragonfruit"]})
|
1460
|
+
# df.with_columns(Polars.col("s").str.head(5).alias("s_head_5"))
|
1461
|
+
# # =>
|
1462
|
+
# # shape: (4, 2)
|
1463
|
+
# # ┌─────────────┬──────────┐
|
1464
|
+
# # │ s ┆ s_head_5 │
|
1465
|
+
# # │ --- ┆ --- │
|
1466
|
+
# # │ str ┆ str │
|
1467
|
+
# # ╞═════════════╪══════════╡
|
1468
|
+
# # │ pear ┆ pear │
|
1469
|
+
# # │ null ┆ null │
|
1470
|
+
# # │ papaya ┆ papay │
|
1471
|
+
# # │ dragonfruit ┆ drago │
|
1472
|
+
# # └─────────────┴──────────┘
|
1473
|
+
#
|
1474
|
+
# @example Return characters determined by column `n`:
|
1475
|
+
# df = Polars::DataFrame.new(
|
1476
|
+
# {
|
1477
|
+
# "s" => ["pear", nil, "papaya", "dragonfruit"],
|
1478
|
+
# "n" => [3, 4, -2, -5]
|
1479
|
+
# }
|
1480
|
+
# )
|
1481
|
+
# df.with_columns(Polars.col("s").str.head("n").alias("s_head_n"))
|
1482
|
+
# # =>
|
1483
|
+
# # shape: (4, 3)
|
1484
|
+
# # ┌─────────────┬─────┬──────────┐
|
1485
|
+
# # │ s ┆ n ┆ s_head_n │
|
1486
|
+
# # │ --- ┆ --- ┆ --- │
|
1487
|
+
# # │ str ┆ i64 ┆ str │
|
1488
|
+
# # ╞═════════════╪═════╪══════════╡
|
1489
|
+
# # │ pear ┆ 3 ┆ pea │
|
1490
|
+
# # │ null ┆ 4 ┆ null │
|
1491
|
+
# # │ papaya ┆ -2 ┆ papa │
|
1492
|
+
# # │ dragonfruit ┆ -5 ┆ dragon │
|
1493
|
+
# # └─────────────┴─────┴──────────┘
|
1494
|
+
def head(n)
|
1495
|
+
n = Utils.parse_into_expression(n)
|
1496
|
+
Utils.wrap_expr(_rbexpr.str_head(n))
|
1497
|
+
end
|
1498
|
+
|
1499
|
+
# Return the last n characters of each string in a String Series.
|
1500
|
+
#
|
1501
|
+
# @param n [Integer]
|
1502
|
+
# Length of the slice (integer or expression). Negative indexing is supported;
|
1503
|
+
# see note (2) below.
|
1504
|
+
#
|
1505
|
+
# @return [Expr]
|
1506
|
+
#
|
1507
|
+
# @note
|
1508
|
+
# 1) The `n` input is defined in terms of the number of characters in the (UTF8)
|
1509
|
+
# string. A character is defined as a [Unicode scalar value](https://www.unicode.org/glossary/#unicode_scalar_value). A single
|
1510
|
+
# character is represented by a single byte when working with ASCII text, and a
|
1511
|
+
# maximum of 4 bytes otherwise.
|
1512
|
+
#
|
1513
|
+
# 2) When the `n` input is negative, `tail` returns characters starting from the
|
1514
|
+
# `n`th from the beginning of the string. For example, if `n = -3`, then all
|
1515
|
+
# characters except the first three are returned.
|
1516
|
+
#
|
1517
|
+
# 3) If the string has fewer than `n` characters, the full string is
|
1518
|
+
# returned.
|
1519
|
+
#
|
1520
|
+
# @example Return up to the last 5 characters:
|
1521
|
+
# df = Polars::DataFrame.new({"s" => ["pear", nil, "papaya", "dragonfruit"]})
|
1522
|
+
# df.with_columns(Polars.col("s").str.tail(5).alias("s_tail_5"))
|
1523
|
+
# # =>
|
1524
|
+
# # shape: (4, 2)
|
1525
|
+
# # ┌─────────────┬──────────┐
|
1526
|
+
# # │ s ┆ s_tail_5 │
|
1527
|
+
# # │ --- ┆ --- │
|
1528
|
+
# # │ str ┆ str │
|
1529
|
+
# # ╞═════════════╪══════════╡
|
1530
|
+
# # │ pear ┆ pear │
|
1531
|
+
# # │ null ┆ null │
|
1532
|
+
# # │ papaya ┆ apaya │
|
1533
|
+
# # │ dragonfruit ┆ fruit │
|
1534
|
+
# # └─────────────┴──────────┘
|
1535
|
+
#
|
1536
|
+
# @example Return characters determined by column `n`:
|
1537
|
+
# df = Polars::DataFrame.new(
|
1538
|
+
# {
|
1539
|
+
# "s" => ["pear", nil, "papaya", "dragonfruit"],
|
1540
|
+
# "n" => [3, 4, -2, -5]
|
1541
|
+
# }
|
1542
|
+
# )
|
1543
|
+
# df.with_columns(Polars.col("s").str.tail("n").alias("s_tail_n"))
|
1544
|
+
# # =>
|
1545
|
+
# # shape: (4, 3)
|
1546
|
+
# # ┌─────────────┬─────┬──────────┐
|
1547
|
+
# # │ s ┆ n ┆ s_tail_n │
|
1548
|
+
# # │ --- ┆ --- ┆ --- │
|
1549
|
+
# # │ str ┆ i64 ┆ str │
|
1550
|
+
# # ╞═════════════╪═════╪══════════╡
|
1551
|
+
# # │ pear ┆ 3 ┆ ear │
|
1552
|
+
# # │ null ┆ 4 ┆ null │
|
1553
|
+
# # │ papaya ┆ -2 ┆ paya │
|
1554
|
+
# # │ dragonfruit ┆ -5 ┆ nfruit │
|
1555
|
+
# # └─────────────┴─────┴──────────┘
|
1556
|
+
def tail(n)
|
1557
|
+
n = Utils.parse_into_expression(n)
|
1558
|
+
Utils.wrap_expr(_rbexpr.str_tail(n))
|
1559
|
+
end
|
1560
|
+
|
1289
1561
|
# Convert an Utf8 column into an Int64 column with base radix.
|
1290
1562
|
#
|
1291
1563
|
# @param base [Integer]
|
@@ -1328,9 +1600,9 @@ module Polars
|
|
1328
1600
|
# # │ cafe ┆ 51966 │
|
1329
1601
|
# # │ null ┆ null │
|
1330
1602
|
# # └──────┴────────┘
|
1331
|
-
def to_integer(base: 10, strict: true)
|
1603
|
+
def to_integer(base: 10, dtype: Int64, strict: true)
|
1332
1604
|
base = Utils.parse_into_expression(base, str_as_lit: false)
|
1333
|
-
Utils.wrap_expr(_rbexpr.str_to_integer(base, strict))
|
1605
|
+
Utils.wrap_expr(_rbexpr.str_to_integer(base, dtype, strict))
|
1334
1606
|
end
|
1335
1607
|
|
1336
1608
|
# Parse integers with base radix from strings.
|
@@ -1411,9 +1683,9 @@ module Polars
|
|
1411
1683
|
|
1412
1684
|
# Use the aho-corasick algorithm to replace many matches.
|
1413
1685
|
#
|
1414
|
-
# @param patterns [
|
1686
|
+
# @param patterns [Object]
|
1415
1687
|
# String patterns to search and replace.
|
1416
|
-
# @param replace_with [
|
1688
|
+
# @param replace_with [Object]
|
1417
1689
|
# Strings to replace where a pattern was a match.
|
1418
1690
|
# This can be broadcasted. So it supports many:one and many:many.
|
1419
1691
|
# @param ascii_case_insensitive [Boolean]
|
@@ -1437,7 +1709,7 @@ module Polars
|
|
1437
1709
|
# Polars.col("lyrics")
|
1438
1710
|
# .str.replace_many(
|
1439
1711
|
# ["me", "you", "they"],
|
1440
|
-
# ""
|
1712
|
+
# [""]
|
1441
1713
|
# )
|
1442
1714
|
# .alias("removes_pronouns")
|
1443
1715
|
# )
|
@@ -1473,11 +1745,22 @@ module Polars
|
|
1473
1745
|
# # │ Tell me what you want, what yo… ┆ Tell you what me want, what me… │
|
1474
1746
|
# # │ Can you feel the love tonight ┆ Can me feel the love tonight │
|
1475
1747
|
# # └─────────────────────────────────┴─────────────────────────────────┘
|
1476
|
-
def replace_many(patterns, replace_with, ascii_case_insensitive: false)
|
1748
|
+
def replace_many(patterns, replace_with = Expr::NO_DEFAULT, ascii_case_insensitive: false)
|
1749
|
+
if replace_with == Expr::NO_DEFAULT
|
1750
|
+
if !patterns.is_a?(Hash)
|
1751
|
+
msg = "`replace_with` argument is required if `patterns` argument is not a Hash type"
|
1752
|
+
raise TypeError, msg
|
1753
|
+
end
|
1754
|
+
# Early return in case of an empty mapping.
|
1755
|
+
if patterns.empty?
|
1756
|
+
return Utils.wrap_expr(_rbexpr)
|
1757
|
+
end
|
1758
|
+
replace_with = patterns.values
|
1759
|
+
patterns = patterns.keys
|
1760
|
+
end
|
1761
|
+
|
1477
1762
|
patterns = Utils.parse_into_expression(patterns, str_as_lit: false)
|
1478
|
-
replace_with = Utils.parse_into_expression(
|
1479
|
-
replace_with, str_as_lit: true
|
1480
|
-
)
|
1763
|
+
replace_with = Utils.parse_into_expression(replace_with, str_as_lit: true)
|
1481
1764
|
Utils.wrap_expr(
|
1482
1765
|
_rbexpr.str_replace_many(
|
1483
1766
|
patterns, replace_with, ascii_case_insensitive
|
@@ -1485,6 +1768,149 @@ module Polars
|
|
1485
1768
|
)
|
1486
1769
|
end
|
1487
1770
|
|
1771
|
+
# Use the Aho-Corasick algorithm to extract many matches.
|
1772
|
+
#
|
1773
|
+
# @param patterns [Object]
|
1774
|
+
# String patterns to search.
|
1775
|
+
# @param ascii_case_insensitive [Boolean]
|
1776
|
+
# Enable ASCII-aware case-insensitive matching.
|
1777
|
+
# When this option is enabled, searching will be performed without respect
|
1778
|
+
# to case for ASCII letters (a-z and A-Z) only.
|
1779
|
+
# @param overlapping [Boolean]
|
1780
|
+
# Whether matches may overlap.
|
1781
|
+
#
|
1782
|
+
# @return [Expr]
|
1783
|
+
#
|
1784
|
+
# @note
|
1785
|
+
# This method supports matching on string literals only, and does not support
|
1786
|
+
# regular expression matching.
|
1787
|
+
#
|
1788
|
+
# @example
|
1789
|
+
# df = Polars::DataFrame.new({"values" => ["discontent"]})
|
1790
|
+
# patterns = ["winter", "disco", "onte", "discontent"]
|
1791
|
+
# df.with_columns(
|
1792
|
+
# Polars.col("values")
|
1793
|
+
# .str.extract_many(patterns, overlapping: false)
|
1794
|
+
# .alias("matches"),
|
1795
|
+
# Polars.col("values")
|
1796
|
+
# .str.extract_many(patterns, overlapping: true)
|
1797
|
+
# .alias("matches_overlapping"),
|
1798
|
+
# )
|
1799
|
+
# # =>
|
1800
|
+
# # shape: (1, 3)
|
1801
|
+
# # ┌────────────┬───────────┬─────────────────────────────────┐
|
1802
|
+
# # │ values ┆ matches ┆ matches_overlapping │
|
1803
|
+
# # │ --- ┆ --- ┆ --- │
|
1804
|
+
# # │ str ┆ list[str] ┆ list[str] │
|
1805
|
+
# # ╞════════════╪═══════════╪═════════════════════════════════╡
|
1806
|
+
# # │ discontent ┆ ["disco"] ┆ ["disco", "onte", "discontent"… │
|
1807
|
+
# # └────────────┴───────────┴─────────────────────────────────┘
|
1808
|
+
#
|
1809
|
+
# @example
|
1810
|
+
# df = Polars::DataFrame.new(
|
1811
|
+
# {
|
1812
|
+
# "values" => ["discontent", "rhapsody"],
|
1813
|
+
# "patterns" => [
|
1814
|
+
# ["winter", "disco", "onte", "discontent"],
|
1815
|
+
# ["rhap", "ody", "coalesce"]
|
1816
|
+
# ]
|
1817
|
+
# }
|
1818
|
+
# )
|
1819
|
+
# df.select(Polars.col("values").str.extract_many("patterns"))
|
1820
|
+
# # =>
|
1821
|
+
# # shape: (2, 1)
|
1822
|
+
# # ┌─────────────────┐
|
1823
|
+
# # │ values │
|
1824
|
+
# # │ --- │
|
1825
|
+
# # │ list[str] │
|
1826
|
+
# # ╞═════════════════╡
|
1827
|
+
# # │ ["disco"] │
|
1828
|
+
# # │ ["rhap", "ody"] │
|
1829
|
+
# # └─────────────────┘
|
1830
|
+
def extract_many(
|
1831
|
+
patterns,
|
1832
|
+
ascii_case_insensitive: false,
|
1833
|
+
overlapping: false
|
1834
|
+
)
|
1835
|
+
patterns = Utils.parse_into_expression(patterns, str_as_lit: false)
|
1836
|
+
Utils.wrap_expr(
|
1837
|
+
_rbexpr.str_extract_many(patterns, ascii_case_insensitive, overlapping)
|
1838
|
+
)
|
1839
|
+
end
|
1840
|
+
|
1841
|
+
# Use the Aho-Corasick algorithm to find many matches.
|
1842
|
+
#
|
1843
|
+
# The function will return the bytes offset of the start of each match.
|
1844
|
+
# The return type will be `List<UInt32>`
|
1845
|
+
#
|
1846
|
+
# @param patterns [Object]
|
1847
|
+
# String patterns to search.
|
1848
|
+
# @param ascii_case_insensitive [Boolean]
|
1849
|
+
# Enable ASCII-aware case-insensitive matching.
|
1850
|
+
# When this option is enabled, searching will be performed without respect
|
1851
|
+
# to case for ASCII letters (a-z and A-Z) only.
|
1852
|
+
# @param overlapping [Boolean]
|
1853
|
+
# Whether matches may overlap.
|
1854
|
+
#
|
1855
|
+
# @return [Expr]
|
1856
|
+
#
|
1857
|
+
# @note
|
1858
|
+
# This method supports matching on string literals only, and does not support
|
1859
|
+
# regular expression matching.
|
1860
|
+
#
|
1861
|
+
# @example
|
1862
|
+
# df = Polars::DataFrame.new({"values" => ["discontent"]})
|
1863
|
+
# patterns = ["winter", "disco", "onte", "discontent"]
|
1864
|
+
# df.with_columns(
|
1865
|
+
# Polars.col("values")
|
1866
|
+
# .str.extract_many(patterns, overlapping: false)
|
1867
|
+
# .alias("matches"),
|
1868
|
+
# Polars.col("values")
|
1869
|
+
# .str.extract_many(patterns, overlapping: true)
|
1870
|
+
# .alias("matches_overlapping"),
|
1871
|
+
# )
|
1872
|
+
# # =>
|
1873
|
+
# # shape: (1, 3)
|
1874
|
+
# # ┌────────────┬───────────┬─────────────────────────────────┐
|
1875
|
+
# # │ values ┆ matches ┆ matches_overlapping │
|
1876
|
+
# # │ --- ┆ --- ┆ --- │
|
1877
|
+
# # │ str ┆ list[str] ┆ list[str] │
|
1878
|
+
# # ╞════════════╪═══════════╪═════════════════════════════════╡
|
1879
|
+
# # │ discontent ┆ ["disco"] ┆ ["disco", "onte", "discontent"… │
|
1880
|
+
# # └────────────┴───────────┴─────────────────────────────────┘
|
1881
|
+
#
|
1882
|
+
# @example
|
1883
|
+
# df = Polars::DataFrame.new(
|
1884
|
+
# {
|
1885
|
+
# "values" => ["discontent", "rhapsody"],
|
1886
|
+
# "patterns" => [
|
1887
|
+
# ["winter", "disco", "onte", "discontent"],
|
1888
|
+
# ["rhap", "ody", "coalesce"]
|
1889
|
+
# ]
|
1890
|
+
# }
|
1891
|
+
# )
|
1892
|
+
# df.select(Polars.col("values").str.find_many("patterns"))
|
1893
|
+
# # =>
|
1894
|
+
# # shape: (2, 1)
|
1895
|
+
# # ┌───────────┐
|
1896
|
+
# # │ values │
|
1897
|
+
# # │ --- │
|
1898
|
+
# # │ list[u32] │
|
1899
|
+
# # ╞═══════════╡
|
1900
|
+
# # │ [0] │
|
1901
|
+
# # │ [0, 5] │
|
1902
|
+
# # └───────────┘
|
1903
|
+
def find_many(
|
1904
|
+
patterns,
|
1905
|
+
ascii_case_insensitive: false,
|
1906
|
+
overlapping: false
|
1907
|
+
)
|
1908
|
+
patterns = Utils.parse_into_expression(patterns, str_as_lit: false)
|
1909
|
+
Utils.wrap_expr(
|
1910
|
+
_rbexpr.str_find_many(patterns, ascii_case_insensitive, overlapping)
|
1911
|
+
)
|
1912
|
+
end
|
1913
|
+
|
1488
1914
|
private
|
1489
1915
|
|
1490
1916
|
def _validate_format_argument(format)
|