polars-df 0.20.0-x86_64-darwin → 0.21.1-x86_64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +27 -0
  3. data/Cargo.lock +192 -186
  4. data/LICENSE-THIRD-PARTY.txt +1431 -1810
  5. data/LICENSE.txt +1 -1
  6. data/lib/polars/3.2/polars.bundle +0 -0
  7. data/lib/polars/3.3/polars.bundle +0 -0
  8. data/lib/polars/3.4/polars.bundle +0 -0
  9. data/lib/polars/array_expr.rb +382 -3
  10. data/lib/polars/array_name_space.rb +281 -0
  11. data/lib/polars/binary_expr.rb +67 -0
  12. data/lib/polars/binary_name_space.rb +43 -0
  13. data/lib/polars/cat_expr.rb +224 -0
  14. data/lib/polars/cat_name_space.rb +130 -32
  15. data/lib/polars/catalog/unity/catalog_info.rb +20 -0
  16. data/lib/polars/catalog/unity/column_info.rb +31 -0
  17. data/lib/polars/catalog/unity/namespace_info.rb +21 -0
  18. data/lib/polars/catalog/unity/table_info.rb +50 -0
  19. data/lib/polars/catalog.rb +448 -0
  20. data/lib/polars/config.rb +2 -2
  21. data/lib/polars/convert.rb +12 -2
  22. data/lib/polars/data_frame.rb +834 -48
  23. data/lib/polars/data_type_expr.rb +52 -0
  24. data/lib/polars/data_types.rb +61 -5
  25. data/lib/polars/date_time_expr.rb +251 -0
  26. data/lib/polars/date_time_name_space.rb +299 -0
  27. data/lib/polars/exceptions.rb +7 -2
  28. data/lib/polars/expr.rb +1247 -211
  29. data/lib/polars/functions/col.rb +6 -5
  30. data/lib/polars/functions/datatype.rb +21 -0
  31. data/lib/polars/functions/lazy.rb +127 -15
  32. data/lib/polars/functions/repeat.rb +4 -0
  33. data/lib/polars/io/csv.rb +19 -1
  34. data/lib/polars/io/json.rb +16 -0
  35. data/lib/polars/io/ndjson.rb +13 -0
  36. data/lib/polars/io/parquet.rb +70 -66
  37. data/lib/polars/io/scan_options.rb +47 -0
  38. data/lib/polars/lazy_frame.rb +1099 -95
  39. data/lib/polars/list_expr.rb +400 -11
  40. data/lib/polars/list_name_space.rb +321 -5
  41. data/lib/polars/meta_expr.rb +71 -22
  42. data/lib/polars/name_expr.rb +36 -0
  43. data/lib/polars/scan_cast_options.rb +64 -0
  44. data/lib/polars/schema.rb +84 -3
  45. data/lib/polars/selector.rb +210 -0
  46. data/lib/polars/selectors.rb +932 -203
  47. data/lib/polars/series.rb +1083 -63
  48. data/lib/polars/string_expr.rb +435 -9
  49. data/lib/polars/string_name_space.rb +729 -45
  50. data/lib/polars/struct_expr.rb +103 -0
  51. data/lib/polars/struct_name_space.rb +19 -1
  52. data/lib/polars/utils/parse.rb +40 -0
  53. data/lib/polars/utils/various.rb +18 -1
  54. data/lib/polars/utils.rb +9 -1
  55. data/lib/polars/version.rb +1 -1
  56. data/lib/polars.rb +10 -0
  57. metadata +12 -2
@@ -0,0 +1,52 @@
1
+ module Polars
2
+ # A lazily instantiated `DataType` that can be used in an `Expr`.
3
+ class DataTypeExpr
4
+ # @private
5
+ attr_accessor :_rbdatatype_expr
6
+
7
+ # @private
8
+ def self._from_rbdatatype_expr(rbdatatype_expr)
9
+ slf = new
10
+ slf._rbdatatype_expr = rbdatatype_expr
11
+ slf
12
+ end
13
+
14
+ # Materialize the `DataTypeExpr` in a specific context.
15
+ #
16
+ # This is a useful function when debugging datatype expressions.
17
+ #
18
+ # @return [DataType]
19
+ #
20
+ # @example
21
+ # lf = Polars::LazyFrame.new(
22
+ # {
23
+ # "a" => [1, 2, 3]
24
+ # }
25
+ # )
26
+ # Polars.dtype_of("a").collect_dtype(lf)
27
+ # # => Polars::Int64
28
+ #
29
+ # @example
30
+ # Polars.dtype_of("a").collect_dtype({"a" => Polars::String})
31
+ # # => Polars::String
32
+ def collect_dtype(
33
+ context
34
+ )
35
+ schema = nil
36
+ if context.is_a?(Schema)
37
+ schema = context
38
+ elsif context.is_a?(Hash)
39
+ schema = Schema.new(context)
40
+ elsif context.is_a?(DataFrame)
41
+ schema = context.schema
42
+ elsif context.is_a?(LazyFrame)
43
+ schema = context.collect_schema
44
+ else
45
+ msg = "DataTypeExpr.collect_dtype did not expect #{context.inspect}"
46
+ raise TypeError, msg
47
+ end
48
+
49
+ _rbdatatype_expr.collect_dtype(schema.to_h)
50
+ end
51
+ end
52
+ end
@@ -99,7 +99,18 @@ module Polars
99
99
  self < NestedType
100
100
  end
101
101
 
102
- [:numeric?, :decimal?, :integer?, :signed_integer?, :unsigned_integer?, :float?, :temporal?, :nested?].each do |v|
102
+ # Return a `DataTypeExpr` with a static `DataType`.
103
+ #
104
+ # @return [DataTypeExpr]
105
+ #
106
+ # @example
107
+ # Polars::Int16.new.to_dtype_expr.collect_dtype({})
108
+ # # => Polars::Int16
109
+ def self.to_dtype_expr
110
+ DataTypeExpr._from_rbdatatype_expr(RbDataTypeExpr.from_dtype(self))
111
+ end
112
+
113
+ [:numeric?, :decimal?, :integer?, :signed_integer?, :unsigned_integer?, :float?, :temporal?, :nested?, :to_dtype_expr].each do |v|
103
114
  define_method(v) do
104
115
  self.class.public_send(v)
105
116
  end
@@ -294,12 +305,57 @@ module Polars
294
305
  end
295
306
  end
296
307
 
308
+ # A named collection of categories for `Categorical`.
309
+ #
310
+ # Two categories are considered equal (and will use the same physical mapping of
311
+ # categories to strings) if they have the same name, namespace and physical backing
312
+ # type, even if they are created in separate calls to `Categories`.
313
+ #
314
+ # @note
315
+ # This functionality is currently considered **unstable**. It may be
316
+ # changed at any point without it being considered a breaking change.
317
+ class Categories
318
+ attr_accessor :_categories
319
+
320
+ def initialize
321
+ # TODO fix
322
+ name = nil
323
+ if name.nil? || name == ""
324
+ @_categories = RbCategories.global_categories
325
+ return
326
+ end
327
+
328
+ raise Todo
329
+ end
330
+
331
+ # @private
332
+ def self._from_rb_categories(rb_categories)
333
+ slf = new
334
+ slf._categories = rb_categories
335
+ slf
336
+ end
337
+ end
338
+
297
339
  # A categorical encoding of a set of strings.
298
340
  class Categorical < DataType
299
- attr_reader :ordering
341
+ attr_reader :ordering, :categories
300
342
 
301
- def initialize(ordering = "physical")
302
- @ordering = ordering
343
+ def initialize(ordering = "physical", **kwargs)
344
+ if ordering.is_a?(Categories)
345
+ @ordering = "lexical"
346
+ @categories = ordering
347
+ # assert kwargs.length == 0
348
+ return
349
+ end
350
+
351
+ @ordering = "lexical"
352
+ if kwargs[:categories]
353
+ # assert kwargs.length == 1
354
+ @categories = kwargs[:categories]
355
+ else
356
+ # assert kwargs.length == 0
357
+ @categories = Categories.new
358
+ end
303
359
  end
304
360
  end
305
361
 
@@ -357,7 +413,7 @@ module Polars
357
413
  class Object < DataType
358
414
  end
359
415
 
360
- # Type representing Null / None values.
416
+ # Type representing Null / nil values.
361
417
  class Null < DataType
362
418
  end
363
419
 
@@ -9,6 +9,57 @@ module Polars
9
9
  self._rbexpr = expr._rbexpr
10
10
  end
11
11
 
12
+ # Offset by `n` business days.
13
+ #
14
+ # @note
15
+ # This functionality is considered **unstable**. It may be changed
16
+ # at any point without it being considered a breaking change.
17
+ #
18
+ # @param n
19
+ # Number of business days to offset by. Can be a single number or an
20
+ # expression.
21
+ # @param week_mask
22
+ # Which days of the week to count. The default is Monday to Friday.
23
+ # If you wanted to count only Monday to Thursday, you would pass
24
+ # `[true, true, true, true, false, false, false]`.
25
+ # @param roll
26
+ # What to do when the start date lands on a non-business day. Options are:
27
+ #
28
+ # - `'raise'`: raise an error
29
+ # - `'forward'`: move to the next business day
30
+ # - `'backward'`: move to the previous business day
31
+ #
32
+ # @return [Expr]
33
+ #
34
+ # @example
35
+ # df = Polars::DataFrame.new({"start" => [Date.new(2020, 1, 1), Date.new(2020, 1, 2)]})
36
+ # df.with_columns(result: Polars.col("start").dt.add_business_days(5))
37
+ # # =>
38
+ # # shape: (2, 2)
39
+ # # ┌────────────┬────────────┐
40
+ # # │ start ┆ result │
41
+ # # │ --- ┆ --- │
42
+ # # │ date ┆ date │
43
+ # # ╞════════════╪════════════╡
44
+ # # │ 2020-01-01 ┆ 2020-01-08 │
45
+ # # │ 2020-01-02 ┆ 2020-01-09 │
46
+ # # └────────────┴────────────┘
47
+ def add_business_days(
48
+ n,
49
+ week_mask: [true, true, true, true, true, false, false],
50
+ roll: "raise"
51
+ )
52
+ n_rbexpr = Utils.parse_into_expression(n)
53
+ Utils.wrap_expr(
54
+ _rbexpr.dt_add_business_days(
55
+ n_rbexpr,
56
+ week_mask,
57
+ [],
58
+ roll
59
+ )
60
+ )
61
+ end
62
+
12
63
  # Divide the date/datetime range into buckets.
13
64
  #
14
65
  # Each date/datetime is mapped to the start of its bucket.
@@ -203,6 +254,93 @@ module Polars
203
254
  Utils.wrap_expr(_rbexpr.dt_round(every))
204
255
  end
205
256
 
257
+ # Replace time unit.
258
+ #
259
+ # @param year [Object]
260
+ # Column or literal.
261
+ # @param month [Object]
262
+ # Column or literal, ranging from 1-12.
263
+ # @param day [Object]
264
+ # Column or literal, ranging from 1-31.
265
+ # @param hour [Object]
266
+ # Column or literal, ranging from 0-23.
267
+ # @param minute [Object]
268
+ # Column or literal, ranging from 0-59.
269
+ # @param second [Object]
270
+ # Column or literal, ranging from 0-59.
271
+ # @param microsecond [Object]
272
+ # Column or literal, ranging from 0-999999.
273
+ # @param ambiguous [String]
274
+ # Determine how to deal with ambiguous datetimes:
275
+ #
276
+ # - `'raise'` (default): raise
277
+ # - `'earliest'`: use the earliest datetime
278
+ # - `'latest'`: use the latest datetime
279
+ # - `'null'`: set to null
280
+ #
281
+ # @return [Expr]
282
+ #
283
+ # @example
284
+ # df = Polars::DataFrame.new(
285
+ # {
286
+ # "date" => [Date.new(2024, 4, 1), Date.new(2025, 3, 16)],
287
+ # "new_day" => [10, 15]
288
+ # }
289
+ # )
290
+ # df.with_columns(Polars.col("date").dt.replace(day: "new_day").alias("replaced"))
291
+ # # =>
292
+ # # shape: (2, 3)
293
+ # # ┌────────────┬─────────┬────────────┐
294
+ # # │ date ┆ new_day ┆ replaced │
295
+ # # │ --- ┆ --- ┆ --- │
296
+ # # │ date ┆ i64 ┆ date │
297
+ # # ╞════════════╪═════════╪════════════╡
298
+ # # │ 2024-04-01 ┆ 10 ┆ 2024-04-10 │
299
+ # # │ 2025-03-16 ┆ 15 ┆ 2025-03-15 │
300
+ # # └────────────┴─────────┴────────────┘
301
+ #
302
+ # @example
303
+ # df.with_columns(Polars.col("date").dt.replace(year: 1800).alias("replaced"))
304
+ # # =>
305
+ # # shape: (2, 3)
306
+ # # ┌────────────┬─────────┬────────────┐
307
+ # # │ date ┆ new_day ┆ replaced │
308
+ # # │ --- ┆ --- ┆ --- │
309
+ # # │ date ┆ i64 ┆ date │
310
+ # # ╞════════════╪═════════╪════════════╡
311
+ # # │ 2024-04-01 ┆ 10 ┆ 1800-04-01 │
312
+ # # │ 2025-03-16 ┆ 15 ┆ 1800-03-16 │
313
+ # # └────────────┴─────────┴────────────┘
314
+ def replace(
315
+ year: nil,
316
+ month: nil,
317
+ day: nil,
318
+ hour: nil,
319
+ minute: nil,
320
+ second: nil,
321
+ microsecond: nil,
322
+ ambiguous: "raise"
323
+ )
324
+ day, month, year, hour, minute, second, microsecond = (
325
+ Utils.parse_into_list_of_expressions(
326
+ day, month, year, hour, minute, second, microsecond
327
+ )
328
+ )
329
+ ambiguous_expr = Utils.parse_into_expression(ambiguous, str_as_lit: true)
330
+ Utils.wrap_expr(
331
+ _rbexpr.dt_replace(
332
+ year,
333
+ month,
334
+ day,
335
+ hour,
336
+ minute,
337
+ second,
338
+ microsecond,
339
+ ambiguous_expr
340
+ )
341
+ )
342
+ end
343
+
206
344
  # Create a naive Datetime from an existing Date/Datetime expression and a Time.
207
345
  #
208
346
  # If the underlying expression is a Datetime then its time component is replaced,
@@ -317,6 +455,82 @@ module Polars
317
455
  Utils.wrap_expr(_rbexpr.strftime(fmt))
318
456
  end
319
457
 
458
+ # Extract the millennium from underlying representation.
459
+ #
460
+ # Applies to Date and Datetime columns.
461
+ #
462
+ # Returns the millennium number in the calendar date.
463
+ #
464
+ # @return [Expr]
465
+ #
466
+ # @example
467
+ # df = Polars::DataFrame.new(
468
+ # {
469
+ # "date" => [
470
+ # Date.new(999, 12, 31),
471
+ # Date.new(1897, 5, 7),
472
+ # Date.new(2000, 1, 1),
473
+ # Date.new(2001, 7, 5),
474
+ # Date.new(3002, 10, 20)
475
+ # ]
476
+ # }
477
+ # )
478
+ # df.with_columns(mlnm: Polars.col("date").dt.millennium)
479
+ # # =>
480
+ # # shape: (5, 2)
481
+ # # ┌────────────┬──────┐
482
+ # # │ date ┆ mlnm │
483
+ # # │ --- ┆ --- │
484
+ # # │ date ┆ i32 │
485
+ # # ╞════════════╪══════╡
486
+ # # │ 0999-12-31 ┆ 1 │
487
+ # # │ 1897-05-07 ┆ 2 │
488
+ # # │ 2000-01-01 ┆ 2 │
489
+ # # │ 2001-07-05 ┆ 3 │
490
+ # # │ 3002-10-20 ┆ 4 │
491
+ # # └────────────┴──────┘
492
+ def millennium
493
+ Utils.wrap_expr(_rbexpr.dt_millennium)
494
+ end
495
+
496
+ # Extract the century from underlying representation.
497
+ #
498
+ # Applies to Date and Datetime columns.
499
+ #
500
+ # Returns the century number in the calendar date.
501
+ #
502
+ # @return [Expr]
503
+ #
504
+ # @example
505
+ # df = Polars::DataFrame.new(
506
+ # {
507
+ # "date" => [
508
+ # Date.new(999, 12, 31),
509
+ # Date.new(1897, 5, 7),
510
+ # Date.new(2000, 1, 1),
511
+ # Date.new(2001, 7, 5),
512
+ # Date.new(3002, 10, 20)
513
+ # ]
514
+ # }
515
+ # )
516
+ # df.with_columns(cent: Polars.col("date").dt.century)
517
+ # # =>
518
+ # # shape: (5, 2)
519
+ # # ┌────────────┬──────┐
520
+ # # │ date ┆ cent │
521
+ # # │ --- ┆ --- │
522
+ # # │ date ┆ i32 │
523
+ # # ╞════════════╪══════╡
524
+ # # │ 0999-12-31 ┆ 10 │
525
+ # # │ 1897-05-07 ┆ 19 │
526
+ # # │ 2000-01-01 ┆ 20 │
527
+ # # │ 2001-07-05 ┆ 21 │
528
+ # # │ 3002-10-20 ┆ 31 │
529
+ # # └────────────┴──────┘
530
+ def century
531
+ Utils.wrap_expr(_rbexpr.dt_century)
532
+ end
533
+
320
534
  # Extract year from underlying Date representation.
321
535
  #
322
536
  # Applies to Date and Datetime columns.
@@ -348,6 +562,43 @@ module Polars
348
562
  Utils.wrap_expr(_rbexpr.dt_year)
349
563
  end
350
564
 
565
+ # Determine whether each day lands on a business day.
566
+ #
567
+ # @note
568
+ # This functionality is considered **unstable**. It may be changed
569
+ # at any point without it being considered a breaking change.
570
+ #
571
+ # @param week_mask [Array]
572
+ # Which days of the week to count. The default is Monday to Friday.
573
+ # If you wanted to count only Monday to Thursday, you would pass
574
+ # `[true, true, true, true, false, false, false]`.
575
+ #
576
+ # @return [Expr]
577
+ #
578
+ # @example
579
+ # df = Polars::DataFrame.new({"start" => [Date.new(2020, 1, 3), Date.new(2020, 1, 5)]})
580
+ # df.with_columns(is_business_day: Polars.col("start").dt.is_business_day)
581
+ # # =>
582
+ # # shape: (2, 2)
583
+ # # ┌────────────┬─────────────────┐
584
+ # # │ start ┆ is_business_day │
585
+ # # │ --- ┆ --- │
586
+ # # │ date ┆ bool │
587
+ # # ╞════════════╪═════════════════╡
588
+ # # │ 2020-01-03 ┆ true │
589
+ # # │ 2020-01-05 ┆ false │
590
+ # # └────────────┴─────────────────┘
591
+ def is_business_day(
592
+ week_mask: [true, true, true, true, true, false, false]
593
+ )
594
+ Utils.wrap_expr(
595
+ _rbexpr.dt_is_business_day(
596
+ week_mask,
597
+ []
598
+ )
599
+ )
600
+ end
601
+
351
602
  # Determine whether the year of the underlying date is a leap year.
352
603
  #
353
604
  # Applies to Date and Datetime columns.