polars-df 0.15.0-arm64-darwin → 0.16.0-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -14,7 +14,7 @@ gem "polars-df"
14
14
 
15
15
  ## Getting Started
16
16
 
17
- This library follows the [Polars Python API](https://pola-rs.github.io/polars/py-polars/html/reference/index.html).
17
+ This library follows the [Polars Python API](https://docs.pola.rs/api/python/stable/reference/index.html).
18
18
 
19
19
  ```ruby
20
20
  Polars.scan_csv("iris.csv")
@@ -24,7 +24,7 @@ Polars.scan_csv("iris.csv")
24
24
  .collect
25
25
  ```
26
26
 
27
- You can follow [Polars tutorials](https://pola-rs.github.io/polars-book/user-guide/) and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems.
27
+ You can follow [Polars tutorials](https://docs.pola.rs/user-guide/getting-started/) and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems.
28
28
 
29
29
  ## Reference
30
30
 
@@ -88,6 +88,15 @@ From Avro
88
88
  Polars.read_avro("file.avro")
89
89
  ```
90
90
 
91
+ From Delta Lake (requires [deltalake-rb](https://github.com/ankane/delta-ruby)) [experimental, unreleased]
92
+
93
+ ```ruby
94
+ Polars.read_delta("./table")
95
+
96
+ # or lazily with
97
+ Polars.scan_delta("./table")
98
+ ```
99
+
91
100
  From a hash
92
101
 
93
102
  ```ruby
@@ -336,6 +345,32 @@ Parquet
336
345
  df.write_parquet("file.parquet")
337
346
  ```
338
347
 
348
+ JSON
349
+
350
+ ```ruby
351
+ df.write_json("file.json")
352
+ # or
353
+ df.write_ndjson("file.ndjson")
354
+ ```
355
+
356
+ Feather / Arrow IPC
357
+
358
+ ```ruby
359
+ df.write_ipc("file.arrow")
360
+ ```
361
+
362
+ Avro
363
+
364
+ ```ruby
365
+ df.write_avro("file.avro")
366
+ ```
367
+
368
+ Delta Lake [experimental, unreleased]
369
+
370
+ ```ruby
371
+ df.write_delta("./table")
372
+ ```
373
+
339
374
  Numo array
340
375
 
341
376
  ```ruby
Binary file
Binary file
Binary file
@@ -831,7 +831,13 @@ module Polars
831
831
  # Compression method. Defaults to "uncompressed".
832
832
  #
833
833
  # @return [nil]
834
- def write_ipc(file, compression: "uncompressed", compat_level: nil)
834
+ def write_ipc(
835
+ file,
836
+ compression: "uncompressed",
837
+ compat_level: nil,
838
+ storage_options: nil,
839
+ retries: 2
840
+ )
835
841
  return_bytes = file.nil?
836
842
  if return_bytes
837
843
  file = StringIO.new
@@ -849,7 +855,13 @@ module Polars
849
855
  compression = "uncompressed"
850
856
  end
851
857
 
852
- _df.write_ipc(file, compression, compat_level)
858
+ if storage_options&.any?
859
+ storage_options = storage_options.to_a
860
+ else
861
+ storage_options = nil
862
+ end
863
+
864
+ _df.write_ipc(file, compression, compat_level, storage_options, retries)
853
865
  return_bytes ? file.string : nil
854
866
  end
855
867
 
@@ -961,6 +973,61 @@ module Polars
961
973
  )
962
974
  end
963
975
 
976
# Write DataFrame as delta table.
#
# @param target [Object]
#   URI of a table or a DeltaTable object.
# @param mode ["error", "append", "overwrite", "ignore", "merge"]
#   How to handle existing data.
# @param storage_options [Hash]
#   Extra options for the storage backends supported by `deltalake-rb`.
# @param delta_write_options [Hash]
#   Additional keyword arguments while writing a Delta lake Table.
# @param delta_merge_options [Hash]
#   Keyword arguments which are required to `MERGE` a Delta lake Table.
#   Must contain a `:predicate` entry when `mode` is "merge".
#
# @return [nil] for write modes; when `mode` is "merge" the result of
#   `DeltaLake::Table#merge` is returned (presumably a merger/builder object
#   from deltalake-rb — confirm against that gem's API).
def write_delta(
  target,
  mode: "error",
  storage_options: nil,
  delta_write_options: nil,
  delta_merge_options: nil
)
  Polars.send(:_check_if_delta_available)

  # Path-like targets are resolved to a URI; DeltaTable objects pass through.
  if Utils.pathlike?(target)
    target = Polars.send(:_resolve_delta_lake_uri, target.to_s, strict: false)
  end

  data = self

  if mode == "merge"
    if delta_merge_options.nil?
      msg = "You need to pass delta_merge_options with at least a given predicate for `MERGE` to work."
      raise ArgumentError, msg
    end
    if target.is_a?(::String)
      dt = DeltaLake::Table.new(target, storage_options: storage_options)
    else
      dt = target
    end

    # Work on a copy so the caller's hash is not destructively modified
    # (previously `delete(:predicate)` mutated the argument in place).
    merge_options = delta_merge_options.dup
    predicate = merge_options.delete(:predicate)
    dt.merge(data, predicate, **merge_options)
  else
    delta_write_options ||= {}

    DeltaLake.write(
      target,
      data,
      mode: mode,
      storage_options: storage_options,
      **delta_write_options
    )
  end
end
1030
+
964
1031
  # Return an estimation of the total (heap) allocated size of the DataFrame.
965
1032
  #
966
1033
  # Estimated size is given in the specified unit (bytes by default).
@@ -3939,14 +4006,32 @@ module Polars
3939
4006
  # # ╞═════╪═════╪═════╡
3940
4007
  # # │ 3 ┆ 8 ┆ c │
3941
4008
  # # └─────┴─────┴─────┘
3942
def max
  # Column-wise maximum via the lazy engine, collected eagerly.
  lazy.max.collect(_eager: true)
end

# Get the maximum value horizontally across columns.
#
# @return [Series]
#
# @example
#   df = Polars::DataFrame.new(
#     {
#       "foo" => [1, 2, 3],
#       "bar" => [4.0, 5.0, 6.0]
#     }
#   )
#   df.max_horizontal
#   # =>
#   # shape: (3,)
#   # Series: 'max' [f64]
#   # [
#   #        4.0
#   #        5.0
#   #        6.0
#   # ]
def max_horizontal
  # Row-wise maximum over every column, surfaced as a Series named "max".
  row_max = F.max_horizontal(F.all)
  select(max: row_max).to_series
end
3951
4036
 
3952
4037
  # Aggregate the columns of this DataFrame to their minimum value.
@@ -3971,22 +4056,35 @@ module Polars
3971
4056
  # # ╞═════╪═════╪═════╡
3972
4057
  # # │ 1 ┆ 6 ┆ a │
3973
4058
  # # └─────┴─────┴─────┘
3974
def min
  # Column-wise minimum via the lazy engine, collected eagerly.
  lazy.min.collect(_eager: true)
end

# Get the minimum value horizontally across columns.
#
# @return [Series]
#
# @example
#   df = Polars::DataFrame.new(
#     {
#       "foo" => [1, 2, 3],
#       "bar" => [4.0, 5.0, 6.0]
#     }
#   )
#   df.min_horizontal
#   # =>
#   # shape: (3,)
#   # Series: 'min' [f64]
#   # [
#   #        1.0
#   #        2.0
#   #        3.0
#   # ]
def min_horizontal
  # Row-wise minimum over every column, surfaced as a Series named "min".
  row_min = F.min_horizontal(F.all)
  select(min: row_min).to_series
end
4086
+
4087
+ # Aggregate the columns of this DataFrame to their sum value.
3990
4088
  #
3991
4089
  # @return [DataFrame]
3992
4090
  #
@@ -4008,35 +4106,42 @@ module Polars
4008
4106
  # # ╞═════╪═════╪══════╡
4009
4107
  # # │ 6 ┆ 21 ┆ null │
4010
4108
  # # └─────┴─────┴──────┘
4109
def sum
  # Column-wise sum via the lazy engine, collected eagerly.
  lazy.sum.collect(_eager: true)
end

# Sum all values horizontally across columns.
#
# @param ignore_nulls [Boolean]
#   Ignore null values (default).
#   If set to `false`, any null value in the input will lead to a null output.
#
# @return [Series]
#
# @example
#   df = Polars::DataFrame.new(
#     {
#       "foo" => [1, 2, 3],
#       "bar" => [4.0, 5.0, 6.0]
#     }
#   )
#   df.sum_horizontal
#   # =>
#   # shape: (3,)
#   # Series: 'sum' [f64]
#   # [
#   #        5.0
#   #        7.0
#   #        9.0
#   # ]
def sum_horizontal(ignore_nulls: true)
  # Row-wise sum over every column, surfaced as a Series named "sum".
  row_sum = F.sum_horizontal(F.all, ignore_nulls: ignore_nulls)
  select(sum: row_sum).to_series
end
4032
4142
 
4033
4143
  # Aggregate the columns of this DataFrame to their mean value.
4034
4144
  #
4035
- # @param axis [Integer]
4036
- # Either 0 or 1.
4037
- # @param null_strategy ["ignore", "propagate"]
4038
- # This argument is only used if axis == 1.
4039
- #
4040
4145
  # @return [DataFrame]
4041
4146
  #
4042
4147
  # @example
@@ -4057,15 +4162,38 @@ module Polars
4057
4162
  # # ╞═════╪═════╪══════╡
4058
4163
  # # │ 2.0 ┆ 7.0 ┆ null │
4059
4164
  # # └─────┴─────┴──────┘
4060
def mean
  # Column-wise mean via the lazy engine, collected eagerly.
  lazy.mean.collect(_eager: true)
end

# Take the mean of all values horizontally across columns.
#
# @param ignore_nulls [Boolean]
#   Ignore null values (default).
#   If set to `false`, any null value in the input will lead to a null output.
#
# @return [Series]
#
# @example
#   df = Polars::DataFrame.new(
#     {
#       "foo" => [1, 2, 3],
#       "bar" => [4.0, 5.0, 6.0]
#     }
#   )
#   df.mean_horizontal
#   # =>
#   # shape: (3,)
#   # Series: 'mean' [f64]
#   # [
#   #        2.5
#   #        3.5
#   #        4.5
#   # ]
def mean_horizontal(ignore_nulls: true)
  # Row-wise mean over every column, surfaced as a Series named "mean".
  row_mean = F.mean_horizontal(F.all, ignore_nulls: ignore_nulls)
  select(mean: row_mean).to_series
end
4070
4198
 
4071
4199
  # Aggregate the columns of this DataFrame to their standard deviation value.
@@ -311,7 +311,7 @@ module Polars
311
311
  end
312
312
 
313
313
  if categories.empty?
314
- self.categories = Series.new("category", [], dtype: String)
314
+ @categories = Series.new("category", [], dtype: String)
315
315
  return
316
316
  end
317
317
 
@@ -143,6 +143,9 @@ module Polars
143
143
  # @param exprs [Array]
144
144
  # Column(s) to use in the aggregation. Accepts expression input. Strings are
145
145
  # parsed as column names, other non-expression inputs are parsed as literals.
146
+ # @param ignore_nulls [Boolean]
147
+ # Ignore null values (default).
148
+ # If set to `false`, any null value in the input will lead to a null output.
146
149
  #
147
150
  # @return [Expr]
148
151
  #
@@ -166,9 +169,9 @@ module Polars
166
169
  # # │ 8 ┆ 5 ┆ y ┆ 13 │
167
170
  # # │ 3 ┆ null ┆ z ┆ 3 │
168
171
  # # └─────┴──────┴─────┴─────┘
169
def sum_horizontal(*exprs, ignore_nulls: true)
  # Parse all inputs to expressions, then delegate to the native horizontal sum.
  parsed_exprs = Utils.parse_into_list_of_expressions(*exprs)
  Utils.wrap_expr(Plr.sum_horizontal(parsed_exprs, ignore_nulls))
end
173
176
 
174
177
  # Compute the mean of all values horizontally across columns.
@@ -176,6 +179,9 @@ module Polars
176
179
  # @param exprs [Array]
177
180
  # Column(s) to use in the aggregation. Accepts expression input. Strings are
178
181
  # parsed as column names, other non-expression inputs are parsed as literals.
182
+ # @param ignore_nulls [Boolean]
183
+ # Ignore null values (default).
184
+ # If set to `false`, any null value in the input will lead to a null output.
179
185
  #
180
186
  # @return [Expr]
181
187
  #
@@ -199,9 +205,9 @@ module Polars
199
205
  # # │ 8 ┆ 5 ┆ y ┆ 6.5 │
200
206
  # # │ 3 ┆ null ┆ z ┆ 3.0 │
201
207
  # # └─────┴──────┴─────┴──────┘
202
def mean_horizontal(*exprs, ignore_nulls: true)
  # Parse all inputs to expressions, then delegate to the native horizontal mean.
  parsed_exprs = Utils.parse_into_list_of_expressions(*exprs)
  Utils.wrap_expr(Plr.mean_horizontal(parsed_exprs, ignore_nulls))
end
206
212
 
207
213
  # Cumulatively sum all values horizontally across columns.
@@ -729,16 +729,20 @@ module Polars
729
729
  a,
730
730
  b,
731
731
  method: "pearson",
732
- ddof: 1,
732
+ ddof: nil,
733
733
  propagate_nans: false
734
734
  )
735
+ if !ddof.nil?
736
+ warn "The `ddof` parameter has no effect. Do not use it."
737
+ end
738
+
735
739
  a = Utils.parse_into_expression(a)
736
740
  b = Utils.parse_into_expression(b)
737
741
 
738
742
  if method == "pearson"
739
- Utils.wrap_expr(Plr.pearson_corr(a, b, ddof))
743
+ Utils.wrap_expr(Plr.pearson_corr(a, b))
740
744
  elsif method == "spearman"
741
- Utils.wrap_expr(Plr.spearman_rank_corr(a, b, ddof, propagate_nans))
745
+ Utils.wrap_expr(Plr.spearman_rank_corr(a, b, propagate_nans))
742
746
  else
743
747
  msg = "method must be one of {{'pearson', 'spearman'}}, got #{method}"
744
748
  raise ArgumentError, msg
@@ -0,0 +1,126 @@
1
module Polars
  module IO
    # Reads into a DataFrame from a Delta lake table.
    #
    # @param source [Object]
    #   DeltaTable or a Path or URI to the root of the Delta lake table.
    # @param version [Object]
    #   Numerical version or timestamp version of the Delta lake table.
    # @param columns [Array]
    #   Columns to select. Accepts a list of column names.
    # @param rechunk [Boolean]
    #   Make sure that all columns are contiguous in memory by
    #   aggregating the chunks into a single array.
    # @param storage_options [Hash]
    #   Extra options for the storage backends supported by `deltalake-rb`.
    # @param delta_table_options [Hash]
    #   Additional keyword arguments while reading a Delta lake Table.
    #
    # @return [DataFrame]
    def read_delta(
      source,
      version: nil,
      columns: nil,
      rechunk: false,
      storage_options: nil,
      delta_table_options: nil
    )
      dl_tbl =
        _get_delta_lake_table(
          source,
          version: version,
          storage_options: storage_options,
          delta_table_options: delta_table_options
        )

      dl_tbl.to_polars(columns: columns, rechunk: rechunk)
    end

    # Lazily read from a Delta lake table.
    #
    # @param source [Object]
    #   DeltaTable or a Path or URI to the root of the Delta lake table.
    # @param version [Object]
    #   Numerical version or timestamp version of the Delta lake table.
    # @param storage_options [Hash]
    #   Extra options for the storage backends supported by `deltalake-rb`.
    # @param delta_table_options [Hash]
    #   Additional keyword arguments while reading a Delta lake Table.
    #
    # @return [LazyFrame]
    def scan_delta(
      source,
      version: nil,
      storage_options: nil,
      delta_table_options: nil
    )
      dl_tbl =
        _get_delta_lake_table(
          source,
          version: version,
          storage_options: storage_options,
          delta_table_options: delta_table_options
        )

      dl_tbl.to_polars(eager: false)
    end

    private

    # Normalize a table location: plain filesystem paths (no URI scheme) are
    # expanded via Utils.normalize_filepath; real URIs pass through untouched.
    # `strict:` is kept for interface compatibility; it is currently unused here.
    def _resolve_delta_lake_uri(table_uri, strict: true)
      require "uri"

      parsed_result = URI(table_uri)

      # Ruby's URI("./table").scheme is nil (Python's urlparse returns ""), so
      # compare via to_s — otherwise local paths would never be normalized.
      if parsed_result.scheme.to_s == ""
        Utils.normalize_filepath(table_uri)
      else
        table_uri
      end
    end

    # Resolve `table_path` to a DeltaLake::Table, applying the requested
    # version: numeric versions go through the constructor; String/Time
    # versions are loaded afterwards via load_as_version.
    def _get_delta_lake_table(
      table_path,
      version: nil,
      storage_options: nil,
      delta_table_options: nil
    )
      _check_if_delta_available

      # Already a table object: use it as-is.
      if table_path.is_a?(DeltaLake::Table)
        return table_path
      end
      delta_table_options ||= {}
      resolved_uri = _resolve_delta_lake_uri(table_path)
      if !version.is_a?(::String) && !version.is_a?(::Time)
        dl_tbl =
          DeltaLake::Table.new(
            resolved_uri,
            version: version,
            storage_options: storage_options,
            **delta_table_options
          )
      else
        dl_tbl =
          DeltaLake::Table.new(
            resolved_uri,
            storage_options: storage_options,
            **delta_table_options
          )
        dl_tbl.load_as_version(version)
      end

      # BUG FIX: a trailing `dl_tbl = DeltaLake::Table.new(table_path)` used to
      # overwrite the table built above, silently discarding the requested
      # version, storage options, and table options.
      dl_tbl
    end

    # Raise when the deltalake-rb gem has not been loaded.
    def _check_if_delta_available
      if !defined?(DeltaLake)
        raise Error, "Delta Lake not available"
      end
    end
  end
end
@@ -431,7 +431,9 @@ module Polars
431
431
  projection_pushdown: true,
432
432
  simplify_expression: true,
433
433
  no_optimization: false,
434
- slice_pushdown: true
434
+ slice_pushdown: true,
435
+ storage_options: nil,
436
+ retries: 2
435
437
  )
436
438
  lf = _set_sink_optimizations(
437
439
  type_coercion: type_coercion,
@@ -460,6 +462,12 @@ module Polars
460
462
  }
461
463
  end
462
464
 
465
+ if storage_options&.any?
466
+ storage_options = storage_options.to_a
467
+ else
468
+ storage_options = nil
469
+ end
470
+
463
471
  lf.sink_parquet(
464
472
  path,
465
473
  compression,
@@ -467,7 +475,9 @@ module Polars
467
475
  statistics,
468
476
  row_group_size,
469
477
  data_pagesize_limit,
470
- maintain_order
478
+ maintain_order,
479
+ storage_options,
480
+ retries
471
481
  )
472
482
  end
473
483
 
@@ -512,6 +522,10 @@ module Polars
512
522
  slice_pushdown: true,
513
523
  no_optimization: false
514
524
  )
525
+ # TODO support storage options in Rust
526
+ storage_options = nil
527
+ retries = 2
528
+
515
529
  lf = _set_sink_optimizations(
516
530
  type_coercion: type_coercion,
517
531
  predicate_pushdown: predicate_pushdown,
@@ -521,10 +535,18 @@ module Polars
521
535
  no_optimization: no_optimization
522
536
  )
523
537
 
538
+ if storage_options&.any?
539
+ storage_options = storage_options.to_a
540
+ else
541
+ storage_options = nil
542
+ end
543
+
524
544
  lf.sink_ipc(
525
545
  path,
526
546
  compression,
527
- maintain_order
547
+ maintain_order,
548
+ storage_options,
549
+ retries
528
550
  )
529
551
  end
530
552
 
@@ -692,7 +714,9 @@ module Polars
692
714
  projection_pushdown: true,
693
715
  simplify_expression: true,
694
716
  slice_pushdown: true,
695
- no_optimization: false
717
+ no_optimization: false,
718
+ storage_options: nil,
719
+ retries: 2
696
720
  )
697
721
  lf = _set_sink_optimizations(
698
722
  type_coercion: type_coercion,
@@ -703,7 +727,13 @@ module Polars
703
727
  no_optimization: no_optimization
704
728
  )
705
729
 
706
- lf.sink_json(path, maintain_order)
730
+ if storage_options&.any?
731
+ storage_options = storage_options.to_a
732
+ else
733
+ storage_options = nil
734
+ end
735
+
736
+ lf.sink_json(path, maintain_order, storage_options, retries)
707
737
  end
708
738
 
709
739
  # @private