polars-df 0.15.0-arm64-darwin → 0.16.0-arm64-darwin

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -14,7 +14,7 @@ gem "polars-df"
14
14
 
15
15
  ## Getting Started
16
16
 
17
- This library follows the [Polars Python API](https://pola-rs.github.io/polars/py-polars/html/reference/index.html).
17
+ This library follows the [Polars Python API](https://docs.pola.rs/api/python/stable/reference/index.html).
18
18
 
19
19
  ```ruby
20
20
  Polars.scan_csv("iris.csv")
@@ -24,7 +24,7 @@ Polars.scan_csv("iris.csv")
24
24
  .collect
25
25
  ```
26
26
 
27
- You can follow [Polars tutorials](https://pola-rs.github.io/polars-book/user-guide/) and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems.
27
+ You can follow [Polars tutorials](https://docs.pola.rs/user-guide/getting-started/) and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems.
28
28
 
29
29
  ## Reference
30
30
 
@@ -88,6 +88,15 @@ From Avro
88
88
  Polars.read_avro("file.avro")
89
89
  ```
90
90
 
91
+ From Delta Lake (requires [deltalake-rb](https://github.com/ankane/delta-ruby)) [experimental, unreleased]
92
+
93
+ ```ruby
94
+ Polars.read_delta("./table")
95
+
96
+ # or lazily with
97
+ Polars.scan_delta("./table")
98
+ ```
99
+
91
100
  From a hash
92
101
 
93
102
  ```ruby
@@ -336,6 +345,32 @@ Parquet
336
345
  df.write_parquet("file.parquet")
337
346
  ```
338
347
 
348
+ JSON
349
+
350
+ ```ruby
351
+ df.write_json("file.json")
352
+ # or
353
+ df.write_ndjson("file.ndjson")
354
+ ```
355
+
356
+ Feather / Arrow IPC
357
+
358
+ ```ruby
359
+ df.write_ipc("file.arrow")
360
+ ```
361
+
362
+ Avro
363
+
364
+ ```ruby
365
+ df.write_avro("file.avro")
366
+ ```
367
+
368
+ Delta Lake [experimental, unreleased]
369
+
370
+ ```ruby
371
+ df.write_delta("./table")
372
+ ```
373
+
339
374
  Numo array
340
375
 
341
376
  ```ruby
Binary file
Binary file
Binary file
@@ -831,7 +831,13 @@ module Polars
831
831
  # Compression method. Defaults to "uncompressed".
832
832
  #
833
833
  # @return [nil]
834
- def write_ipc(file, compression: "uncompressed", compat_level: nil)
834
+ def write_ipc(
835
+ file,
836
+ compression: "uncompressed",
837
+ compat_level: nil,
838
+ storage_options: nil,
839
+ retries: 2
840
+ )
835
841
  return_bytes = file.nil?
836
842
  if return_bytes
837
843
  file = StringIO.new
@@ -849,7 +855,13 @@ module Polars
849
855
  compression = "uncompressed"
850
856
  end
851
857
 
852
- _df.write_ipc(file, compression, compat_level)
858
+ if storage_options&.any?
859
+ storage_options = storage_options.to_a
860
+ else
861
+ storage_options = nil
862
+ end
863
+
864
+ _df.write_ipc(file, compression, compat_level, storage_options, retries)
853
865
  return_bytes ? file.string : nil
854
866
  end
855
867
 
@@ -961,6 +973,61 @@ module Polars
961
973
  )
962
974
  end
963
975
 
976
# Write DataFrame as delta table.
#
# @param target [Object]
#   URI of a table or a DeltaTable object.
# @param mode ["error", "append", "overwrite", "ignore", "merge"]
#   How to handle existing data.
# @param storage_options [Hash]
#   Extra options for the storage backends supported by `deltalake-rb`.
# @param delta_write_options [Hash]
#   Additional keyword arguments while writing a Delta lake Table.
# @param delta_merge_options [Hash]
#   Keyword arguments which are required to `MERGE` a Delta lake Table.
#   Must contain a `:predicate` entry; the hash itself is not modified.
#
# @return [Object]
#   In "merge" mode, whatever `merge` on the Delta table returns;
#   otherwise whatever `DeltaLake.write` returns.
#
# @raise [ArgumentError]
#   If `mode` is "merge" and `delta_merge_options` is nil.
def write_delta(
  target,
  mode: "error",
  storage_options: nil,
  delta_write_options: nil,
  delta_merge_options: nil
)
  Polars.send(:_check_if_delta_available)

  if Utils.pathlike?(target)
    target = Polars.send(:_resolve_delta_lake_uri, target.to_s, strict: false)
  end

  data = self

  if mode == "merge"
    if delta_merge_options.nil?
      msg = "You need to pass delta_merge_options with at least a given predicate for `MERGE` to work."
      raise ArgumentError, msg
    end

    # A string target is opened as a table; anything else is used as the table itself.
    dt =
      if target.is_a?(::String)
        DeltaLake::Table.new(target, storage_options: storage_options)
      else
        target
      end

    # Work on a copy so the caller's hash keeps its :predicate entry and
    # can be reused for a later call.
    merge_options = delta_merge_options.dup
    predicate = merge_options.delete(:predicate)
    dt.merge(data, predicate, **merge_options)
  else
    delta_write_options ||= {}

    DeltaLake.write(
      target,
      data,
      mode: mode,
      storage_options: storage_options,
      **delta_write_options
    )
  end
end
1030
+
964
1031
  # Return an estimation of the total (heap) allocated size of the DataFrame.
965
1032
  #
966
1033
  # Estimated size is given in the specified unit (bytes by default).
@@ -3939,14 +4006,32 @@ module Polars
3939
4006
  # # ╞═════╪═════╪═════╡
3940
4007
  # # │ 3 ┆ 8 ┆ c │
3941
4008
  # # └─────┴─────┴─────┘
3942
- def max(axis: 0)
3943
- if axis == 0
3944
- lazy.max.collect(_eager: true)
3945
- elsif axis == 1
3946
- Utils.wrap_s(_df.max_horizontal)
3947
- else
3948
- raise ArgumentError, "Axis should be 0 or 1."
3949
- end
4009
def max
  # Build the aggregation on the lazy frame, then collect it eagerly.
  aggregated = lazy.max
  aggregated.collect(_eager: true)
end
4012
+
4013
# Compute the row-wise maximum over all columns.
#
# @return [Series]
#
# @example
#   df = Polars::DataFrame.new(
#     {
#       "foo" => [1, 2, 3],
#       "bar" => [4.0, 5.0, 6.0]
#     }
#   )
#   df.max_horizontal
#   # =>
#   # shape: (3,)
#   # Series: 'max' [f64]
#   # [
#   #         4.0
#   #         5.0
#   #         6.0
#   # ]
def max_horizontal
  # Select a single column named "max" and return it as a Series.
  row_max = F.max_horizontal(F.all)
  selected = select(max: row_max)
  selected.to_series
end
3951
4036
 
3952
4037
  # Aggregate the columns of this DataFrame to their minimum value.
@@ -3971,22 +4056,35 @@ module Polars
3971
4056
  # # ╞═════╪═════╪═════╡
3972
4057
  # # │ 1 ┆ 6 ┆ a │
3973
4058
  # # └─────┴─────┴─────┘
3974
- def min(axis: 0)
3975
- if axis == 0
3976
- lazy.min.collect(_eager: true)
3977
- elsif axis == 1
3978
- Utils.wrap_s(_df.min_horizontal)
3979
- else
3980
- raise ArgumentError, "Axis should be 0 or 1."
3981
- end
4059
def min
  # Build the aggregation on the lazy frame, then collect it eagerly.
  aggregated = lazy.min
  aggregated.collect(_eager: true)
end
3983
4062
 
3984
- # Aggregate the columns of this DataFrame to their sum value.
4063
# Compute the row-wise minimum over all columns.
#
# @return [Series]
#
# @example
#   df = Polars::DataFrame.new(
#     {
#       "foo" => [1, 2, 3],
#       "bar" => [4.0, 5.0, 6.0]
#     }
#   )
#   df.min_horizontal
#   # =>
#   # shape: (3,)
#   # Series: 'min' [f64]
#   # [
#   #         1.0
#   #         2.0
#   #         3.0
#   # ]
def min_horizontal
  # Select a single column named "min" and return it as a Series.
  row_min = F.min_horizontal(F.all)
  selected = select(min: row_min)
  selected.to_series
end
4086
+
4087
+ # Aggregate the columns of this DataFrame to their sum value.
3990
4088
  #
3991
4089
  # @return [DataFrame]
3992
4090
  #
@@ -4008,35 +4106,42 @@ module Polars
4008
4106
  # # ╞═════╪═════╪══════╡
4009
4107
  # # │ 6 ┆ 21 ┆ null │
4010
4108
  # # └─────┴─────┴──────┘
4109
def sum
  # Build the aggregation on the lazy frame, then collect it eagerly.
  aggregated = lazy.sum
  aggregated.collect(_eager: true)
end
4112
+
4113
+ # Sum all values horizontally across columns.
4114
+ #
4115
+ # @param ignore_nulls [Boolean]
4116
+ # Ignore null values (default).
4117
+ # If set to `false`, any null value in the input will lead to a null output.
4118
+ #
4119
+ # @return [Series]
4011
4120
  #
4012
4121
  # @example
4013
- # df.sum(axis: 1)
4122
+ # df = Polars::DataFrame.new(
4123
+ # {
4124
+ # "foo" => [1, 2, 3],
4125
+ # "bar" => [4.0, 5.0, 6.0]
4126
+ # }
4127
+ # )
4128
+ # df.sum_horizontal
4014
4129
  # # =>
4015
4130
  # # shape: (3,)
4016
- # # Series: 'foo' [str]
4131
+ # # Series: 'sum' [f64]
4017
4132
  # # [
4018
- # # "16a"
4019
- # # "27b"
4020
- # # "38c"
4133
+ # # 5.0
4134
+ # # 7.0
4135
+ # # 9.0
4021
4136
  # # ]
4022
- def sum(axis: 0, null_strategy: "ignore")
4023
- case axis
4024
- when 0
4025
- lazy.sum.collect(_eager: true)
4026
- when 1
4027
- Utils.wrap_s(_df.sum_horizontal(null_strategy))
4028
- else
4029
- raise ArgumentError, "Axis should be 0 or 1."
4030
- end
4137
+ def sum_horizontal(ignore_nulls: true)
4138
+ select(
4139
+ sum: F.sum_horizontal(F.all, ignore_nulls: ignore_nulls)
4140
+ ).to_series
4031
4141
  end
4032
4142
 
4033
4143
  # Aggregate the columns of this DataFrame to their mean value.
4034
4144
  #
4035
- # @param axis [Integer]
4036
- # Either 0 or 1.
4037
- # @param null_strategy ["ignore", "propagate"]
4038
- # This argument is only used if axis == 1.
4039
- #
4040
4145
  # @return [DataFrame]
4041
4146
  #
4042
4147
  # @example
@@ -4057,15 +4162,38 @@ module Polars
4057
4162
  # # ╞═════╪═════╪══════╡
4058
4163
  # # │ 2.0 ┆ 7.0 ┆ null │
4059
4164
  # # └─────┴─────┴──────┘
4060
- def mean(axis: 0, null_strategy: "ignore")
4061
- case axis
4062
- when 0
4063
- lazy.mean.collect(_eager: true)
4064
- when 1
4065
- Utils.wrap_s(_df.mean_horizontal(null_strategy))
4066
- else
4067
- raise ArgumentError, "Axis should be 0 or 1."
4068
- end
4165
def mean
  # Build the aggregation on the lazy frame, then collect it eagerly.
  aggregated = lazy.mean
  aggregated.collect(_eager: true)
end
4168
+
4169
# Compute the row-wise mean over all columns.
#
# @param ignore_nulls [Boolean]
#   Ignore null values (default).
#   If set to `false`, any null value in the input will lead to a null output.
#
# @return [Series]
#
# @example
#   df = Polars::DataFrame.new(
#     {
#       "foo" => [1, 2, 3],
#       "bar" => [4.0, 5.0, 6.0]
#     }
#   )
#   df.mean_horizontal
#   # =>
#   # shape: (3,)
#   # Series: 'mean' [f64]
#   # [
#   #         2.5
#   #         3.5
#   #         4.5
#   # ]
def mean_horizontal(ignore_nulls: true)
  # Select a single column named "mean" and return it as a Series.
  row_mean = F.mean_horizontal(F.all, ignore_nulls: ignore_nulls)
  selected = select(mean: row_mean)
  selected.to_series
end
4070
4198
 
4071
4199
  # Aggregate the columns of this DataFrame to their standard deviation value.
@@ -311,7 +311,7 @@ module Polars
311
311
  end
312
312
 
313
313
  if categories.empty?
314
- self.categories = Series.new("category", [], dtype: String)
314
+ @categories = Series.new("category", [], dtype: String)
315
315
  return
316
316
  end
317
317
 
@@ -143,6 +143,9 @@ module Polars
143
143
  # @param exprs [Array]
144
144
  # Column(s) to use in the aggregation. Accepts expression input. Strings are
145
145
  # parsed as column names, other non-expression inputs are parsed as literals.
146
+ # @param ignore_nulls [Boolean]
147
+ # Ignore null values (default).
148
+ # If set to `false`, any null value in the input will lead to a null output.
146
149
  #
147
150
  # @return [Expr]
148
151
  #
@@ -166,9 +169,9 @@ module Polars
166
169
  # # │ 8 ┆ 5 ┆ y ┆ 13 │
167
170
  # # │ 3 ┆ null ┆ z ┆ 3 │
168
171
  # # └─────┴──────┴─────┴─────┘
169
- def sum_horizontal(*exprs)
172
+ def sum_horizontal(*exprs, ignore_nulls: true)
170
173
  rbexprs = Utils.parse_into_list_of_expressions(*exprs)
171
- Utils.wrap_expr(Plr.sum_horizontal(rbexprs))
174
+ Utils.wrap_expr(Plr.sum_horizontal(rbexprs, ignore_nulls))
172
175
  end
173
176
 
174
177
  # Compute the mean of all values horizontally across columns.
@@ -176,6 +179,9 @@ module Polars
176
179
  # @param exprs [Array]
177
180
  # Column(s) to use in the aggregation. Accepts expression input. Strings are
178
181
  # parsed as column names, other non-expression inputs are parsed as literals.
182
+ # @param ignore_nulls [Boolean]
183
+ # Ignore null values (default).
184
+ # If set to `false`, any null value in the input will lead to a null output.
179
185
  #
180
186
  # @return [Expr]
181
187
  #
@@ -199,9 +205,9 @@ module Polars
199
205
  # # │ 8 ┆ 5 ┆ y ┆ 6.5 │
200
206
  # # │ 3 ┆ null ┆ z ┆ 3.0 │
201
207
  # # └─────┴──────┴─────┴──────┘
202
- def mean_horizontal(*exprs)
208
+ def mean_horizontal(*exprs, ignore_nulls: true)
203
209
  rbexprs = Utils.parse_into_list_of_expressions(*exprs)
204
- Utils.wrap_expr(Plr.mean_horizontal(rbexprs))
210
+ Utils.wrap_expr(Plr.mean_horizontal(rbexprs, ignore_nulls))
205
211
  end
206
212
 
207
213
  # Cumulatively sum all values horizontally across columns.
@@ -729,16 +729,20 @@ module Polars
729
729
  a,
730
730
  b,
731
731
  method: "pearson",
732
- ddof: 1,
732
+ ddof: nil,
733
733
  propagate_nans: false
734
734
  )
735
+ if !ddof.nil?
736
+ warn "The `ddof` parameter has no effect. Do not use it."
737
+ end
738
+
735
739
  a = Utils.parse_into_expression(a)
736
740
  b = Utils.parse_into_expression(b)
737
741
 
738
742
  if method == "pearson"
739
- Utils.wrap_expr(Plr.pearson_corr(a, b, ddof))
743
+ Utils.wrap_expr(Plr.pearson_corr(a, b))
740
744
  elsif method == "spearman"
741
- Utils.wrap_expr(Plr.spearman_rank_corr(a, b, ddof, propagate_nans))
745
+ Utils.wrap_expr(Plr.spearman_rank_corr(a, b, propagate_nans))
742
746
  else
743
747
  msg = "method must be one of {{'pearson', 'spearman'}}, got #{method}"
744
748
  raise ArgumentError, msg
@@ -0,0 +1,126 @@
1
module Polars
  module IO
    # Reads into a DataFrame from a Delta lake table.
    #
    # @param source [Object]
    #   DeltaTable or a Path or URI to the root of the Delta lake table.
    # @param version [Object]
    #   Numerical version or timestamp version of the Delta lake table.
    # @param columns [Array]
    #   Columns to select. Accepts a list of column names.
    # @param rechunk [Boolean]
    #   Make sure that all columns are contiguous in memory by
    #   aggregating the chunks into a single array.
    # @param storage_options [Hash]
    #   Extra options for the storage backends supported by `deltalake-rb`.
    # @param delta_table_options [Hash]
    #   Additional keyword arguments while reading a Delta lake Table.
    #
    # @return [DataFrame]
    def read_delta(
      source,
      version: nil,
      columns: nil,
      rechunk: false,
      storage_options: nil,
      delta_table_options: nil
    )
      dl_tbl =
        _get_delta_lake_table(
          source,
          version: version,
          storage_options: storage_options,
          delta_table_options: delta_table_options
        )

      dl_tbl.to_polars(columns: columns, rechunk: rechunk)
    end

    # Lazily read from a Delta lake table.
    #
    # @param source [Object]
    #   DeltaTable or a Path or URI to the root of the Delta lake table.
    # @param version [Object]
    #   Numerical version or timestamp version of the Delta lake table.
    # @param storage_options [Hash]
    #   Extra options for the storage backends supported by `deltalake-rb`.
    # @param delta_table_options [Hash]
    #   Additional keyword arguments while reading a Delta lake Table.
    #
    # @return [LazyFrame]
    def scan_delta(
      source,
      version: nil,
      storage_options: nil,
      delta_table_options: nil
    )
      dl_tbl =
        _get_delta_lake_table(
          source,
          version: version,
          storage_options: storage_options,
          delta_table_options: delta_table_options
        )

      dl_tbl.to_polars(eager: false)
    end

    private

    # Returns the URI to hand to DeltaLake for `table_uri`.
    #
    # `strict` is accepted for interface compatibility but is not used here.
    def _resolve_delta_lake_uri(table_uri, strict: true)
      require "uri"

      parsed_result = URI(table_uri)

      # NOTE(review): Ruby's URI#scheme is nil (not "") for scheme-less input
      # such as "./table", so this branch looks like it is never taken and
      # local paths are passed through unchanged. Confirm how
      # Utils.normalize_filepath treats directories before switching the
      # check to `scheme.nil?`.
      if parsed_result.scheme == ""
        Utils.normalize_filepath(table_uri)
      else
        table_uri
      end
    end

    # Builds (or passes through) the DeltaLake::Table for `table_path`.
    #
    # An existing DeltaLake::Table is returned as-is. Otherwise a table is
    # opened at the resolved URI with `storage_options` and
    # `delta_table_options`. A String or Time `version` is applied with
    # `load_as_version` after opening; any other `version` (including nil)
    # is passed to the constructor.
    def _get_delta_lake_table(
      table_path,
      version: nil,
      storage_options: nil,
      delta_table_options: nil
    )
      _check_if_delta_available

      return table_path if table_path.is_a?(DeltaLake::Table)

      delta_table_options ||= {}
      resolved_uri = _resolve_delta_lake_uri(table_path)

      if version.is_a?(::String) || version.is_a?(::Time)
        dl_tbl =
          DeltaLake::Table.new(
            resolved_uri,
            storage_options: storage_options,
            **delta_table_options
          )
        dl_tbl.load_as_version(version)
      else
        dl_tbl =
          DeltaLake::Table.new(
            resolved_uri,
            version: version,
            storage_options: storage_options,
            **delta_table_options
          )
      end

      # Return the table configured above. Re-opening it from the bare path
      # here would drop the version, storage options and table options.
      dl_tbl
    end

    # Raises unless the DeltaLake constant has been loaded.
    def _check_if_delta_available
      if !defined?(DeltaLake)
        raise Error, "Delta Lake not available"
      end
    end
  end
end
@@ -431,7 +431,9 @@ module Polars
431
431
  projection_pushdown: true,
432
432
  simplify_expression: true,
433
433
  no_optimization: false,
434
- slice_pushdown: true
434
+ slice_pushdown: true,
435
+ storage_options: nil,
436
+ retries: 2
435
437
  )
436
438
  lf = _set_sink_optimizations(
437
439
  type_coercion: type_coercion,
@@ -460,6 +462,12 @@ module Polars
460
462
  }
461
463
  end
462
464
 
465
+ if storage_options&.any?
466
+ storage_options = storage_options.to_a
467
+ else
468
+ storage_options = nil
469
+ end
470
+
463
471
  lf.sink_parquet(
464
472
  path,
465
473
  compression,
@@ -467,7 +475,9 @@ module Polars
467
475
  statistics,
468
476
  row_group_size,
469
477
  data_pagesize_limit,
470
- maintain_order
478
+ maintain_order,
479
+ storage_options,
480
+ retries
471
481
  )
472
482
  end
473
483
 
@@ -512,6 +522,10 @@ module Polars
512
522
  slice_pushdown: true,
513
523
  no_optimization: false
514
524
  )
525
+ # TODO support storage options in Rust
526
+ storage_options = nil
527
+ retries = 2
528
+
515
529
  lf = _set_sink_optimizations(
516
530
  type_coercion: type_coercion,
517
531
  predicate_pushdown: predicate_pushdown,
@@ -521,10 +535,18 @@ module Polars
521
535
  no_optimization: no_optimization
522
536
  )
523
537
 
538
+ if storage_options&.any?
539
+ storage_options = storage_options.to_a
540
+ else
541
+ storage_options = nil
542
+ end
543
+
524
544
  lf.sink_ipc(
525
545
  path,
526
546
  compression,
527
- maintain_order
547
+ maintain_order,
548
+ storage_options,
549
+ retries
528
550
  )
529
551
  end
530
552
 
@@ -692,7 +714,9 @@ module Polars
692
714
  projection_pushdown: true,
693
715
  simplify_expression: true,
694
716
  slice_pushdown: true,
695
- no_optimization: false
717
+ no_optimization: false,
718
+ storage_options: nil,
719
+ retries: 2
696
720
  )
697
721
  lf = _set_sink_optimizations(
698
722
  type_coercion: type_coercion,
@@ -703,7 +727,13 @@ module Polars
703
727
  no_optimization: no_optimization
704
728
  )
705
729
 
706
- lf.sink_json(path, maintain_order)
730
+ if storage_options&.any?
731
+ storage_options = storage_options.to_a
732
+ else
733
+ storage_options = nil
734
+ end
735
+
736
+ lf.sink_json(path, maintain_order, storage_options, retries)
707
737
  end
708
738
 
709
739
  # @private