polars-df 0.19.0-x64-mingw-ucrt → 0.21.0-x64-mingw-ucrt

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +15 -0
  3. data/Cargo.lock +211 -320
  4. data/LICENSE-THIRD-PARTY.txt +1376 -2634
  5. data/LICENSE.txt +1 -1
  6. data/lib/polars/3.2/polars.so +0 -0
  7. data/lib/polars/3.3/polars.so +0 -0
  8. data/lib/polars/3.4/polars.so +0 -0
  9. data/lib/polars/cat_name_space.rb +3 -43
  10. data/lib/polars/catalog/unity/catalog_info.rb +20 -0
  11. data/lib/polars/catalog/unity/column_info.rb +31 -0
  12. data/lib/polars/catalog/unity/namespace_info.rb +21 -0
  13. data/lib/polars/catalog/unity/table_info.rb +50 -0
  14. data/lib/polars/catalog.rb +448 -0
  15. data/lib/polars/convert.rb +10 -0
  16. data/lib/polars/data_frame.rb +151 -30
  17. data/lib/polars/data_types.rb +47 -3
  18. data/lib/polars/exceptions.rb +7 -2
  19. data/lib/polars/expr.rb +48 -39
  20. data/lib/polars/functions/col.rb +6 -5
  21. data/lib/polars/functions/eager.rb +1 -1
  22. data/lib/polars/functions/lazy.rb +114 -15
  23. data/lib/polars/functions/repeat.rb +4 -0
  24. data/lib/polars/io/csv.rb +18 -0
  25. data/lib/polars/io/json.rb +16 -0
  26. data/lib/polars/io/ndjson.rb +13 -0
  27. data/lib/polars/io/parquet.rb +45 -63
  28. data/lib/polars/io/scan_options.rb +47 -0
  29. data/lib/polars/lazy_frame.rb +163 -75
  30. data/lib/polars/list_expr.rb +213 -17
  31. data/lib/polars/list_name_space.rb +121 -8
  32. data/lib/polars/meta_expr.rb +14 -29
  33. data/lib/polars/scan_cast_options.rb +64 -0
  34. data/lib/polars/schema.rb +6 -1
  35. data/lib/polars/selector.rb +138 -0
  36. data/lib/polars/selectors.rb +931 -202
  37. data/lib/polars/series.rb +46 -19
  38. data/lib/polars/string_expr.rb +24 -3
  39. data/lib/polars/string_name_space.rb +12 -1
  40. data/lib/polars/utils/parse.rb +40 -0
  41. data/lib/polars/utils.rb +5 -1
  42. data/lib/polars/version.rb +1 -1
  43. data/lib/polars.rb +8 -0
  44. metadata +10 -2
@@ -458,7 +458,7 @@ module Polars
458
458
  # # └─────┴─────┘
459
459
  def first(*columns)
460
460
  if columns.empty?
461
- return Utils.wrap_expr(Plr.first)
461
+ return cs.first.as_expr
462
462
  end
463
463
 
464
464
  col(*columns).first
@@ -518,7 +518,7 @@ module Polars
518
518
  # # └─────┴─────┘
519
519
  def last(*columns)
520
520
  if columns.empty?
521
- return Utils.wrap_expr(Plr.last)
521
+ return cs.last.as_expr
522
522
  end
523
523
 
524
524
  col(*columns).last
@@ -565,12 +565,8 @@ module Polars
565
565
  # # │ bar ┆ 8 │
566
566
  # # │ baz ┆ 3 │
567
567
  # # └─────┴─────┘
568
- def nth(*indices)
569
- if indices.length == 1 && indices[0].is_a?(Array)
570
- indices = indices[0]
571
- end
572
-
573
- Utils.wrap_expr(Plr.index_cols(indices))
568
+ def nth(*indices, strict: true)
569
+ cs.by_index(*indices, require_all: strict).as_expr
574
570
  end
575
571
 
576
572
  # Get the first `n` rows.
@@ -675,12 +671,12 @@ module Polars
675
671
  # Column name or Expression.
676
672
  # @param b [Object]
677
673
  # Column name or Expression.
674
+ # @param method ["pearson", "spearman"]
675
+ # Correlation method.
678
676
  # @param ddof [Integer]
679
677
  # "Delta Degrees of Freedom": the divisor used in the calculation is N - ddof,
680
678
  # where N represents the number of elements.
681
679
  # By default ddof is 1.
682
- # @param method ["pearson", "spearman"]
683
- # Correlation method.
684
680
  # @param propagate_nans [Boolean]
685
681
  # If `true` any `NaN` encountered will lead to `NaN` in the output.
686
682
  # Defaults to `False` where `NaN` are regarded as larger than any finite number
@@ -795,14 +791,82 @@ module Polars
795
791
  # Accumulate over multiple columns horizontally/row wise with a left fold.
796
792
  #
797
793
  # @return [Expr]
798
- def fold(acc, f, exprs)
794
+ #
795
+ # @example Horizontally sum over all columns and add 1.
796
+ # df = Polars::DataFrame.new(
797
+ # {
798
+ # "a" => [1, 2, 3],
799
+ # "b" => [3, 4, 5],
800
+ # "c" => [5, 6, 7]
801
+ # }
802
+ # )
803
+ # df.select(
804
+ # Polars.fold(
805
+ # Polars.lit(1), ->(acc, x) { acc + x }, Polars.col("*")
806
+ # ).alias("sum")
807
+ # )
808
+ # # =>
809
+ # # shape: (3, 1)
810
+ # # ┌─────┐
811
+ # # │ sum │
812
+ # # │ --- │
813
+ # # │ i64 │
814
+ # # ╞═════╡
815
+ # # │ 10 │
816
+ # # │ 13 │
817
+ # # │ 16 │
818
+ # # └─────┘
819
+ #
820
+ # @example You can also apply a condition/predicate on all columns:
821
+ # df = Polars::DataFrame.new(
822
+ # {
823
+ # "a" => [1, 2, 3],
824
+ # "b" => [0, 1, 2]
825
+ # }
826
+ # )
827
+ # df.filter(
828
+ # Polars.fold(
829
+ # Polars.lit(true),
830
+ # ->(acc, x) { acc & x },
831
+ # Polars.col("*") > 1
832
+ # )
833
+ # )
834
+ # # =>
835
+ # # shape: (1, 2)
836
+ # # ┌─────┬─────┐
837
+ # # │ a ┆ b │
838
+ # # │ --- ┆ --- │
839
+ # # │ i64 ┆ i64 │
840
+ # # ╞═════╪═════╡
841
+ # # │ 3 ┆ 2 │
842
+ # # └─────┴─────┘
843
+ def fold(
844
+ acc,
845
+ function,
846
+ exprs,
847
+ returns_scalar: false,
848
+ return_dtype: nil
849
+ )
799
850
  acc = Utils.parse_into_expression(acc, str_as_lit: true)
800
851
  if exprs.is_a?(Expr)
801
852
  exprs = [exprs]
802
853
  end
803
854
 
855
+ rt = nil
856
+ if !return_dtype.nil?
857
+ rt = Utils.parse_into_datatype_expr(return_dtype)._rbdatatype_expr
858
+ end
859
+
804
860
  exprs = Utils.parse_into_list_of_expressions(exprs)
805
- Utils.wrap_expr(Plr.fold(acc, f, exprs))
861
+ Utils.wrap_expr(
862
+ Plr.fold(
863
+ acc,
864
+ function,
865
+ exprs,
866
+ returns_scalar,
867
+ rt
868
+ )
869
+ )
806
870
  end
807
871
 
808
872
  # def reduce
@@ -815,11 +879,17 @@ module Polars
815
879
  # @param acc [Object]
816
880
  # Accumulator Expression. This is the value that will be initialized when the fold
817
881
  # starts. For a sum this could for instance be lit(0).
818
- # @param f [Object]
882
+ # @param function [Object]
819
883
  # Function to apply over the accumulator and the value.
820
884
  # Fn(acc, value) -> new_value
821
885
  # @param exprs [Object]
822
886
  # Expressions to aggregate over. May also be a wildcard expression.
887
+ # @param returns_scalar [Boolean]
888
+ # Whether or not `function` applied returns a scalar. This must be set correctly
889
+ # by the user.
890
+ # @param return_dtype [Object]
891
+ # Output datatype.
892
+ # If not set, the dtype will be inferred based on the dtype of the accumulator.
823
893
  # @param include_init [Boolean]
824
894
  # Include the initial accumulator state as struct field.
825
895
  #
@@ -851,14 +921,35 @@ module Polars
851
921
  # # │ 2 ┆ 4 ┆ 6 ┆ {3,7,13} │
852
922
  # # │ 3 ┆ 5 ┆ 7 ┆ {4,9,16} │
853
923
  # # └─────┴─────┴─────┴───────────┘
854
- def cum_fold(acc, f, exprs, include_init: false)
924
+ def cum_fold(
925
+ acc,
926
+ function,
927
+ exprs,
928
+ returns_scalar: false,
929
+ return_dtype: nil,
930
+ include_init: false
931
+ )
855
932
  acc = Utils.parse_into_expression(acc, str_as_lit: true)
856
933
  if exprs.is_a?(Expr)
857
934
  exprs = [exprs]
858
935
  end
859
936
 
937
+ rt = nil
938
+ if !return_dtype.nil?
939
+ rt = Utils.parse_into_datatype_expr(return_dtype)._rbdatatype_expr
940
+ end
941
+
860
942
  exprs = Utils.parse_into_list_of_expressions(exprs)
861
- Utils.wrap_expr(Plr.cum_fold(acc, f, exprs, include_init)._alias("cum_fold"))
943
+ Utils.wrap_expr(
944
+ Plr.cum_fold(
945
+ acc,
946
+ function,
947
+ exprs,
948
+ returns_scalar,
949
+ rt,
950
+ include_init
951
+ )._alias("cum_fold")
952
+ )
862
953
  end
863
954
  alias_method :cumfold, :cum_fold
864
955
 
@@ -1047,8 +1138,16 @@ module Polars
1047
1138
  #
1048
1139
  # @param exprs [Object]
1049
1140
  # Columns use to determine the ordering.
1141
+ # @param more_exprs [Array]
1142
+ # Additional columns to arg sort by, specified as positional arguments.
1050
1143
  # @param reverse [Boolean]
1051
1144
  # Default is ascending.
1145
+ # @param nulls_last [Boolean]
1146
+ # Place null values last.
1147
+ # @param multithreaded [Boolean]
1148
+ # Sort using multiple threads.
1149
+ # @param maintain_order [Boolean]
1150
+ # Whether the order should be maintained if elements are equal.
1052
1151
  #
1053
1152
  # @return [Expr]
1054
1153
  #
@@ -6,6 +6,10 @@ module Polars
6
6
  # Value to repeat.
7
7
  # @param n [Integer]
8
8
  # Repeat `n` times.
9
+ # @param dtype [Object]
10
+ # Data type of the resulting column. If set to `nil` (default), data type is
11
+ # inferred from the given value. Defaults to Int32 for integer values, unless
12
+ # Int64 is required to fit the given value. Defaults to Float64 for float values.
9
13
  # @param eager [Boolean]
10
14
  # Run eagerly and collect into a `Series`.
11
15
  # @param name [String]
data/lib/polars/io/csv.rb CHANGED
@@ -347,6 +347,9 @@ module Polars
347
347
  # - `String`: All values equal to this string will be null.
348
348
  # - `Array`: All values equal to any string in this array will be null.
349
349
  # - `Hash`: A hash that maps column name to a null value string.
350
+ # @param missing_utf8_is_empty_string [Boolean]
351
+ # By default a missing value is considered to be null; if you would prefer missing
352
+ # utf8 values to be treated as the empty string you can set this param true.
350
353
  # @param ignore_errors [Boolean]
351
354
  # Try to keep reading lines if some lines yield errors.
352
355
  # First try `infer_schema_length: 0` to read all columns as
@@ -387,8 +390,13 @@ module Polars
387
390
  # Offset to start the row_count column (only used if the name is set).
388
391
  # @param eol_char [String]
389
392
  # Single byte end of line character.
393
+ # @param raise_if_empty [Boolean]
394
+ # When there is no data in the source, `NoDataError` is raised. If this parameter
395
+ # is set to false, `nil` will be returned from `next_batches(n)` instead.
390
396
  # @param truncate_ragged_lines [Boolean]
391
397
  # Truncate lines that are longer than the schema.
398
+ # @param decimal_comma [Boolean]
399
+ # Parse floats using a comma as the decimal separator instead of a period.
392
400
  #
393
401
  # @return [BatchedCsvReader]
394
402
  #
@@ -503,6 +511,9 @@ module Polars
503
511
  # - `String`: All values equal to this string will be null.
504
512
  # - `Array`: All values equal to any string in this array will be null.
505
513
  # - `Hash`: A hash that maps column name to a null value string.
514
+ # @param missing_utf8_is_empty_string [Boolean]
515
+ # By default a missing value is considered to be null; if you would prefer missing
516
+ # utf8 values to be treated as the empty string you can set this param true.
506
517
  # @param ignore_errors [Boolean]
507
518
  # Try to keep reading lines if some lines yield errors.
508
519
  # First try `infer_schema_length: 0` to read all columns as
@@ -538,8 +549,15 @@ module Polars
538
549
  # the column remains of data type `:str`.
539
550
  # @param eol_char [String]
540
551
  # Single byte end of line character.
552
+ # @param raise_if_empty [Boolean]
553
+ # When there is no data in the source, `NoDataError` is raised. If this parameter
554
+ # is set to false, an empty LazyFrame (with no columns) is returned instead.
541
555
  # @param truncate_ragged_lines [Boolean]
542
556
  # Truncate lines that are longer than the schema.
557
+ # @param decimal_comma [Boolean]
558
+ # Parse floats using a comma as the decimal separator instead of a period.
559
+ # @param glob [Boolean]
560
+ # Expand path given via globbing rules.
543
561
  #
544
562
  # @return [LazyFrame]
545
563
  def scan_csv(
@@ -4,6 +4,22 @@ module Polars
4
4
  #
5
5
  # @param source [Object]
6
6
  # Path to a file or a file-like object.
7
+ # @param schema [Object]
8
+ # The DataFrame schema may be declared in several ways:
9
+ #
10
+ # * As a dict of {name:type} pairs; if type is nil, it will be auto-inferred.
11
+ # * As a list of column names; in this case types are automatically inferred.
12
+ # * As a list of (name,type) pairs; this is equivalent to the dictionary form.
13
+ #
14
+ # If you supply a list of column names that does not match the names in the
15
+ # underlying data, the names given here will overwrite them. The number
16
+ # of names given in the schema should match the underlying data dimensions.
17
+ # @param schema_overrides [Hash]
18
+ # Support type specification or override of one or more columns; note that
19
+ # any dtypes inferred from the schema param will be overridden.
20
+ # @param infer_schema_length [Integer]
21
+ # The maximum number of rows to scan for schema inference.
22
+ # If set to `nil`, the full data may be scanned *(this is slow)*.
7
23
  #
8
24
  # @return [DataFrame]
9
25
  def read_json(
@@ -4,6 +4,19 @@ module Polars
4
4
  #
5
5
  # @param source [Object]
6
6
  # Path to a file or a file-like object.
7
+ # @param schema [Object]
8
+ # The DataFrame schema may be declared in several ways:
9
+ #
10
+ # * As a dict of {name:type} pairs; if type is nil, it will be auto-inferred.
11
+ # * As a list of column names; in this case types are automatically inferred.
12
+ # * As a list of (name,type) pairs; this is equivalent to the dictionary form.
13
+ #
14
+ # If you supply a list of column names that does not match the names in the
15
+ # underlying data, the names given here will overwrite them. The number
16
+ # of names given in the schema should match the underlying data dimensions.
17
+ # @param schema_overrides [Hash]
18
+ # Support type specification or override of one or more columns; note that
19
+ # any dtypes inferred from the schema param will be overridden.
7
20
  #
8
21
  # @return [DataFrame]
9
22
  def read_ndjson(
@@ -49,6 +49,12 @@ module Polars
49
49
  # Number of retries if accessing a cloud instance fails.
50
50
  # @param include_file_paths [String]
51
51
  # Include the path of the source file(s) as a column with this name.
52
+ # @param allow_missing_columns [Boolean]
53
+ # When reading a list of parquet files, if a column existing in the first
54
+ # file cannot be found in subsequent files, the default behavior is to
55
+ # raise an error. However, if `allow_missing_columns` is set to
56
+ # `true`, a full-NULL column is returned instead of erroring for the files
57
+ # that do not contain the column.
52
58
  #
53
59
  # @return [DataFrame]
54
60
  def read_parquet(
@@ -171,6 +177,17 @@ module Polars
171
177
  # Number of retries if accessing a cloud instance fails.
172
178
  # @param include_file_paths [String]
173
179
  # Include the path of the source file(s) as a column with this name.
180
+ # @param allow_missing_columns [Boolean]
181
+ # When reading a list of parquet files, if a column existing in the first
182
+ # file cannot be found in subsequent files, the default behavior is to
183
+ # raise an error. However, if `allow_missing_columns` is set to
184
+ # `true`, a full-NULL column is returned instead of erroring for the files
185
+ # that do not contain the column.
186
+ # @param extra_columns ['ignore', 'raise']
187
+ # Configuration for behavior when extra columns outside of the
188
+ # defined schema are encountered in the data:
189
+ # * `ignore`: Silently ignores.
190
+ # * `raise`: Raises an error.
174
191
  #
175
192
  # @return [LazyFrame]
176
193
  def scan_parquet(
@@ -192,8 +209,11 @@ module Polars
192
209
  credential_provider: nil,
193
210
  retries: 2,
194
211
  include_file_paths: nil,
195
- allow_missing_columns: false
212
+ allow_missing_columns: false,
213
+ extra_columns: "raise"
196
214
  )
215
+ missing_columns = allow_missing_columns ? "insert" : "raise"
216
+
197
217
  if Utils.pathlike?(source)
198
218
  source = Utils.normalize_filepath(source, check_not_directory: false)
199
219
  elsif Utils.is_path_or_str_sequence(source)
@@ -204,56 +224,11 @@ module Polars
204
224
  raise Todo
205
225
  end
206
226
 
207
- _scan_parquet_impl(
208
- source,
209
- n_rows: n_rows,
210
- cache: cache,
211
- parallel: parallel,
212
- rechunk: rechunk,
213
- row_index_name: row_count_name,
214
- row_index_offset: row_count_offset,
215
- storage_options: storage_options,
216
- credential_provider: credential_provider,
217
- low_memory: low_memory,
218
- use_statistics: use_statistics,
219
- hive_partitioning: hive_partitioning,
220
- schema: schema,
221
- hive_schema: hive_schema,
222
- try_parse_hive_dates: try_parse_hive_dates,
223
- retries: retries,
224
- glob: glob,
225
- include_file_paths: include_file_paths,
226
- allow_missing_columns: allow_missing_columns
227
- )
228
- end
229
-
230
- # @private
231
- def _scan_parquet_impl(
232
- source,
233
- n_rows: nil,
234
- cache: true,
235
- parallel: "auto",
236
- rechunk: true,
237
- row_index_name: nil,
238
- row_index_offset: 0,
239
- storage_options: nil,
240
- credential_provider: nil,
241
- low_memory: false,
242
- use_statistics: true,
243
- hive_partitioning: nil,
244
- glob: true,
245
- schema: nil,
246
- hive_schema: nil,
247
- try_parse_hive_dates: true,
248
- retries: 2,
249
- include_file_paths: nil,
250
- allow_missing_columns: false
251
- )
252
227
  if source.is_a?(::Array)
253
228
  sources = source
254
229
  source = nil
255
230
  else
256
- sources = []
231
+ sources = [source]
257
232
  end
258
233
 
259
234
  if storage_options
@@ -262,27 +237,34 @@ module Polars
262
237
  storage_options = nil
263
238
  end
264
239
 
240
+ row_index_name = row_count_name
241
+ row_index_offset = row_count_offset
242
+
265
243
  rblf =
266
244
  RbLazyFrame.new_from_parquet(
267
- source,
268
245
  sources,
269
- n_rows,
270
- cache,
246
+ schema,
247
+ ScanOptions.new(
248
+ row_index: !row_index_name.nil? ? [row_index_name, row_index_offset] : nil,
249
+ pre_slice: !n_rows.nil? ? [0, n_rows] : nil,
250
+ # cast_options: cast_options,
251
+ extra_columns: extra_columns,
252
+ missing_columns: missing_columns,
253
+ include_file_paths: include_file_paths,
254
+ glob: glob,
255
+ hive_partitioning: hive_partitioning,
256
+ hive_schema: hive_schema,
257
+ try_parse_hive_dates: try_parse_hive_dates,
258
+ rechunk: rechunk,
259
+ cache: cache,
260
+ storage_options: storage_options,
261
+ # credential_provider: credential_provider_builder,
262
+ retries: retries,
263
+ # deletion_files: _deletion_files
264
+ ),
271
265
  parallel,
272
- rechunk,
273
- Utils.parse_row_index_args(row_index_name, row_index_offset),
274
266
  low_memory,
275
- storage_options,
276
- credential_provider,
277
- use_statistics,
278
- hive_partitioning,
279
- schema,
280
- hive_schema,
281
- try_parse_hive_dates,
282
- retries,
283
- glob,
284
- include_file_paths,
285
- allow_missing_columns
267
+ use_statistics
286
268
  )
287
269
  Utils.wrap_ldf(rblf)
288
270
  end
@@ -0,0 +1,47 @@
1
+ module Polars
2
+ module IO
3
+ class ScanOptions
4
+ attr_reader :row_index, :pre_slice, :cast_options, :extra_columns, :missing_columns,
5
+ :include_file_paths, :glob, :hive_partitioning, :hive_schema, :try_parse_hive_dates,
6
+ :rechunk, :cache, :storage_options, :credential_provider, :retries, :column_mapping, :deletion_files
7
+
8
+ def initialize(
9
+ row_index: nil,
10
+ pre_slice: nil,
11
+ cast_options: nil,
12
+ extra_columns: "raise",
13
+ missing_columns: "raise",
14
+ include_file_paths: nil,
15
+ glob: true,
16
+ hive_partitioning: nil,
17
+ hive_schema: nil,
18
+ try_parse_hive_dates: true,
19
+ rechunk: false,
20
+ cache: true,
21
+ storage_options: nil,
22
+ credential_provider: nil,
23
+ retries: 2,
24
+ column_mapping: nil,
25
+ deletion_files: nil
26
+ )
27
+ @row_index = row_index
28
+ @pre_slice = pre_slice
29
+ @cast_options = cast_options
30
+ @extra_columns = extra_columns
31
+ @missing_columns = missing_columns
32
+ @include_file_paths = include_file_paths
33
+ @glob = glob
34
+ @hive_partitioning = hive_partitioning
35
+ @hive_schema = hive_schema
36
+ @try_parse_hive_dates = try_parse_hive_dates
37
+ @rechunk = rechunk
38
+ @cache = cache
39
+ @storage_options = storage_options
40
+ @credential_provider = credential_provider
41
+ @retries = retries
42
+ @column_mapping = column_mapping
43
+ @deletion_files = deletion_files
44
+ end
45
+ end
46
+ end
47
+ end