polars-df 0.23.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +127 -1
  3. data/Cargo.lock +72 -58
  4. data/README.md +31 -27
  5. data/ext/polars/Cargo.toml +15 -6
  6. data/ext/polars/src/batched_csv.rs +35 -39
  7. data/ext/polars/src/c_api/allocator.rs +7 -0
  8. data/ext/polars/src/c_api/mod.rs +1 -0
  9. data/ext/polars/src/catalog/unity.rs +123 -101
  10. data/ext/polars/src/conversion/any_value.rs +13 -17
  11. data/ext/polars/src/conversion/chunked_array.rs +5 -5
  12. data/ext/polars/src/conversion/datetime.rs +3 -2
  13. data/ext/polars/src/conversion/mod.rs +50 -45
  14. data/ext/polars/src/dataframe/export.rs +13 -13
  15. data/ext/polars/src/dataframe/general.rs +223 -223
  16. data/ext/polars/src/dataframe/io.rs +27 -141
  17. data/ext/polars/src/dataframe/mod.rs +13 -5
  18. data/ext/polars/src/dataframe/serde.rs +1 -1
  19. data/ext/polars/src/error.rs +44 -7
  20. data/ext/polars/src/exceptions.rs +45 -12
  21. data/ext/polars/src/expr/array.rs +12 -0
  22. data/ext/polars/src/expr/datatype.rs +2 -2
  23. data/ext/polars/src/expr/datetime.rs +4 -5
  24. data/ext/polars/src/expr/general.rs +49 -13
  25. data/ext/polars/src/expr/list.rs +4 -0
  26. data/ext/polars/src/expr/meta.rs +8 -3
  27. data/ext/polars/src/expr/mod.rs +22 -6
  28. data/ext/polars/src/expr/name.rs +19 -8
  29. data/ext/polars/src/expr/rolling.rs +50 -1
  30. data/ext/polars/src/expr/string.rs +0 -1
  31. data/ext/polars/src/expr/struct.rs +7 -2
  32. data/ext/polars/src/file.rs +136 -103
  33. data/ext/polars/src/functions/aggregation.rs +9 -8
  34. data/ext/polars/src/functions/io.rs +81 -10
  35. data/ext/polars/src/functions/lazy.rs +95 -21
  36. data/ext/polars/src/functions/mod.rs +2 -0
  37. data/ext/polars/src/functions/range.rs +19 -3
  38. data/ext/polars/src/functions/strings.rs +6 -0
  39. data/ext/polars/src/functions/utils.rs +6 -0
  40. data/ext/polars/src/interop/arrow/mod.rs +50 -1
  41. data/ext/polars/src/interop/arrow/{to_ruby.rs → to_rb.rs} +30 -0
  42. data/ext/polars/src/interop/arrow/to_rust.rs +43 -0
  43. data/ext/polars/src/interop/numo/to_numo_df.rs +1 -1
  44. data/ext/polars/src/interop/numo/to_numo_series.rs +1 -1
  45. data/ext/polars/src/lazyframe/exitable.rs +39 -0
  46. data/ext/polars/src/lazyframe/general.rs +340 -236
  47. data/ext/polars/src/lazyframe/mod.rs +46 -10
  48. data/ext/polars/src/lazyframe/optflags.rs +5 -4
  49. data/ext/polars/src/lazyframe/serde.rs +11 -3
  50. data/ext/polars/src/lazyframe/sink.rs +10 -5
  51. data/ext/polars/src/lazygroupby.rs +6 -7
  52. data/ext/polars/src/lib.rs +141 -76
  53. data/ext/polars/src/map/dataframe.rs +12 -12
  54. data/ext/polars/src/map/lazy.rs +7 -5
  55. data/ext/polars/src/map/mod.rs +15 -8
  56. data/ext/polars/src/map/series.rs +3 -3
  57. data/ext/polars/src/on_startup.rs +16 -8
  58. data/ext/polars/src/prelude.rs +1 -0
  59. data/ext/polars/src/rb_modules.rs +19 -49
  60. data/ext/polars/src/series/aggregation.rs +79 -140
  61. data/ext/polars/src/series/arithmetic.rs +16 -22
  62. data/ext/polars/src/series/comparison.rs +101 -222
  63. data/ext/polars/src/series/construction.rs +17 -18
  64. data/ext/polars/src/series/export.rs +1 -1
  65. data/ext/polars/src/series/general.rs +254 -289
  66. data/ext/polars/src/series/import.rs +17 -0
  67. data/ext/polars/src/series/map.rs +178 -160
  68. data/ext/polars/src/series/mod.rs +28 -12
  69. data/ext/polars/src/series/scatter.rs +12 -9
  70. data/ext/polars/src/sql.rs +16 -9
  71. data/ext/polars/src/testing/frame.rs +31 -0
  72. data/ext/polars/src/testing/mod.rs +5 -0
  73. data/ext/polars/src/testing/series.rs +31 -0
  74. data/ext/polars/src/timeout.rs +105 -0
  75. data/ext/polars/src/utils.rs +159 -1
  76. data/lib/polars/array_expr.rb +81 -12
  77. data/lib/polars/array_name_space.rb +74 -7
  78. data/lib/polars/batched_csv_reader.rb +21 -21
  79. data/lib/polars/binary_name_space.rb +1 -1
  80. data/lib/polars/cat_expr.rb +7 -7
  81. data/lib/polars/config.rb +1 -1
  82. data/lib/polars/convert.rb +189 -34
  83. data/lib/polars/data_frame.rb +1066 -831
  84. data/lib/polars/data_frame_plot.rb +173 -0
  85. data/lib/polars/data_type_group.rb +1 -0
  86. data/lib/polars/data_types.rb +31 -12
  87. data/lib/polars/date_time_expr.rb +51 -69
  88. data/lib/polars/date_time_name_space.rb +80 -112
  89. data/lib/polars/dynamic_group_by.rb +7 -7
  90. data/lib/polars/exceptions.rb +50 -10
  91. data/lib/polars/expr.rb +470 -517
  92. data/lib/polars/functions/aggregation/horizontal.rb +0 -1
  93. data/lib/polars/functions/aggregation/vertical.rb +2 -3
  94. data/lib/polars/functions/as_datatype.rb +290 -8
  95. data/lib/polars/functions/eager.rb +204 -10
  96. data/lib/polars/functions/escape_regex.rb +21 -0
  97. data/lib/polars/functions/lazy.rb +409 -169
  98. data/lib/polars/functions/lit.rb +17 -1
  99. data/lib/polars/functions/range/int_range.rb +74 -2
  100. data/lib/polars/functions/range/linear_space.rb +77 -0
  101. data/lib/polars/functions/range/time_range.rb +1 -1
  102. data/lib/polars/functions/repeat.rb +3 -12
  103. data/lib/polars/functions/whenthen.rb +2 -2
  104. data/lib/polars/group_by.rb +72 -20
  105. data/lib/polars/iceberg_dataset.rb +1 -6
  106. data/lib/polars/in_process_query.rb +37 -0
  107. data/lib/polars/io/cloud.rb +18 -0
  108. data/lib/polars/io/csv.rb +265 -126
  109. data/lib/polars/io/database.rb +0 -1
  110. data/lib/polars/io/delta.rb +15 -7
  111. data/lib/polars/io/ipc.rb +24 -17
  112. data/lib/polars/io/ndjson.rb +161 -24
  113. data/lib/polars/io/parquet.rb +101 -38
  114. data/lib/polars/lazy_frame.rb +849 -558
  115. data/lib/polars/lazy_group_by.rb +327 -2
  116. data/lib/polars/list_expr.rb +94 -16
  117. data/lib/polars/list_name_space.rb +88 -24
  118. data/lib/polars/meta_expr.rb +42 -1
  119. data/lib/polars/name_expr.rb +41 -4
  120. data/lib/polars/query_opt_flags.rb +198 -2
  121. data/lib/polars/rolling_group_by.rb +3 -3
  122. data/lib/polars/schema.rb +21 -3
  123. data/lib/polars/selector.rb +37 -2
  124. data/lib/polars/selectors.rb +45 -9
  125. data/lib/polars/series.rb +1156 -728
  126. data/lib/polars/series_plot.rb +72 -0
  127. data/lib/polars/slice.rb +1 -1
  128. data/lib/polars/sql_context.rb +11 -4
  129. data/lib/polars/string_expr.rb +59 -68
  130. data/lib/polars/string_name_space.rb +51 -87
  131. data/lib/polars/struct_expr.rb +36 -18
  132. data/lib/polars/testing.rb +24 -273
  133. data/lib/polars/utils/constants.rb +2 -0
  134. data/lib/polars/utils/construction/data_frame.rb +410 -0
  135. data/lib/polars/utils/construction/series.rb +364 -0
  136. data/lib/polars/utils/construction/utils.rb +9 -0
  137. data/lib/polars/utils/deprecation.rb +11 -0
  138. data/lib/polars/utils/serde.rb +8 -3
  139. data/lib/polars/utils/unstable.rb +19 -0
  140. data/lib/polars/utils/various.rb +59 -0
  141. data/lib/polars/utils.rb +46 -47
  142. data/lib/polars/version.rb +1 -1
  143. data/lib/polars.rb +47 -1
  144. metadata +25 -6
  145. data/ext/polars/src/allocator.rs +0 -13
  146. data/lib/polars/plot.rb +0 -109
@@ -9,10 +9,10 @@ module Polars
9
9
  # of column names.
10
10
  # @param n_rows [Integer]
11
11
  # Stop reading from parquet file after reading `n_rows`.
12
- # @param row_count_name [String]
12
+ # @param row_index_name [String]
13
13
  # If not nil, this will insert a row count column with the given name into the
14
14
  # DataFrame.
15
- # @param row_count_offset [Integer]
15
+ # @param row_index_offset [Integer]
16
16
  # Offset to start the row index column (only use if the name is set).
17
17
  # @param parallel ["auto", "columns", "row_groups", "none"]
18
18
  # This determines the direction of parallelism. 'auto' will try to determine the
@@ -49,6 +49,12 @@ module Polars
49
49
  # Number of retries if accessing a cloud instance fails.
50
50
  # @param include_file_paths [String]
51
51
  # Include the path of the source file(s) as a column with this name.
52
+ # @param missing_columns ['insert', 'raise']
53
+ # Configuration for behavior when columns defined in the schema
54
+ # are missing from the data:
55
+ #
56
+ # * `insert`: Inserts the missing columns using NULLs as the row values.
57
+ # * `raise`: Raises an error.
52
58
  # @param allow_missing_columns [Boolean]
53
59
  # When reading a list of parquet files, if a column existing in the first
54
60
  # file cannot be found in subsequent files, the default behavior is to
@@ -61,8 +67,8 @@ module Polars
61
67
  source,
62
68
  columns: nil,
63
69
  n_rows: nil,
64
- row_count_name: nil,
65
- row_count_offset: 0,
70
+ row_index_name: nil,
71
+ row_index_offset: 0,
66
72
  parallel: "auto",
67
73
  use_statistics: true,
68
74
  hive_partitioning: nil,
@@ -73,17 +79,18 @@ module Polars
73
79
  rechunk: false,
74
80
  low_memory: false,
75
81
  storage_options: nil,
76
- credential_provider: nil,
82
+ credential_provider: "auto",
77
83
  retries: 2,
78
84
  include_file_paths: nil,
79
- allow_missing_columns: false
85
+ missing_columns: "raise",
86
+ allow_missing_columns: nil
80
87
  )
81
88
  lf =
82
89
  scan_parquet(
83
90
  source,
84
91
  n_rows: n_rows,
85
- row_count_name: row_count_name,
86
- row_count_offset: row_count_offset,
92
+ row_index_name: row_index_name,
93
+ row_index_offset: row_index_offset,
87
94
  parallel: parallel,
88
95
  use_statistics: use_statistics,
89
96
  hive_partitioning: hive_partitioning,
@@ -98,6 +105,7 @@ module Polars
98
105
  retries: retries,
99
106
  glob: glob,
100
107
  include_file_paths: include_file_paths,
108
+ missing_columns: missing_columns,
101
109
  allow_missing_columns: allow_missing_columns
102
110
  )
103
111
 
@@ -134,14 +142,40 @@ module Polars
134
142
  #
135
143
  # @param source [Object]
136
144
  # Path to a file or a file-like object.
145
+ # @param storage_options [Hash]
146
+ # Extra options that make sense for a particular storage connection.
147
+ # @param credential_provider [Object]
148
+ # Provide a function that can be called to provide cloud storage
149
+ # credentials. The function is expected to return a hash of
150
+ # credential keys along with an optional credential expiry time.
151
+ # @param retries [Integer]
152
+ # Number of retries if accessing a cloud instance fails.
137
153
  #
138
154
  # @return [Hash]
139
- def read_parquet_metadata(source)
155
+ def read_parquet_metadata(
156
+ source,
157
+ storage_options: nil,
158
+ credential_provider: "auto",
159
+ retries: 2
160
+ )
161
+ if storage_options
162
+ raise Todo
163
+ end
164
+
140
165
  if Utils.pathlike?(source)
141
166
  source = Utils.normalize_filepath(source, check_not_directory: false)
142
167
  end
143
168
 
144
- Plr.read_parquet_metadata(source)
169
+ credential_provider_builder = _init_credential_provider_builder(
170
+ credential_provider, source, storage_options, "scan_parquet"
171
+ )
172
+
173
+ Plr.read_parquet_metadata(
174
+ source,
175
+ storage_options&.any? ? storage_options.map { |k, v| [k.to_s, v.to_s] } : nil,
176
+ credential_provider_builder,
177
+ retries
178
+ )
145
179
  end
146
180
 
147
181
  # Lazily read from a parquet file or multiple files via glob patterns.
@@ -153,10 +187,10 @@ module Polars
153
187
  # Path to a file or a file-like object.
154
188
  # @param n_rows [Integer]
155
189
  # Stop reading from parquet file after reading `n_rows`.
156
- # @param row_count_name [String]
190
+ # @param row_index_name [String]
157
191
  # If not nil, this will insert a row count column with the given name into the
158
192
  # DataFrame.
159
- # @param row_count_offset [Integer]
193
+ # @param row_index_offset [Integer]
160
194
  # Offset to start the row index column (only use if the name is set).
161
195
  # @param parallel ["auto", "columns", "row_groups", "none"]
162
196
  # This determines the direction of parallelism. 'auto' will try to determine the
@@ -169,6 +203,8 @@ module Polars
169
203
  # to prune reads.
170
204
  # @param glob [Boolean]
171
205
  # Expand path given via globbing rules.
206
+ # @param hidden_file_prefix [String, Array<String>]
207
+ # Skip reading files whose names begin with the specified prefixes.
172
208
  # @param schema [Object]
173
209
  # Specify the datatypes of the columns. The datatypes must match the
174
210
  # datatypes in the file(s). If there are extra columns that are not in the
@@ -195,6 +231,12 @@ module Polars
195
231
  # Number of retries if accessing a cloud instance fails.
196
232
  # @param include_file_paths [String]
197
233
  # Include the path of the source file(s) as a column with this name.
234
+ # @param missing_columns ['insert', 'raise']
235
+ # Configuration for behavior when columns defined in the schema
236
+ # are missing from the data:
237
+ #
238
+ # * `insert`: Inserts the missing columns using NULLs as the row values.
239
+ # * `raise`: Raises an error.
198
240
  # @param allow_missing_columns [Boolean]
199
241
  # When reading a list of parquet files, if a column existing in the first
200
242
  # file cannot be found in subsequent files, the default behavior is to
@@ -214,12 +256,13 @@ module Polars
214
256
  def scan_parquet(
215
257
  source,
216
258
  n_rows: nil,
217
- row_count_name: nil,
218
- row_count_offset: 0,
259
+ row_index_name: nil,
260
+ row_index_offset: 0,
219
261
  parallel: "auto",
220
262
  use_statistics: true,
221
263
  hive_partitioning: nil,
222
264
  glob: true,
265
+ hidden_file_prefix: nil,
223
266
  schema: nil,
224
267
  hive_schema: nil,
225
268
  try_parse_hive_dates: true,
@@ -227,42 +270,58 @@ module Polars
227
270
  low_memory: false,
228
271
  cache: true,
229
272
  storage_options: nil,
230
- credential_provider: nil,
273
+ credential_provider: "auto",
231
274
  retries: 2,
232
275
  include_file_paths: nil,
233
- allow_missing_columns: false,
276
+ missing_columns: "raise",
277
+ allow_missing_columns: nil,
234
278
  extra_columns: "raise",
235
279
  cast_options: nil,
236
280
  _column_mapping: nil,
237
- _deletion_files: nil
281
+ _default_values: nil,
282
+ _deletion_files: nil,
283
+ _table_statistics: nil,
284
+ _row_count: nil
238
285
  )
239
- missing_columns = allow_missing_columns ? "insert" : "raise"
286
+ if !schema.nil?
287
+ msg = "the `schema` parameter of `scan_parquet` is considered unstable."
288
+ Utils.issue_unstable_warning(msg)
289
+ end
240
290
 
241
- if Utils.pathlike?(source)
242
- source = Utils.normalize_filepath(source, check_not_directory: false)
243
- elsif Utils.is_path_or_str_sequence(source)
244
- source = source.map { |s| Utils.normalize_filepath(s, check_not_directory: false) }
291
+ if !hive_schema.nil?
292
+ msg = "the `hive_schema` parameter of `scan_parquet` is considered unstable."
293
+ Utils.issue_unstable_warning(msg)
245
294
  end
246
295
 
247
- if credential_provider
248
- raise Todo
296
+ if !cast_options.nil?
297
+ msg = "The `cast_options` parameter of `scan_parquet` is considered unstable."
298
+ Utils.issue_unstable_warning(msg)
249
299
  end
250
300
 
251
- if source.is_a?(::Array)
252
- sources = source
253
- source = nil
254
- else
255
- sources = [source]
301
+ if !hidden_file_prefix.nil?
302
+ msg = "The `hidden_file_prefix` parameter of `scan_parquet` is considered unstable."
303
+ Utils.issue_unstable_warning(msg)
256
304
  end
257
305
 
258
- if storage_options
259
- storage_options = storage_options.map { |k, v| [k.to_s, v.to_s] }
260
- else
261
- storage_options = nil
306
+ if !allow_missing_columns.nil?
307
+ Utils.issue_deprecation_warning(
308
+ "the parameter `allow_missing_columns` for `scan_parquet` is deprecated. " +
309
+ "Use the parameter `missing_columns` instead and pass one of " +
310
+ "`('insert', 'raise')`."
311
+ )
312
+
313
+ missing_columns = allow_missing_columns ? "insert" : "raise"
262
314
  end
263
315
 
264
- row_index_name = row_count_name
265
- row_index_offset = row_count_offset
316
+ sources = get_sources(source)
317
+
318
+ credential_provider_builder =
319
+ _init_credential_provider_builder(
320
+ credential_provider,
321
+ sources,
322
+ storage_options,
323
+ "scan_parquet"
324
+ )
266
325
 
267
326
  rblf =
268
327
  RbLazyFrame.new_from_parquet(
@@ -276,16 +335,20 @@ module Polars
276
335
  missing_columns: missing_columns,
277
336
  include_file_paths: include_file_paths,
278
337
  glob: glob,
338
+ hidden_file_prefix: hidden_file_prefix.is_a?(::String) ? [hidden_file_prefix] : hidden_file_prefix,
279
339
  hive_partitioning: hive_partitioning,
280
340
  hive_schema: hive_schema,
281
341
  try_parse_hive_dates: try_parse_hive_dates,
282
342
  rechunk: rechunk,
283
343
  cache: cache,
284
- storage_options: storage_options,
285
- # credential_provider: credential_provider_builder,
344
+ storage_options: storage_options ? storage_options.map { |k, v| [k.to_s, v.to_s] } : nil,
345
+ credential_provider: credential_provider_builder,
286
346
  retries: retries,
347
+ column_mapping: _column_mapping,
348
+ default_values: _default_values,
287
349
  deletion_files: _deletion_files,
288
- column_mapping: _column_mapping
350
+ table_statistics: _table_statistics,
351
+ row_count: _row_count
289
352
  ),
290
353
  parallel,
291
354
  low_memory,