polars-df 0.14.0-x64-mingw-ucrt → 0.15.0-x64-mingw-ucrt

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +25 -0
  3. data/Cargo.lock +1296 -283
  4. data/LICENSE-THIRD-PARTY.txt +24727 -13877
  5. data/LICENSE.txt +1 -0
  6. data/README.md +1 -2
  7. data/lib/polars/3.1/polars.so +0 -0
  8. data/lib/polars/3.2/polars.so +0 -0
  9. data/lib/polars/3.3/polars.so +0 -0
  10. data/lib/polars/batched_csv_reader.rb +0 -2
  11. data/lib/polars/binary_expr.rb +133 -9
  12. data/lib/polars/binary_name_space.rb +101 -6
  13. data/lib/polars/config.rb +4 -0
  14. data/lib/polars/data_frame.rb +275 -52
  15. data/lib/polars/data_type_group.rb +28 -0
  16. data/lib/polars/data_types.rb +2 -0
  17. data/lib/polars/date_time_expr.rb +244 -0
  18. data/lib/polars/date_time_name_space.rb +87 -0
  19. data/lib/polars/expr.rb +103 -2
  20. data/lib/polars/functions/as_datatype.rb +51 -2
  21. data/lib/polars/functions/col.rb +1 -1
  22. data/lib/polars/functions/eager.rb +1 -3
  23. data/lib/polars/functions/lazy.rb +88 -10
  24. data/lib/polars/functions/range/time_range.rb +21 -21
  25. data/lib/polars/io/csv.rb +14 -16
  26. data/lib/polars/io/database.rb +2 -2
  27. data/lib/polars/io/ipc.rb +14 -4
  28. data/lib/polars/io/ndjson.rb +10 -0
  29. data/lib/polars/io/parquet.rb +168 -111
  30. data/lib/polars/lazy_frame.rb +649 -15
  31. data/lib/polars/list_name_space.rb +169 -0
  32. data/lib/polars/selectors.rb +1144 -0
  33. data/lib/polars/series.rb +465 -35
  34. data/lib/polars/string_cache.rb +27 -1
  35. data/lib/polars/string_expr.rb +0 -1
  36. data/lib/polars/string_name_space.rb +73 -3
  37. data/lib/polars/struct_name_space.rb +31 -7
  38. data/lib/polars/utils/various.rb +5 -1
  39. data/lib/polars/utils.rb +45 -10
  40. data/lib/polars/version.rb +1 -1
  41. data/lib/polars.rb +2 -1
  42. metadata +4 -3
  43. data/lib/polars/functions.rb +0 -57
data/lib/polars/io/parquet.rb

@@ -2,120 +2,108 @@ module Polars
   module IO
     # Read into a DataFrame from a parquet file.
     #
-    # @param source [String, Pathname, StringIO]
+    # @param source [Object]
     #   Path to a file or a file-like object.
     # @param columns [Object]
     #   Columns to select. Accepts a list of column indices (starting at zero) or a list
     #   of column names.
     # @param n_rows [Integer]
     #   Stop reading from parquet file after reading `n_rows`.
-    # @param storage_options [Hash]
-    #   Extra options that make sense for a particular storage connection.
-    # @param parallel ["auto", "columns", "row_groups", "none"]
-    #   This determines the direction of parallelism. 'auto' will try to determine the
-    #   optimal direction.
     # @param row_count_name [String]
     #   If not nil, this will insert a row count column with give name into the
     #   DataFrame.
     # @param row_count_offset [Integer]
     #   Offset to start the row_count column (only use if the name is set).
-    # @param low_memory [Boolean]
-    #   Reduce memory pressure at the expense of performance.
+    # @param parallel ["auto", "columns", "row_groups", "none"]
+    #   This determines the direction of parallelism. 'auto' will try to determine the
+    #   optimal direction.
     # @param use_statistics [Boolean]
     #   Use statistics in the parquet to determine if pages
     #   can be skipped from reading.
+    # @param hive_partitioning [Boolean]
+    #   Infer statistics and schema from hive partitioned URL and use them
+    #   to prune reads.
+    # @param glob [Boolean]
+    #   Expand path given via globbing rules.
+    # @param schema [Object]
+    #   Specify the datatypes of the columns. The datatypes must match the
+    #   datatypes in the file(s). If there are extra columns that are not in the
+    #   file(s), consider also enabling `allow_missing_columns`.
+    # @param hive_schema [Object]
+    #   The column names and data types of the columns by which the data is partitioned.
+    #   If set to `nil` (default), the schema of the Hive partitions is inferred.
+    # @param try_parse_hive_dates [Boolean]
+    #   Whether to try parsing hive values as date/datetime types.
     # @param rechunk [Boolean]
-    #   Make sure that all columns are contiguous in memory by
-    #   aggregating the chunks into a single array.
+    #   In case of reading multiple files via a glob pattern rechunk the final DataFrame
+    #   into contiguous memory chunks.
+    # @param low_memory [Boolean]
+    #   Reduce memory pressure at the expense of performance.
+    # @param storage_options [Hash]
+    #   Extra options that make sense for a particular storage connection.
+    # @param credential_provider [Object]
+    #   Provide a function that can be called to provide cloud storage
+    #   credentials. The function is expected to return a dictionary of
+    #   credential keys along with an optional credential expiry time.
+    # @param retries [Integer]
+    #   Number of retries if accessing a cloud instance fails.
+    # @param include_file_paths [String]
+    #   Include the path of the source file(s) as a column with this name.
     #
     # @return [DataFrame]
-    #
-    # @note
-    #   This operation defaults to a `rechunk` operation at the end, meaning that
-    #   all data will be stored continuously in memory.
-    #   Set `rechunk: false` if you are benchmarking the parquet-reader. A `rechunk` is
-    #   an expensive operation.
     def read_parquet(
       source,
       columns: nil,
       n_rows: nil,
-      storage_options: nil,
-      parallel: "auto",
       row_count_name: nil,
       row_count_offset: 0,
-      low_memory: false,
+      parallel: "auto",
       use_statistics: true,
-      rechunk: true
+      hive_partitioning: nil,
+      glob: true,
+      schema: nil,
+      hive_schema: nil,
+      try_parse_hive_dates: true,
+      rechunk: false,
+      low_memory: false,
+      storage_options: nil,
+      credential_provider: nil,
+      retries: 2,
+      include_file_paths: nil,
+      allow_missing_columns: false
     )
-      _prepare_file_arg(source) do |data|
-        _read_parquet_impl(
-          data,
-          columns: columns,
+      lf =
+        scan_parquet(
+          source,
           n_rows: n_rows,
-          parallel: parallel,
           row_count_name: row_count_name,
           row_count_offset: row_count_offset,
-          low_memory: low_memory,
+          parallel: parallel,
           use_statistics: use_statistics,
-          rechunk: rechunk
+          hive_partitioning: hive_partitioning,
+          schema: schema,
+          hive_schema: hive_schema,
+          try_parse_hive_dates: try_parse_hive_dates,
+          rechunk: rechunk,
+          low_memory: low_memory,
+          cache: false,
+          storage_options: storage_options,
+          credential_provider: credential_provider,
+          retries: retries,
+          glob: glob,
+          include_file_paths: include_file_paths,
+          allow_missing_columns: allow_missing_columns
         )
-      end
-    end
 
-    # @private
-    def _read_parquet_impl(
-      source,
-      columns: nil,
-      n_rows: nil,
-      parallel: "auto",
-      row_count_name: nil,
-      row_count_offset: 0,
-      low_memory: false,
-      use_statistics: true,
-      rechunk: true
-    )
-      if Utils.pathlike?(source)
-        source = Utils.normalize_filepath(source)
-      end
-      if columns.is_a?(::String)
-        columns = [columns]
-      end
-
-      if source.is_a?(::String) && source.include?("*") && Utils.local_file?(source)
-        scan =
-          scan_parquet(
-            source,
-            n_rows: n_rows,
-            rechunk: true,
-            parallel: parallel,
-            row_count_name: row_count_name,
-            row_count_offset: row_count_offset,
-            low_memory: low_memory
-          )
-
-        if columns.nil?
-          return scan.collect
-        elsif Utils.is_str_sequence(columns, allow_str: false)
-          return scan.select(columns).collect
+      if !columns.nil?
+        if Utils.is_int_sequence(columns)
+          lf = lf.select(F.nth(columns))
         else
-          raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: Array[String]"
+          lf = lf.select(columns)
         end
       end
 
-      projection, columns = Utils.handle_projection_columns(columns)
-      rbdf =
-        RbDataFrame.read_parquet(
-          source,
-          columns,
-          projection,
-          n_rows,
-          parallel,
-          Utils.parse_row_index_args(row_count_name, row_count_offset),
-          low_memory,
-          use_statistics,
-          rechunk
-        )
-      Utils.wrap_df(rbdf)
+      lf.collect
     end
 
     # Get a schema of the Parquet file without reading data.
@@ -137,46 +125,83 @@ module Polars
     # This allows the query optimizer to push down predicates and projections to the scan
     # level, thereby potentially reducing memory overhead.
     #
-    # @param source [String]
-    #   Path to a file.
+    # @param source [Object]
+    #   Path to a file or a file-like object.
     # @param n_rows [Integer]
     #   Stop reading from parquet file after reading `n_rows`.
-    # @param cache [Boolean]
-    #   Cache the result after reading.
+    # @param row_count_name [String]
+    #   If not nil, this will insert a row count column with give name into the
+    #   DataFrame.
+    # @param row_count_offset [Integer]
+    #   Offset to start the row_count column (only use if the name is set).
     # @param parallel ["auto", "columns", "row_groups", "none"]
     #   This determines the direction of parallelism. 'auto' will try to determine the
     #   optimal direction.
+    # @param use_statistics [Boolean]
+    #   Use statistics in the parquet to determine if pages
+    #   can be skipped from reading.
+    # @param hive_partitioning [Boolean]
+    #   Infer statistics and schema from hive partitioned URL and use them
+    #   to prune reads.
+    # @param glob [Boolean]
+    #   Expand path given via globbing rules.
+    # @param schema [Object]
+    #   Specify the datatypes of the columns. The datatypes must match the
+    #   datatypes in the file(s). If there are extra columns that are not in the
+    #   file(s), consider also enabling `allow_missing_columns`.
+    # @param hive_schema [Object]
+    #   The column names and data types of the columns by which the data is partitioned.
+    #   If set to `nil` (default), the schema of the Hive partitions is inferred.
+    # @param try_parse_hive_dates [Boolean]
+    #   Whether to try parsing hive values as date/datetime types.
     # @param rechunk [Boolean]
     #   In case of reading multiple files via a glob pattern rechunk the final DataFrame
     #   into contiguous memory chunks.
-    # @param row_count_name [String]
-    #   If not nil, this will insert a row count column with give name into the
-    #   DataFrame.
-    # @param row_count_offset [Integer]
-    #   Offset to start the row_count column (only use if the name is set).
-    # @param storage_options [Hash]
-    #   Extra options that make sense for a particular storage connection.
     # @param low_memory [Boolean]
     #   Reduce memory pressure at the expense of performance.
+    # @param cache [Boolean]
+    #   Cache the result after reading.
+    # @param storage_options [Hash]
+    #   Extra options that make sense for a particular storage connection.
+    # @param credential_provider [Object]
+    #   Provide a function that can be called to provide cloud storage
+    #   credentials. The function is expected to return a dictionary of
+    #   credential keys along with an optional credential expiry time.
+    # @param retries [Integer]
+    #   Number of retries if accessing a cloud instance fails.
     # @param include_file_paths [String]
-    #   Include the path of the source file(s) as a column with this name.
+    #   Include the path of the source file(s) as a column with this name.
     #
     # @return [LazyFrame]
     def scan_parquet(
       source,
       n_rows: nil,
-      cache: true,
-      parallel: "auto",
-      glob: true,
-      rechunk: true,
       row_count_name: nil,
       row_count_offset: 0,
-      storage_options: nil,
+      parallel: "auto",
+      use_statistics: true,
+      hive_partitioning: nil,
+      glob: true,
+      schema: nil,
+      hive_schema: nil,
+      try_parse_hive_dates: true,
+      rechunk: false,
       low_memory: false,
-      include_file_paths: nil
+      cache: true,
+      storage_options: nil,
+      credential_provider: nil,
+      retries: 2,
+      include_file_paths: nil,
+      allow_missing_columns: false
     )
       if Utils.pathlike?(source)
-        source = Utils.normalize_filepath(source)
+        source = Utils.normalize_filepath(source, check_not_directory: false)
+      elsif Utils.is_path_or_str_sequence(source)
+        source = source.map { |s| Utils.normalize_filepath(s, check_not_directory: false) }
+      end
+
+      if credential_provider
+        raise Todo
       end
 
       _scan_parquet_impl(
@@ -185,47 +210,79 @@ module Polars
         cache: cache,
         parallel: parallel,
         rechunk: rechunk,
-        row_count_name: row_count_name,
-        row_count_offset: row_count_offset,
+        row_index_name: row_count_name,
+        row_index_offset: row_count_offset,
         storage_options: storage_options,
+        credential_provider: credential_provider,
         low_memory: low_memory,
+        use_statistics: use_statistics,
+        hive_partitioning: hive_partitioning,
+        schema: schema,
+        hive_schema: hive_schema,
+        try_parse_hive_dates: try_parse_hive_dates,
+        retries: retries,
         glob: glob,
-        include_file_paths: include_file_paths
+        include_file_paths: include_file_paths,
+        allow_missing_columns: allow_missing_columns
       )
     end
 
     # @private
     def _scan_parquet_impl(
-      file,
+      source,
       n_rows: nil,
       cache: true,
       parallel: "auto",
       rechunk: true,
-      row_count_name: nil,
-      row_count_offset: 0,
+      row_index_name: nil,
+      row_index_offset: 0,
       storage_options: nil,
+      credential_provider: nil,
       low_memory: false,
       use_statistics: true,
       hive_partitioning: nil,
       glob: true,
-      include_file_paths: nil
+      schema: nil,
+      hive_schema: nil,
+      try_parse_hive_dates: true,
+      retries: 2,
+      include_file_paths: nil,
+      allow_missing_columns: false
     )
+      if source.is_a?(::Array)
+        sources = source
+        source = nil
+      else
+        sources = []
+      end
+
+      if storage_options
+        storage_options = storage_options.map { |k, v| [k.to_s, v.to_s] }
+      else
+        storage_options = nil
+      end
+
       rblf =
         RbLazyFrame.new_from_parquet(
-          file,
-          [],
+          source,
+          sources,
           n_rows,
           cache,
           parallel,
           rechunk,
-          Utils.parse_row_index_args(row_count_name, row_count_offset),
+          Utils.parse_row_index_args(row_index_name, row_index_offset),
           low_memory,
+          storage_options,
+          credential_provider,
           use_statistics,
           hive_partitioning,
-          nil,
-          true,
+          schema,
+          hive_schema,
+          try_parse_hive_dates,
+          retries,
           glob,
-          include_file_paths
+          include_file_paths,
+          allow_missing_columns
         )
       Utils.wrap_ldf(rblf)
     end
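
In short, `read_parquet` no longer routes through a private eager implementation: it now builds a `scan_parquet` query and collects it, so the scan-level options (hive partitioning, schema overrides, cloud storage settings, `include_file_paths`) are also available to the eager reader, and `rechunk` now defaults to `false`. A minimal usage sketch based on the new signatures above; the file paths and column names are placeholders, not part of the package:

require "polars"

# Eager read through the new code path: read_parquet delegates to scan_parquet
# and collects the result. The hive/glob keywords shown here are the options
# added in 0.15.0; "data/*.parquet" and the column names are hypothetical.
df = Polars.read_parquet(
  "data/*.parquet",
  glob: true,                  # expand the glob pattern into multiple files
  hive_partitioning: true,     # infer partition columns from the directory layout
  try_parse_hive_dates: true,  # parse partition values as date/datetime where possible
  include_file_paths: "path",  # add a column recording each row's source file
  rechunk: false               # note the new default: no rechunk after reading
)

# Lazy scan with the same options; projections and predicates are pushed down
# to the scan before collect.
lf = Polars.scan_parquet("data/*.parquet", parallel: "auto", retries: 2)
df2 = lf.select(["a", "b"]).collect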