polars-df 0.13.0-x86_64-linux-musl → 0.15.0-x86_64-linux-musl

Files changed (43)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +30 -0
  3. data/Cargo.lock +1368 -319
  4. data/LICENSE-THIRD-PARTY.txt +24801 -13447
  5. data/LICENSE.txt +1 -0
  6. data/README.md +1 -2
  7. data/lib/polars/3.1/polars.so +0 -0
  8. data/lib/polars/3.2/polars.so +0 -0
  9. data/lib/polars/3.3/polars.so +0 -0
  10. data/lib/polars/batched_csv_reader.rb +0 -2
  11. data/lib/polars/binary_expr.rb +133 -9
  12. data/lib/polars/binary_name_space.rb +101 -6
  13. data/lib/polars/config.rb +4 -0
  14. data/lib/polars/data_frame.rb +285 -62
  15. data/lib/polars/data_type_group.rb +28 -0
  16. data/lib/polars/data_types.rb +2 -0
  17. data/lib/polars/date_time_expr.rb +244 -0
  18. data/lib/polars/date_time_name_space.rb +87 -0
  19. data/lib/polars/expr.rb +109 -8
  20. data/lib/polars/functions/as_datatype.rb +51 -2
  21. data/lib/polars/functions/col.rb +1 -1
  22. data/lib/polars/functions/eager.rb +1 -3
  23. data/lib/polars/functions/lazy.rb +88 -10
  24. data/lib/polars/functions/range/time_range.rb +21 -21
  25. data/lib/polars/io/csv.rb +14 -16
  26. data/lib/polars/io/database.rb +2 -2
  27. data/lib/polars/io/ipc.rb +14 -12
  28. data/lib/polars/io/ndjson.rb +10 -0
  29. data/lib/polars/io/parquet.rb +168 -111
  30. data/lib/polars/lazy_frame.rb +649 -15
  31. data/lib/polars/list_name_space.rb +169 -0
  32. data/lib/polars/selectors.rb +1144 -0
  33. data/lib/polars/series.rb +470 -40
  34. data/lib/polars/string_cache.rb +27 -1
  35. data/lib/polars/string_expr.rb +0 -1
  36. data/lib/polars/string_name_space.rb +73 -3
  37. data/lib/polars/struct_name_space.rb +31 -7
  38. data/lib/polars/utils/various.rb +5 -1
  39. data/lib/polars/utils.rb +45 -10
  40. data/lib/polars/version.rb +1 -1
  41. data/lib/polars.rb +2 -1
  42. metadata +4 -3
  43. data/lib/polars/functions.rb +0 -57
data/lib/polars/io/parquet.rb

@@ -2,120 +2,108 @@ module Polars
   module IO
     # Read into a DataFrame from a parquet file.
     #
-    # @param source [String, Pathname, StringIO]
+    # @param source [Object]
     #   Path to a file or a file-like object.
     # @param columns [Object]
     #   Columns to select. Accepts a list of column indices (starting at zero) or a list
     #   of column names.
     # @param n_rows [Integer]
     #   Stop reading from parquet file after reading `n_rows`.
-    # @param storage_options [Hash]
-    #   Extra options that make sense for a particular storage connection.
-    # @param parallel ["auto", "columns", "row_groups", "none"]
-    #   This determines the direction of parallelism. 'auto' will try to determine the
-    #   optimal direction.
     # @param row_count_name [String]
    #   If not nil, this will insert a row count column with give name into the
     #   DataFrame.
     # @param row_count_offset [Integer]
     #   Offset to start the row_count column (only use if the name is set).
-    # @param low_memory [Boolean]
-    #   Reduce memory pressure at the expense of performance.
+    # @param parallel ["auto", "columns", "row_groups", "none"]
+    #   This determines the direction of parallelism. 'auto' will try to determine the
+    #   optimal direction.
     # @param use_statistics [Boolean]
     #   Use statistics in the parquet to determine if pages
     #   can be skipped from reading.
+    # @param hive_partitioning [Boolean]
+    #   Infer statistics and schema from hive partitioned URL and use them
+    #   to prune reads.
+    # @param glob [Boolean]
+    #   Expand path given via globbing rules.
+    # @param schema [Object]
+    #   Specify the datatypes of the columns. The datatypes must match the
+    #   datatypes in the file(s). If there are extra columns that are not in the
+    #   file(s), consider also enabling `allow_missing_columns`.
+    # @param hive_schema [Object]
+    #   The column names and data types of the columns by which the data is partitioned.
+    #   If set to `nil` (default), the schema of the Hive partitions is inferred.
+    # @param try_parse_hive_dates [Boolean]
+    #   Whether to try parsing hive values as date/datetime types.
     # @param rechunk [Boolean]
-    #   Make sure that all columns are contiguous in memory by
-    #   aggregating the chunks into a single array.
+    #   In case of reading multiple files via a glob pattern rechunk the final DataFrame
+    #   into contiguous memory chunks.
+    # @param low_memory [Boolean]
+    #   Reduce memory pressure at the expense of performance.
+    # @param storage_options [Hash]
+    #   Extra options that make sense for a particular storage connection.
+    # @param credential_provider [Object]
+    #   Provide a function that can be called to provide cloud storage
+    #   credentials. The function is expected to return a dictionary of
+    #   credential keys along with an optional credential expiry time.
+    # @param retries [Integer]
+    #   Number of retries if accessing a cloud instance fails.
+    # @param include_file_paths [String]
+    #   Include the path of the source file(s) as a column with this name.
     #
     # @return [DataFrame]
-    #
-    # @note
-    #   This operation defaults to a `rechunk` operation at the end, meaning that
-    #   all data will be stored continuously in memory.
-    #   Set `rechunk: false` if you are benchmarking the parquet-reader. A `rechunk` is
-    #   an expensive operation.
     def read_parquet(
       source,
       columns: nil,
       n_rows: nil,
-      storage_options: nil,
-      parallel: "auto",
       row_count_name: nil,
       row_count_offset: 0,
-      low_memory: false,
+      parallel: "auto",
       use_statistics: true,
-      rechunk: true
+      hive_partitioning: nil,
+      glob: true,
+      schema: nil,
+      hive_schema: nil,
+      try_parse_hive_dates: true,
+      rechunk: false,
+      low_memory: false,
+      storage_options: nil,
+      credential_provider: nil,
+      retries: 2,
+      include_file_paths: nil,
+      allow_missing_columns: false
     )
-      _prepare_file_arg(source) do |data|
-        _read_parquet_impl(
-          data,
-          columns: columns,
+      lf =
+        scan_parquet(
+          source,
           n_rows: n_rows,
-          parallel: parallel,
           row_count_name: row_count_name,
           row_count_offset: row_count_offset,
-          low_memory: low_memory,
+          parallel: parallel,
           use_statistics: use_statistics,
-          rechunk: rechunk
+          hive_partitioning: hive_partitioning,
+          schema: schema,
+          hive_schema: hive_schema,
+          try_parse_hive_dates: try_parse_hive_dates,
+          rechunk: rechunk,
+          low_memory: low_memory,
+          cache: false,
+          storage_options: storage_options,
+          credential_provider: credential_provider,
+          retries: retries,
+          glob: glob,
+          include_file_paths: include_file_paths,
+          allow_missing_columns: allow_missing_columns
         )
-      end
-    end

-    # @private
-    def _read_parquet_impl(
-      source,
-      columns: nil,
-      n_rows: nil,
-      parallel: "auto",
-      row_count_name: nil,
-      row_count_offset: 0,
-      low_memory: false,
-      use_statistics: true,
-      rechunk: true
-    )
-      if Utils.pathlike?(source)
-        source = Utils.normalize_filepath(source)
-      end
-      if columns.is_a?(::String)
-        columns = [columns]
-      end
-
-      if source.is_a?(::String) && source.include?("*") && Utils.local_file?(source)
-        scan =
-          scan_parquet(
-            source,
-            n_rows: n_rows,
-            rechunk: true,
-            parallel: parallel,
-            row_count_name: row_count_name,
-            row_count_offset: row_count_offset,
-            low_memory: low_memory
-          )
-
-        if columns.nil?
-          return scan.collect
-        elsif Utils.is_str_sequence(columns, allow_str: false)
-          return scan.select(columns).collect
+      if !columns.nil?
+        if Utils.is_int_sequence(columns)
+          lf = lf.select(F.nth(columns))
         else
-          raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: Array[String]"
+          lf = lf.select(columns)
         end
       end

-      projection, columns = Utils.handle_projection_columns(columns)
-      rbdf =
-        RbDataFrame.read_parquet(
-          source,
-          columns,
-          projection,
-          n_rows,
-          parallel,
-          Utils.parse_row_index_args(row_count_name, row_count_offset),
-          low_memory,
-          use_statistics,
-          rechunk
-        )
-      Utils.wrap_df(rbdf)
+      lf.collect
     end

     # Get a schema of the Parquet file without reading data.
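
As a quick illustration of the new eager path (a sketch only; the local path and column names are hypothetical, and `Polars.read_parquet` is assumed to be the module-level wrapper that delegates to this IO method):

# read_parquet now builds a lazy scan and collects it; `columns:` accepts
# column names or zero-based indices (integers are routed through F.nth).
df = Polars.read_parquet(
  "data/events.parquet",        # hypothetical local file
  columns: ["user_id", "ts"],   # or [0, 2] for positional selection
  rechunk: false                # new default; pass true to force contiguous chunks
)
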
@@ -137,46 +125,83 @@ module Polars
     # This allows the query optimizer to push down predicates and projections to the scan
     # level, thereby potentially reducing memory overhead.
     #
-    # @param source [String]
-    #   Path to a file.
+    # @param source [Object]
+    #   Path to a file or a file-like object.
     # @param n_rows [Integer]
     #   Stop reading from parquet file after reading `n_rows`.
-    # @param cache [Boolean]
-    #   Cache the result after reading.
+    # @param row_count_name [String]
+    #   If not nil, this will insert a row count column with give name into the
+    #   DataFrame.
+    # @param row_count_offset [Integer]
+    #   Offset to start the row_count column (only use if the name is set).
     # @param parallel ["auto", "columns", "row_groups", "none"]
     #   This determines the direction of parallelism. 'auto' will try to determine the
     #   optimal direction.
+    # @param use_statistics [Boolean]
+    #   Use statistics in the parquet to determine if pages
+    #   can be skipped from reading.
+    # @param hive_partitioning [Boolean]
+    #   Infer statistics and schema from hive partitioned URL and use them
+    #   to prune reads.
+    # @param glob [Boolean]
+    #   Expand path given via globbing rules.
+    # @param schema [Object]
+    #   Specify the datatypes of the columns. The datatypes must match the
+    #   datatypes in the file(s). If there are extra columns that are not in the
+    #   file(s), consider also enabling `allow_missing_columns`.
+    # @param hive_schema [Object]
+    #   The column names and data types of the columns by which the data is partitioned.
+    #   If set to `nil` (default), the schema of the Hive partitions is inferred.
+    # @param try_parse_hive_dates [Boolean]
+    #   Whether to try parsing hive values as date/datetime types.
     # @param rechunk [Boolean]
     #   In case of reading multiple files via a glob pattern rechunk the final DataFrame
     #   into contiguous memory chunks.
-    # @param row_count_name [String]
-    #   If not nil, this will insert a row count column with give name into the
-    #   DataFrame.
-    # @param row_count_offset [Integer]
-    #   Offset to start the row_count column (only use if the name is set).
-    # @param storage_options [Hash]
-    #   Extra options that make sense for a particular storage connection.
     # @param low_memory [Boolean]
     #   Reduce memory pressure at the expense of performance.
+    # @param cache [Boolean]
+    #   Cache the result after reading.
+    # @param storage_options [Hash]
+    #   Extra options that make sense for a particular storage connection.
+    # @param credential_provider [Object]
+    #   Provide a function that can be called to provide cloud storage
+    #   credentials. The function is expected to return a dictionary of
+    #   credential keys along with an optional credential expiry time.
+    # @param retries [Integer]
+    #   Number of retries if accessing a cloud instance fails.
     # @param include_file_paths [String]
-    #   Include the path of the source file(s) as a column with this name.
+    #   Include the path of the source file(s) as a column with this name.
     #
     # @return [LazyFrame]
     def scan_parquet(
       source,
       n_rows: nil,
-      cache: true,
-      parallel: "auto",
-      glob: true,
-      rechunk: true,
       row_count_name: nil,
       row_count_offset: 0,
-      storage_options: nil,
+      parallel: "auto",
+      use_statistics: true,
+      hive_partitioning: nil,
+      glob: true,
+      schema: nil,
+      hive_schema: nil,
+      try_parse_hive_dates: true,
+      rechunk: false,
       low_memory: false,
-      include_file_paths: nil
+      cache: true,
+      storage_options: nil,
+      credential_provider: nil,
+      retries: 2,
+      include_file_paths: nil,
+      allow_missing_columns: false
     )
       if Utils.pathlike?(source)
-        source = Utils.normalize_filepath(source)
+        source = Utils.normalize_filepath(source, check_not_directory: false)
+      elsif Utils.is_path_or_str_sequence(source)
+        source = source.map { |s| Utils.normalize_filepath(s, check_not_directory: false) }
+      end
+
+      if credential_provider
+        raise Todo
       end

       _scan_parquet_impl(
@@ -185,47 +210,79 @@ module Polars
         cache: cache,
         parallel: parallel,
         rechunk: rechunk,
-        row_count_name: row_count_name,
-        row_count_offset: row_count_offset,
+        row_index_name: row_count_name,
+        row_index_offset: row_count_offset,
         storage_options: storage_options,
+        credential_provider: credential_provider,
         low_memory: low_memory,
+        use_statistics: use_statistics,
+        hive_partitioning: hive_partitioning,
+        schema: schema,
+        hive_schema: hive_schema,
+        try_parse_hive_dates: try_parse_hive_dates,
+        retries: retries,
         glob: glob,
-        include_file_paths: include_file_paths
+        include_file_paths: include_file_paths,
+        allow_missing_columns: allow_missing_columns
       )
     end

     # @private
     def _scan_parquet_impl(
-      file,
+      source,
       n_rows: nil,
       cache: true,
       parallel: "auto",
       rechunk: true,
-      row_count_name: nil,
-      row_count_offset: 0,
+      row_index_name: nil,
+      row_index_offset: 0,
       storage_options: nil,
+      credential_provider: nil,
       low_memory: false,
       use_statistics: true,
       hive_partitioning: nil,
       glob: true,
-      include_file_paths: nil
+      schema: nil,
+      hive_schema: nil,
+      try_parse_hive_dates: true,
+      retries: 2,
+      include_file_paths: nil,
+      allow_missing_columns: false
     )
+      if source.is_a?(::Array)
+        sources = source
+        source = nil
+      else
+        sources = []
+      end
+
+      if storage_options
+        storage_options = storage_options.map { |k, v| [k.to_s, v.to_s] }
+      else
+        storage_options = nil
+      end
+
       rblf =
         RbLazyFrame.new_from_parquet(
-          file,
-          [],
+          source,
+          sources,
           n_rows,
           cache,
           parallel,
           rechunk,
-          Utils.parse_row_index_args(row_count_name, row_count_offset),
+          Utils.parse_row_index_args(row_index_name, row_index_offset),
           low_memory,
+          storage_options,
+          credential_provider,
           use_statistics,
           hive_partitioning,
-          nil,
-          true,
+          schema,
+          hive_schema,
+          try_parse_hive_dates,
+          retries,
           glob,
-          include_file_paths
+          include_file_paths,
+          allow_missing_columns
         )
       Utils.wrap_ldf(rblf)
     end
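
And a sketch of the lazy path with the new hive/cloud options, assuming the module-level `Polars.scan_parquet` wrapper; the glob URL and storage option keys are illustrative only, and `credential_provider` is omitted since it currently raises `Todo`:

# Lazy scan over hive-partitioned files in object storage; work is deferred
# until collect, so predicates and projections can be pushed down to the scan.
lf = Polars.scan_parquet(
  "s3://bucket/events/**/*.parquet",               # hypothetical URL
  hive_partitioning: true,
  try_parse_hive_dates: true,
  allow_missing_columns: true,
  storage_options: {"aws_region" => "us-east-1"},  # hypothetical option keys
  retries: 2,
  include_file_paths: "source_path"
)
df = lf.filter(Polars.col("user_id") > 100).collect
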