polars-df 0.11.0-x86_64-linux-musl → 0.13.0-x86_64-linux-musl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +22 -0
  3. data/Cargo.lock +428 -450
  4. data/LICENSE-THIRD-PARTY.txt +2502 -2242
  5. data/lib/polars/3.1/polars.so +0 -0
  6. data/lib/polars/3.2/polars.so +0 -0
  7. data/lib/polars/3.3/polars.so +0 -0
  8. data/lib/polars/array_expr.rb +4 -4
  9. data/lib/polars/batched_csv_reader.rb +2 -2
  10. data/lib/polars/cat_expr.rb +0 -36
  11. data/lib/polars/cat_name_space.rb +0 -37
  12. data/lib/polars/data_frame.rb +93 -101
  13. data/lib/polars/data_types.rb +1 -1
  14. data/lib/polars/date_time_expr.rb +525 -573
  15. data/lib/polars/date_time_name_space.rb +263 -464
  16. data/lib/polars/dynamic_group_by.rb +3 -3
  17. data/lib/polars/exceptions.rb +3 -0
  18. data/lib/polars/expr.rb +367 -330
  19. data/lib/polars/expr_dispatch.rb +1 -1
  20. data/lib/polars/functions/aggregation/horizontal.rb +8 -8
  21. data/lib/polars/functions/as_datatype.rb +63 -40
  22. data/lib/polars/functions/lazy.rb +63 -14
  23. data/lib/polars/functions/lit.rb +1 -1
  24. data/lib/polars/functions/range/date_range.rb +18 -77
  25. data/lib/polars/functions/range/datetime_range.rb +4 -4
  26. data/lib/polars/functions/range/int_range.rb +2 -2
  27. data/lib/polars/functions/range/time_range.rb +4 -4
  28. data/lib/polars/functions/repeat.rb +1 -1
  29. data/lib/polars/functions/whenthen.rb +1 -1
  30. data/lib/polars/io/csv.rb +8 -8
  31. data/lib/polars/io/ipc.rb +35 -7
  32. data/lib/polars/io/json.rb +13 -2
  33. data/lib/polars/io/ndjson.rb +15 -4
  34. data/lib/polars/io/parquet.rb +15 -8
  35. data/lib/polars/lazy_frame.rb +123 -105
  36. data/lib/polars/lazy_group_by.rb +1 -1
  37. data/lib/polars/list_expr.rb +11 -11
  38. data/lib/polars/list_name_space.rb +5 -1
  39. data/lib/polars/rolling_group_by.rb +5 -7
  40. data/lib/polars/series.rb +108 -191
  41. data/lib/polars/string_expr.rb +51 -76
  42. data/lib/polars/string_name_space.rb +5 -4
  43. data/lib/polars/testing.rb +2 -2
  44. data/lib/polars/utils/constants.rb +9 -0
  45. data/lib/polars/utils/convert.rb +97 -0
  46. data/lib/polars/utils/parse.rb +89 -0
  47. data/lib/polars/utils/various.rb +76 -0
  48. data/lib/polars/utils/wrap.rb +19 -0
  49. data/lib/polars/utils.rb +4 -330
  50. data/lib/polars/version.rb +1 -1
  51. data/lib/polars/whenthen.rb +6 -6
  52. data/lib/polars.rb +11 -0
  53. metadata +7 -2
data/lib/polars/io/csv.rb CHANGED
@@ -104,7 +104,7 @@ module Polars
104
104
  ignore_errors: false,
105
105
  parse_dates: false,
106
106
  n_threads: nil,
107
- infer_schema_length: 100,
107
+ infer_schema_length: N_INFER_DEFAULT,
108
108
  batch_size: 8192,
109
109
  n_rows: nil,
110
110
  encoding: "utf8",
@@ -192,7 +192,7 @@ module Polars
192
192
  ignore_errors: false,
193
193
  parse_dates: false,
194
194
  n_threads: nil,
195
- infer_schema_length: 100,
195
+ infer_schema_length: N_INFER_DEFAULT,
196
196
  batch_size: 8192,
197
197
  n_rows: nil,
198
198
  encoding: "utf8",
@@ -222,7 +222,7 @@ module Polars
222
222
  if !dtypes.nil?
223
223
  if dtypes.is_a?(Hash)
224
224
  dtype_list = []
225
- dtypes.each do|k, v|
225
+ dtypes.each do |k, v|
226
226
  dtype_list << [k, Utils.rb_type_to_dtype(v)]
227
227
  end
228
228
  elsif dtypes.is_a?(::Array)
@@ -304,7 +304,7 @@ module Polars
304
304
  missing_utf8_is_empty_string,
305
305
  parse_dates,
306
306
  skip_rows_after_header,
307
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
307
+ Utils.parse_row_index_args(row_count_name, row_count_offset),
308
308
  sample_size,
309
309
  eol_char,
310
310
  raise_if_empty,
@@ -422,7 +422,7 @@ module Polars
422
422
  ignore_errors: false,
423
423
  parse_dates: false,
424
424
  n_threads: nil,
425
- infer_schema_length: 100,
425
+ infer_schema_length: N_INFER_DEFAULT,
426
426
  batch_size: 50_000,
427
427
  n_rows: nil,
428
428
  encoding: "utf8",
@@ -567,7 +567,7 @@ module Polars
567
567
  ignore_errors: false,
568
568
  cache: true,
569
569
  with_column_names: nil,
570
- infer_schema_length: 100,
570
+ infer_schema_length: N_INFER_DEFAULT,
571
571
  n_rows: nil,
572
572
  encoding: "utf8",
573
573
  low_memory: false,
@@ -629,7 +629,7 @@ module Polars
629
629
  ignore_errors: false,
630
630
  cache: true,
631
631
  with_column_names: nil,
632
- infer_schema_length: 100,
632
+ infer_schema_length: N_INFER_DEFAULT,
633
633
  n_rows: nil,
634
634
  encoding: "utf8",
635
635
  low_memory: false,
@@ -669,7 +669,7 @@ module Polars
669
669
  rechunk,
670
670
  skip_rows_after_header,
671
671
  encoding,
672
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
672
+ Utils.parse_row_index_args(row_count_name, row_count_offset),
673
673
  parse_dates,
674
674
  eol_char,
675
675
  truncate_ragged_lines
data/lib/polars/io/ipc.rb CHANGED
@@ -76,7 +76,7 @@ module Polars
76
76
  columns,
77
77
  projection,
78
78
  n_rows,
79
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
79
+ Utils.parse_row_index_args(row_count_name, row_count_offset),
80
80
  memory_map
81
81
  )
82
82
  Utils.wrap_df(rbdf)
@@ -149,7 +149,7 @@ module Polars
149
149
  columns,
150
150
  projection,
151
151
  n_rows,
152
- Utils._prepare_row_count_args(row_index_name, row_index_offset),
152
+ Utils.parse_row_index_args(row_index_name, row_index_offset),
153
153
  rechunk
154
154
  )
155
155
  Utils.wrap_df(pydf)
@@ -193,6 +193,18 @@ module Polars
193
193
  # Try to memory map the file. This can greatly improve performance on repeated
194
194
  # queries as the OS may cache pages.
195
195
  # Only uncompressed IPC files can be memory mapped.
196
+ # @param hive_partitioning [Boolean]
197
+ # Infer statistics and schema from Hive partitioned URL and use them
198
+ # to prune reads. This is unset by default (i.e. `nil`), meaning it is
199
+ # automatically enabled when a single directory is passed, and otherwise
200
+ # disabled.
201
+ # @param hive_schema [Hash]
202
+ # The column names and data types of the columns by which the data is partitioned.
203
+ # If set to `nil` (default), the schema of the Hive partitions is inferred.
204
+ # @param try_parse_hive_dates [Boolean]
205
+ # Whether to try parsing hive values as date/datetime types.
206
+ # @param include_file_paths [String]
207
+ # Include the path of the source file(s) as a column with this name.
196
208
  #
197
209
  # @return [LazyFrame]
198
210
  def scan_ipc(
@@ -203,7 +215,11 @@ module Polars
203
215
  row_count_name: nil,
204
216
  row_count_offset: 0,
205
217
  storage_options: nil,
206
- memory_map: true
218
+ memory_map: true,
219
+ hive_partitioning: nil,
220
+ hive_schema: nil,
221
+ try_parse_hive_dates: true,
222
+ include_file_paths: nil
207
223
  )
208
224
  _scan_ipc_impl(
209
225
  source,
@@ -213,7 +229,11 @@ module Polars
213
229
  row_count_name: row_count_name,
214
230
  row_count_offset: row_count_offset,
215
231
  storage_options: storage_options,
216
- memory_map: memory_map
232
+ memory_map: memory_map,
233
+ hive_partitioning: hive_partitioning,
234
+ hive_schema: hive_schema,
235
+ try_parse_hive_dates: try_parse_hive_dates,
236
+ include_file_paths: include_file_paths
217
237
  )
218
238
  end
219
239
 
@@ -226,7 +246,11 @@ module Polars
226
246
  row_count_name: nil,
227
247
  row_count_offset: 0,
228
248
  storage_options: nil,
229
- memory_map: true
249
+ memory_map: true,
250
+ hive_partitioning: nil,
251
+ hive_schema: nil,
252
+ try_parse_hive_dates: true,
253
+ include_file_paths: nil
230
254
  )
231
255
  if Utils.pathlike?(file)
232
256
  file = Utils.normalize_filepath(file)
@@ -238,8 +262,12 @@ module Polars
238
262
  n_rows,
239
263
  cache,
240
264
  rechunk,
241
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
242
- memory_map
265
+ Utils.parse_row_index_args(row_count_name, row_count_offset),
266
+ memory_map,
267
+ hive_partitioning,
268
+ hive_schema,
269
+ try_parse_hive_dates,
270
+ include_file_paths
243
271
  )
244
272
  Utils.wrap_ldf(rblf)
245
273
  end
data/lib/polars/io/json.rb CHANGED
@@ -6,12 +6,23 @@ module Polars
6
6
  # Path to a file or a file-like object.
7
7
  #
8
8
  # @return [DataFrame]
9
- def read_json(source)
9
+ def read_json(
10
+ source,
11
+ schema: nil,
12
+ schema_overrides: nil,
13
+ infer_schema_length: N_INFER_DEFAULT
14
+ )
10
15
  if Utils.pathlike?(source)
11
16
  source = Utils.normalize_filepath(source)
12
17
  end
13
18
 
14
- rbdf = RbDataFrame.read_json(source)
19
+ rbdf =
20
+ RbDataFrame.read_json(
21
+ source,
22
+ infer_schema_length,
23
+ schema,
24
+ schema_overrides
25
+ )
15
26
  Utils.wrap_df(rbdf)
16
27
  end
17
28
  end
data/lib/polars/io/ndjson.rb CHANGED
@@ -6,12 +6,23 @@ module Polars
6
6
  # Path to a file or a file-like object.
7
7
  #
8
8
  # @return [DataFrame]
9
- def read_ndjson(source)
9
+ def read_ndjson(
10
+ source,
11
+ schema: nil,
12
+ schema_overrides: nil,
13
+ ignore_errors: false
14
+ )
10
15
  if Utils.pathlike?(source)
11
16
  source = Utils.normalize_filepath(source)
12
17
  end
13
18
 
14
- rbdf = RbDataFrame.read_ndjson(source)
19
+ rbdf =
20
+ RbDataFrame.read_ndjson(
21
+ source,
22
+ ignore_errors,
23
+ schema,
24
+ schema_overrides
25
+ )
15
26
  Utils.wrap_df(rbdf)
16
27
  end
17
28
 
@@ -41,7 +52,7 @@ module Polars
41
52
  # @return [LazyFrame]
42
53
  def scan_ndjson(
43
54
  source,
44
- infer_schema_length: 100,
55
+ infer_schema_length: N_INFER_DEFAULT,
45
56
  batch_size: 1024,
46
57
  n_rows: nil,
47
58
  low_memory: false,
@@ -61,7 +72,7 @@ module Polars
61
72
  n_rows,
62
73
  low_memory,
63
74
  rechunk,
64
- Utils._prepare_row_count_args(row_count_name, row_count_offset)
75
+ Utils.parse_row_index_args(row_count_name, row_count_offset)
65
76
  )
66
77
  Utils.wrap_ldf(rblf)
67
78
  end
data/lib/polars/io/parquet.rb CHANGED
@@ -110,7 +110,7 @@ module Polars
110
110
  projection,
111
111
  n_rows,
112
112
  parallel,
113
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
113
+ Utils.parse_row_index_args(row_count_name, row_count_offset),
114
114
  low_memory,
115
115
  use_statistics,
116
116
  rechunk
@@ -158,6 +158,8 @@ module Polars
158
158
  # Extra options that make sense for a particular storage connection.
159
159
  # @param low_memory [Boolean]
160
160
  # Reduce memory pressure at the expense of performance.
161
+ # @param include_file_paths [String]
162
+ # Include the path of the source file(s) as a column with this name.
161
163
  #
162
164
  # @return [LazyFrame]
163
165
  def scan_parquet(
@@ -170,7 +172,8 @@ module Polars
170
172
  row_count_name: nil,
171
173
  row_count_offset: 0,
172
174
  storage_options: nil,
173
- low_memory: false
175
+ low_memory: false,
176
+ include_file_paths: nil
174
177
  )
175
178
  if Utils.pathlike?(source)
176
179
  source = Utils.normalize_filepath(source)
@@ -178,7 +181,7 @@ module Polars
178
181
 
179
182
  _scan_parquet_impl(
180
183
  source,
181
- n_rows:n_rows,
184
+ n_rows: n_rows,
182
185
  cache: cache,
183
186
  parallel: parallel,
184
187
  rechunk: rechunk,
@@ -186,7 +189,8 @@ module Polars
186
189
  row_count_offset: row_count_offset,
187
190
  storage_options: storage_options,
188
191
  low_memory: low_memory,
189
- glob: glob
192
+ glob: glob,
193
+ include_file_paths: include_file_paths
190
194
  )
191
195
  end
192
196
 
@@ -202,8 +206,9 @@ module Polars
202
206
  storage_options: nil,
203
207
  low_memory: false,
204
208
  use_statistics: true,
205
- hive_partitioning: true,
206
- glob: true
209
+ hive_partitioning: nil,
210
+ glob: true,
211
+ include_file_paths: nil
207
212
  )
208
213
  rblf =
209
214
  RbLazyFrame.new_from_parquet(
@@ -213,12 +218,14 @@ module Polars
213
218
  cache,
214
219
  parallel,
215
220
  rechunk,
216
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
221
+ Utils.parse_row_index_args(row_count_name, row_count_offset),
217
222
  low_memory,
218
223
  use_statistics,
219
224
  hive_partitioning,
220
225
  nil,
221
- glob
226
+ true,
227
+ glob,
228
+ include_file_paths
222
229
  )
223
230
  Utils.wrap_ldf(rblf)
224
231
  end