polars-df 0.11.0-x86_64-darwin → 0.13.0-x86_64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +22 -0
  3. data/Cargo.lock +428 -450
  4. data/LICENSE-THIRD-PARTY.txt +2212 -1952
  5. data/lib/polars/3.1/polars.bundle +0 -0
  6. data/lib/polars/3.2/polars.bundle +0 -0
  7. data/lib/polars/3.3/polars.bundle +0 -0
  8. data/lib/polars/array_expr.rb +4 -4
  9. data/lib/polars/batched_csv_reader.rb +2 -2
  10. data/lib/polars/cat_expr.rb +0 -36
  11. data/lib/polars/cat_name_space.rb +0 -37
  12. data/lib/polars/data_frame.rb +93 -101
  13. data/lib/polars/data_types.rb +1 -1
  14. data/lib/polars/date_time_expr.rb +525 -573
  15. data/lib/polars/date_time_name_space.rb +263 -464
  16. data/lib/polars/dynamic_group_by.rb +3 -3
  17. data/lib/polars/exceptions.rb +3 -0
  18. data/lib/polars/expr.rb +367 -330
  19. data/lib/polars/expr_dispatch.rb +1 -1
  20. data/lib/polars/functions/aggregation/horizontal.rb +8 -8
  21. data/lib/polars/functions/as_datatype.rb +63 -40
  22. data/lib/polars/functions/lazy.rb +63 -14
  23. data/lib/polars/functions/lit.rb +1 -1
  24. data/lib/polars/functions/range/date_range.rb +18 -77
  25. data/lib/polars/functions/range/datetime_range.rb +4 -4
  26. data/lib/polars/functions/range/int_range.rb +2 -2
  27. data/lib/polars/functions/range/time_range.rb +4 -4
  28. data/lib/polars/functions/repeat.rb +1 -1
  29. data/lib/polars/functions/whenthen.rb +1 -1
  30. data/lib/polars/io/csv.rb +8 -8
  31. data/lib/polars/io/ipc.rb +35 -7
  32. data/lib/polars/io/json.rb +13 -2
  33. data/lib/polars/io/ndjson.rb +15 -4
  34. data/lib/polars/io/parquet.rb +15 -8
  35. data/lib/polars/lazy_frame.rb +123 -105
  36. data/lib/polars/lazy_group_by.rb +1 -1
  37. data/lib/polars/list_expr.rb +11 -11
  38. data/lib/polars/list_name_space.rb +5 -1
  39. data/lib/polars/rolling_group_by.rb +5 -7
  40. data/lib/polars/series.rb +108 -191
  41. data/lib/polars/string_expr.rb +51 -76
  42. data/lib/polars/string_name_space.rb +5 -4
  43. data/lib/polars/testing.rb +2 -2
  44. data/lib/polars/utils/constants.rb +9 -0
  45. data/lib/polars/utils/convert.rb +97 -0
  46. data/lib/polars/utils/parse.rb +89 -0
  47. data/lib/polars/utils/various.rb +76 -0
  48. data/lib/polars/utils/wrap.rb +19 -0
  49. data/lib/polars/utils.rb +4 -330
  50. data/lib/polars/version.rb +1 -1
  51. data/lib/polars/whenthen.rb +6 -6
  52. data/lib/polars.rb +11 -0
  53. metadata +7 -2
data/lib/polars/io/csv.rb CHANGED
@@ -104,7 +104,7 @@ module Polars
104
104
  ignore_errors: false,
105
105
  parse_dates: false,
106
106
  n_threads: nil,
107
- infer_schema_length: 100,
107
+ infer_schema_length: N_INFER_DEFAULT,
108
108
  batch_size: 8192,
109
109
  n_rows: nil,
110
110
  encoding: "utf8",
@@ -192,7 +192,7 @@ module Polars
192
192
  ignore_errors: false,
193
193
  parse_dates: false,
194
194
  n_threads: nil,
195
- infer_schema_length: 100,
195
+ infer_schema_length: N_INFER_DEFAULT,
196
196
  batch_size: 8192,
197
197
  n_rows: nil,
198
198
  encoding: "utf8",
@@ -222,7 +222,7 @@ module Polars
222
222
  if !dtypes.nil?
223
223
  if dtypes.is_a?(Hash)
224
224
  dtype_list = []
225
- dtypes.each do|k, v|
225
+ dtypes.each do |k, v|
226
226
  dtype_list << [k, Utils.rb_type_to_dtype(v)]
227
227
  end
228
228
  elsif dtypes.is_a?(::Array)
@@ -304,7 +304,7 @@ module Polars
304
304
  missing_utf8_is_empty_string,
305
305
  parse_dates,
306
306
  skip_rows_after_header,
307
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
307
+ Utils.parse_row_index_args(row_count_name, row_count_offset),
308
308
  sample_size,
309
309
  eol_char,
310
310
  raise_if_empty,
@@ -422,7 +422,7 @@ module Polars
422
422
  ignore_errors: false,
423
423
  parse_dates: false,
424
424
  n_threads: nil,
425
- infer_schema_length: 100,
425
+ infer_schema_length: N_INFER_DEFAULT,
426
426
  batch_size: 50_000,
427
427
  n_rows: nil,
428
428
  encoding: "utf8",
@@ -567,7 +567,7 @@ module Polars
567
567
  ignore_errors: false,
568
568
  cache: true,
569
569
  with_column_names: nil,
570
- infer_schema_length: 100,
570
+ infer_schema_length: N_INFER_DEFAULT,
571
571
  n_rows: nil,
572
572
  encoding: "utf8",
573
573
  low_memory: false,
@@ -629,7 +629,7 @@ module Polars
629
629
  ignore_errors: false,
630
630
  cache: true,
631
631
  with_column_names: nil,
632
- infer_schema_length: 100,
632
+ infer_schema_length: N_INFER_DEFAULT,
633
633
  n_rows: nil,
634
634
  encoding: "utf8",
635
635
  low_memory: false,
@@ -669,7 +669,7 @@ module Polars
669
669
  rechunk,
670
670
  skip_rows_after_header,
671
671
  encoding,
672
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
672
+ Utils.parse_row_index_args(row_count_name, row_count_offset),
673
673
  parse_dates,
674
674
  eol_char,
675
675
  truncate_ragged_lines
data/lib/polars/io/ipc.rb CHANGED
@@ -76,7 +76,7 @@ module Polars
76
76
  columns,
77
77
  projection,
78
78
  n_rows,
79
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
79
+ Utils.parse_row_index_args(row_count_name, row_count_offset),
80
80
  memory_map
81
81
  )
82
82
  Utils.wrap_df(rbdf)
@@ -149,7 +149,7 @@ module Polars
149
149
  columns,
150
150
  projection,
151
151
  n_rows,
152
- Utils._prepare_row_count_args(row_index_name, row_index_offset),
152
+ Utils.parse_row_index_args(row_index_name, row_index_offset),
153
153
  rechunk
154
154
  )
155
155
  Utils.wrap_df(pydf)
@@ -193,6 +193,18 @@ module Polars
193
193
  # Try to memory map the file. This can greatly improve performance on repeated
194
194
  # queries as the OS may cache pages.
195
195
  # Only uncompressed IPC files can be memory mapped.
196
+ # @param hive_partitioning [Boolean]
197
+ # Infer statistics and schema from Hive partitioned URL and use them
198
+ # to prune reads. This is unset by default (i.e. `nil`), meaning it is
199
+ # automatically enabled when a single directory is passed, and otherwise
200
+ # disabled.
201
+ # @param hive_schema [Hash]
202
+ # The column names and data types of the columns by which the data is partitioned.
203
+ # If set to `nil` (default), the schema of the Hive partitions is inferred.
204
+ # @param try_parse_hive_dates [Boolean]
205
+ # Whether to try parsing hive values as date/datetime types.
206
+ # @param include_file_paths [String]
207
+ # Include the path of the source file(s) as a column with this name.
196
208
  #
197
209
  # @return [LazyFrame]
198
210
  def scan_ipc(
@@ -203,7 +215,11 @@ module Polars
203
215
  row_count_name: nil,
204
216
  row_count_offset: 0,
205
217
  storage_options: nil,
206
- memory_map: true
218
+ memory_map: true,
219
+ hive_partitioning: nil,
220
+ hive_schema: nil,
221
+ try_parse_hive_dates: true,
222
+ include_file_paths: nil
207
223
  )
208
224
  _scan_ipc_impl(
209
225
  source,
@@ -213,7 +229,11 @@ module Polars
213
229
  row_count_name: row_count_name,
214
230
  row_count_offset: row_count_offset,
215
231
  storage_options: storage_options,
216
- memory_map: memory_map
232
+ memory_map: memory_map,
233
+ hive_partitioning: hive_partitioning,
234
+ hive_schema: hive_schema,
235
+ try_parse_hive_dates: try_parse_hive_dates,
236
+ include_file_paths: include_file_paths
217
237
  )
218
238
  end
219
239
 
@@ -226,7 +246,11 @@ module Polars
226
246
  row_count_name: nil,
227
247
  row_count_offset: 0,
228
248
  storage_options: nil,
229
- memory_map: true
249
+ memory_map: true,
250
+ hive_partitioning: nil,
251
+ hive_schema: nil,
252
+ try_parse_hive_dates: true,
253
+ include_file_paths: nil
230
254
  )
231
255
  if Utils.pathlike?(file)
232
256
  file = Utils.normalize_filepath(file)
@@ -238,8 +262,12 @@ module Polars
238
262
  n_rows,
239
263
  cache,
240
264
  rechunk,
241
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
242
- memory_map
265
+ Utils.parse_row_index_args(row_count_name, row_count_offset),
266
+ memory_map,
267
+ hive_partitioning,
268
+ hive_schema,
269
+ try_parse_hive_dates,
270
+ include_file_paths
243
271
  )
244
272
  Utils.wrap_ldf(rblf)
245
273
  end
data/lib/polars/io/json.rb CHANGED
@@ -6,12 +6,23 @@ module Polars
6
6
  # Path to a file or a file-like object.
7
7
  #
8
8
  # @return [DataFrame]
9
- def read_json(source)
9
+ def read_json(
10
+ source,
11
+ schema: nil,
12
+ schema_overrides: nil,
13
+ infer_schema_length: N_INFER_DEFAULT
14
+ )
10
15
  if Utils.pathlike?(source)
11
16
  source = Utils.normalize_filepath(source)
12
17
  end
13
18
 
14
- rbdf = RbDataFrame.read_json(source)
19
+ rbdf =
20
+ RbDataFrame.read_json(
21
+ source,
22
+ infer_schema_length,
23
+ schema,
24
+ schema_overrides
25
+ )
15
26
  Utils.wrap_df(rbdf)
16
27
  end
17
28
  end
data/lib/polars/io/ndjson.rb CHANGED
@@ -6,12 +6,23 @@ module Polars
6
6
  # Path to a file or a file-like object.
7
7
  #
8
8
  # @return [DataFrame]
9
- def read_ndjson(source)
9
+ def read_ndjson(
10
+ source,
11
+ schema: nil,
12
+ schema_overrides: nil,
13
+ ignore_errors: false
14
+ )
10
15
  if Utils.pathlike?(source)
11
16
  source = Utils.normalize_filepath(source)
12
17
  end
13
18
 
14
- rbdf = RbDataFrame.read_ndjson(source)
19
+ rbdf =
20
+ RbDataFrame.read_ndjson(
21
+ source,
22
+ ignore_errors,
23
+ schema,
24
+ schema_overrides
25
+ )
15
26
  Utils.wrap_df(rbdf)
16
27
  end
17
28
 
@@ -41,7 +52,7 @@ module Polars
41
52
  # @return [LazyFrame]
42
53
  def scan_ndjson(
43
54
  source,
44
- infer_schema_length: 100,
55
+ infer_schema_length: N_INFER_DEFAULT,
45
56
  batch_size: 1024,
46
57
  n_rows: nil,
47
58
  low_memory: false,
@@ -61,7 +72,7 @@ module Polars
61
72
  n_rows,
62
73
  low_memory,
63
74
  rechunk,
64
- Utils._prepare_row_count_args(row_count_name, row_count_offset)
75
+ Utils.parse_row_index_args(row_count_name, row_count_offset)
65
76
  )
66
77
  Utils.wrap_ldf(rblf)
67
78
  end
data/lib/polars/io/parquet.rb CHANGED
@@ -110,7 +110,7 @@ module Polars
110
110
  projection,
111
111
  n_rows,
112
112
  parallel,
113
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
113
+ Utils.parse_row_index_args(row_count_name, row_count_offset),
114
114
  low_memory,
115
115
  use_statistics,
116
116
  rechunk
@@ -158,6 +158,8 @@ module Polars
158
158
  # Extra options that make sense for a particular storage connection.
159
159
  # @param low_memory [Boolean]
160
160
  # Reduce memory pressure at the expense of performance.
161
+ # @param include_file_paths [String]
162
+ # Include the path of the source file(s) as a column with this name.
161
163
  #
162
164
  # @return [LazyFrame]
163
165
  def scan_parquet(
@@ -170,7 +172,8 @@ module Polars
170
172
  row_count_name: nil,
171
173
  row_count_offset: 0,
172
174
  storage_options: nil,
173
- low_memory: false
175
+ low_memory: false,
176
+ include_file_paths: nil
174
177
  )
175
178
  if Utils.pathlike?(source)
176
179
  source = Utils.normalize_filepath(source)
@@ -178,7 +181,7 @@ module Polars
178
181
 
179
182
  _scan_parquet_impl(
180
183
  source,
181
- n_rows:n_rows,
184
+ n_rows: n_rows,
182
185
  cache: cache,
183
186
  parallel: parallel,
184
187
  rechunk: rechunk,
@@ -186,7 +189,8 @@ module Polars
186
189
  row_count_offset: row_count_offset,
187
190
  storage_options: storage_options,
188
191
  low_memory: low_memory,
189
- glob: glob
192
+ glob: glob,
193
+ include_file_paths: include_file_paths
190
194
  )
191
195
  end
192
196
 
@@ -202,8 +206,9 @@ module Polars
202
206
  storage_options: nil,
203
207
  low_memory: false,
204
208
  use_statistics: true,
205
- hive_partitioning: true,
206
- glob: true
209
+ hive_partitioning: nil,
210
+ glob: true,
211
+ include_file_paths: nil
207
212
  )
208
213
  rblf =
209
214
  RbLazyFrame.new_from_parquet(
@@ -213,12 +218,14 @@ module Polars
213
218
  cache,
214
219
  parallel,
215
220
  rechunk,
216
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
221
+ Utils.parse_row_index_args(row_count_name, row_count_offset),
217
222
  low_memory,
218
223
  use_statistics,
219
224
  hive_partitioning,
220
225
  nil,
221
- glob
226
+ true,
227
+ glob,
228
+ include_file_paths
222
229
  )
223
230
  Utils.wrap_ldf(rblf)
224
231
  end