polars-df 0.11.0-arm64-darwin → 0.13.0-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/Cargo.lock +428 -450
- data/LICENSE-THIRD-PARTY.txt +2212 -1952
- data/lib/polars/3.1/polars.bundle +0 -0
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/3.3/polars.bundle +0 -0
- data/lib/polars/array_expr.rb +4 -4
- data/lib/polars/batched_csv_reader.rb +2 -2
- data/lib/polars/cat_expr.rb +0 -36
- data/lib/polars/cat_name_space.rb +0 -37
- data/lib/polars/data_frame.rb +93 -101
- data/lib/polars/data_types.rb +1 -1
- data/lib/polars/date_time_expr.rb +525 -573
- data/lib/polars/date_time_name_space.rb +263 -464
- data/lib/polars/dynamic_group_by.rb +3 -3
- data/lib/polars/exceptions.rb +3 -0
- data/lib/polars/expr.rb +367 -330
- data/lib/polars/expr_dispatch.rb +1 -1
- data/lib/polars/functions/aggregation/horizontal.rb +8 -8
- data/lib/polars/functions/as_datatype.rb +63 -40
- data/lib/polars/functions/lazy.rb +63 -14
- data/lib/polars/functions/lit.rb +1 -1
- data/lib/polars/functions/range/date_range.rb +18 -77
- data/lib/polars/functions/range/datetime_range.rb +4 -4
- data/lib/polars/functions/range/int_range.rb +2 -2
- data/lib/polars/functions/range/time_range.rb +4 -4
- data/lib/polars/functions/repeat.rb +1 -1
- data/lib/polars/functions/whenthen.rb +1 -1
- data/lib/polars/io/csv.rb +8 -8
- data/lib/polars/io/ipc.rb +35 -7
- data/lib/polars/io/json.rb +13 -2
- data/lib/polars/io/ndjson.rb +15 -4
- data/lib/polars/io/parquet.rb +15 -8
- data/lib/polars/lazy_frame.rb +123 -105
- data/lib/polars/lazy_group_by.rb +1 -1
- data/lib/polars/list_expr.rb +11 -11
- data/lib/polars/list_name_space.rb +5 -1
- data/lib/polars/rolling_group_by.rb +5 -7
- data/lib/polars/series.rb +108 -191
- data/lib/polars/string_expr.rb +51 -76
- data/lib/polars/string_name_space.rb +5 -4
- data/lib/polars/testing.rb +2 -2
- data/lib/polars/utils/constants.rb +9 -0
- data/lib/polars/utils/convert.rb +97 -0
- data/lib/polars/utils/parse.rb +89 -0
- data/lib/polars/utils/various.rb +76 -0
- data/lib/polars/utils/wrap.rb +19 -0
- data/lib/polars/utils.rb +4 -330
- data/lib/polars/version.rb +1 -1
- data/lib/polars/whenthen.rb +6 -6
- data/lib/polars.rb +11 -0
- metadata +7 -2
data/lib/polars/io/csv.rb
CHANGED
@@ -104,7 +104,7 @@ module Polars
|
|
104
104
|
ignore_errors: false,
|
105
105
|
parse_dates: false,
|
106
106
|
n_threads: nil,
|
107
|
-
infer_schema_length:
|
107
|
+
infer_schema_length: N_INFER_DEFAULT,
|
108
108
|
batch_size: 8192,
|
109
109
|
n_rows: nil,
|
110
110
|
encoding: "utf8",
|
@@ -192,7 +192,7 @@ module Polars
|
|
192
192
|
ignore_errors: false,
|
193
193
|
parse_dates: false,
|
194
194
|
n_threads: nil,
|
195
|
-
infer_schema_length:
|
195
|
+
infer_schema_length: N_INFER_DEFAULT,
|
196
196
|
batch_size: 8192,
|
197
197
|
n_rows: nil,
|
198
198
|
encoding: "utf8",
|
@@ -222,7 +222,7 @@ module Polars
|
|
222
222
|
if !dtypes.nil?
|
223
223
|
if dtypes.is_a?(Hash)
|
224
224
|
dtype_list = []
|
225
|
-
dtypes.each do|k, v|
|
225
|
+
dtypes.each do |k, v|
|
226
226
|
dtype_list << [k, Utils.rb_type_to_dtype(v)]
|
227
227
|
end
|
228
228
|
elsif dtypes.is_a?(::Array)
|
@@ -304,7 +304,7 @@ module Polars
|
|
304
304
|
missing_utf8_is_empty_string,
|
305
305
|
parse_dates,
|
306
306
|
skip_rows_after_header,
|
307
|
-
Utils.
|
307
|
+
Utils.parse_row_index_args(row_count_name, row_count_offset),
|
308
308
|
sample_size,
|
309
309
|
eol_char,
|
310
310
|
raise_if_empty,
|
@@ -422,7 +422,7 @@ module Polars
|
|
422
422
|
ignore_errors: false,
|
423
423
|
parse_dates: false,
|
424
424
|
n_threads: nil,
|
425
|
-
infer_schema_length:
|
425
|
+
infer_schema_length: N_INFER_DEFAULT,
|
426
426
|
batch_size: 50_000,
|
427
427
|
n_rows: nil,
|
428
428
|
encoding: "utf8",
|
@@ -567,7 +567,7 @@ module Polars
|
|
567
567
|
ignore_errors: false,
|
568
568
|
cache: true,
|
569
569
|
with_column_names: nil,
|
570
|
-
infer_schema_length:
|
570
|
+
infer_schema_length: N_INFER_DEFAULT,
|
571
571
|
n_rows: nil,
|
572
572
|
encoding: "utf8",
|
573
573
|
low_memory: false,
|
@@ -629,7 +629,7 @@ module Polars
|
|
629
629
|
ignore_errors: false,
|
630
630
|
cache: true,
|
631
631
|
with_column_names: nil,
|
632
|
-
infer_schema_length:
|
632
|
+
infer_schema_length: N_INFER_DEFAULT,
|
633
633
|
n_rows: nil,
|
634
634
|
encoding: "utf8",
|
635
635
|
low_memory: false,
|
@@ -669,7 +669,7 @@ module Polars
|
|
669
669
|
rechunk,
|
670
670
|
skip_rows_after_header,
|
671
671
|
encoding,
|
672
|
-
Utils.
|
672
|
+
Utils.parse_row_index_args(row_count_name, row_count_offset),
|
673
673
|
parse_dates,
|
674
674
|
eol_char,
|
675
675
|
truncate_ragged_lines
|
data/lib/polars/io/ipc.rb
CHANGED
@@ -76,7 +76,7 @@ module Polars
|
|
76
76
|
columns,
|
77
77
|
projection,
|
78
78
|
n_rows,
|
79
|
-
Utils.
|
79
|
+
Utils.parse_row_index_args(row_count_name, row_count_offset),
|
80
80
|
memory_map
|
81
81
|
)
|
82
82
|
Utils.wrap_df(rbdf)
|
@@ -149,7 +149,7 @@ module Polars
|
|
149
149
|
columns,
|
150
150
|
projection,
|
151
151
|
n_rows,
|
152
|
-
Utils.
|
152
|
+
Utils.parse_row_index_args(row_index_name, row_index_offset),
|
153
153
|
rechunk
|
154
154
|
)
|
155
155
|
Utils.wrap_df(pydf)
|
@@ -193,6 +193,18 @@ module Polars
|
|
193
193
|
# Try to memory map the file. This can greatly improve performance on repeated
|
194
194
|
# queries as the OS may cache pages.
|
195
195
|
# Only uncompressed IPC files can be memory mapped.
|
196
|
+
# @param hive_partitioning [Boolean]
|
197
|
+
# Infer statistics and schema from Hive partitioned URL and use them
|
198
|
+
# to prune reads. This is unset by default (i.e. `nil`), meaning it is
|
199
|
+
# automatically enabled when a single directory is passed, and otherwise
|
200
|
+
# disabled.
|
201
|
+
# @param hive_schema [Hash]
|
202
|
+
# The column names and data types of the columns by which the data is partitioned.
|
203
|
+
# If set to `nil` (default), the schema of the Hive partitions is inferred.
|
204
|
+
# @param try_parse_hive_dates [Boolean]
|
205
|
+
# Whether to try parsing hive values as date/datetime types.
|
206
|
+
# @param include_file_paths [String]
|
207
|
+
# Include the path of the source file(s) as a column with this name.
|
196
208
|
#
|
197
209
|
# @return [LazyFrame]
|
198
210
|
def scan_ipc(
|
@@ -203,7 +215,11 @@ module Polars
|
|
203
215
|
row_count_name: nil,
|
204
216
|
row_count_offset: 0,
|
205
217
|
storage_options: nil,
|
206
|
-
memory_map: true
|
218
|
+
memory_map: true,
|
219
|
+
hive_partitioning: nil,
|
220
|
+
hive_schema: nil,
|
221
|
+
try_parse_hive_dates: true,
|
222
|
+
include_file_paths: nil
|
207
223
|
)
|
208
224
|
_scan_ipc_impl(
|
209
225
|
source,
|
@@ -213,7 +229,11 @@ module Polars
|
|
213
229
|
row_count_name: row_count_name,
|
214
230
|
row_count_offset: row_count_offset,
|
215
231
|
storage_options: storage_options,
|
216
|
-
memory_map: memory_map
|
232
|
+
memory_map: memory_map,
|
233
|
+
hive_partitioning: hive_partitioning,
|
234
|
+
hive_schema: hive_schema,
|
235
|
+
try_parse_hive_dates: try_parse_hive_dates,
|
236
|
+
include_file_paths: include_file_paths
|
217
237
|
)
|
218
238
|
end
|
219
239
|
|
@@ -226,7 +246,11 @@ module Polars
|
|
226
246
|
row_count_name: nil,
|
227
247
|
row_count_offset: 0,
|
228
248
|
storage_options: nil,
|
229
|
-
memory_map: true
|
249
|
+
memory_map: true,
|
250
|
+
hive_partitioning: nil,
|
251
|
+
hive_schema: nil,
|
252
|
+
try_parse_hive_dates: true,
|
253
|
+
include_file_paths: nil
|
230
254
|
)
|
231
255
|
if Utils.pathlike?(file)
|
232
256
|
file = Utils.normalize_filepath(file)
|
@@ -238,8 +262,12 @@ module Polars
|
|
238
262
|
n_rows,
|
239
263
|
cache,
|
240
264
|
rechunk,
|
241
|
-
Utils.
|
242
|
-
memory_map
|
265
|
+
Utils.parse_row_index_args(row_count_name, row_count_offset),
|
266
|
+
memory_map,
|
267
|
+
hive_partitioning,
|
268
|
+
hive_schema,
|
269
|
+
try_parse_hive_dates,
|
270
|
+
include_file_paths
|
243
271
|
)
|
244
272
|
Utils.wrap_ldf(rblf)
|
245
273
|
end
|
data/lib/polars/io/json.rb
CHANGED
@@ -6,12 +6,23 @@ module Polars
|
|
6
6
|
# Path to a file or a file-like object.
|
7
7
|
#
|
8
8
|
# @return [DataFrame]
|
9
|
-
def read_json(
|
9
|
+
def read_json(
|
10
|
+
source,
|
11
|
+
schema: nil,
|
12
|
+
schema_overrides: nil,
|
13
|
+
infer_schema_length: N_INFER_DEFAULT
|
14
|
+
)
|
10
15
|
if Utils.pathlike?(source)
|
11
16
|
source = Utils.normalize_filepath(source)
|
12
17
|
end
|
13
18
|
|
14
|
-
rbdf =
|
19
|
+
rbdf =
|
20
|
+
RbDataFrame.read_json(
|
21
|
+
source,
|
22
|
+
infer_schema_length,
|
23
|
+
schema,
|
24
|
+
schema_overrides
|
25
|
+
)
|
15
26
|
Utils.wrap_df(rbdf)
|
16
27
|
end
|
17
28
|
end
|
data/lib/polars/io/ndjson.rb
CHANGED
@@ -6,12 +6,23 @@ module Polars
|
|
6
6
|
# Path to a file or a file-like object.
|
7
7
|
#
|
8
8
|
# @return [DataFrame]
|
9
|
-
def read_ndjson(
|
9
|
+
def read_ndjson(
|
10
|
+
source,
|
11
|
+
schema: nil,
|
12
|
+
schema_overrides: nil,
|
13
|
+
ignore_errors: false
|
14
|
+
)
|
10
15
|
if Utils.pathlike?(source)
|
11
16
|
source = Utils.normalize_filepath(source)
|
12
17
|
end
|
13
18
|
|
14
|
-
rbdf =
|
19
|
+
rbdf =
|
20
|
+
RbDataFrame.read_ndjson(
|
21
|
+
source,
|
22
|
+
ignore_errors,
|
23
|
+
schema,
|
24
|
+
schema_overrides
|
25
|
+
)
|
15
26
|
Utils.wrap_df(rbdf)
|
16
27
|
end
|
17
28
|
|
@@ -41,7 +52,7 @@ module Polars
|
|
41
52
|
# @return [LazyFrame]
|
42
53
|
def scan_ndjson(
|
43
54
|
source,
|
44
|
-
infer_schema_length:
|
55
|
+
infer_schema_length: N_INFER_DEFAULT,
|
45
56
|
batch_size: 1024,
|
46
57
|
n_rows: nil,
|
47
58
|
low_memory: false,
|
@@ -61,7 +72,7 @@ module Polars
|
|
61
72
|
n_rows,
|
62
73
|
low_memory,
|
63
74
|
rechunk,
|
64
|
-
Utils.
|
75
|
+
Utils.parse_row_index_args(row_count_name, row_count_offset)
|
65
76
|
)
|
66
77
|
Utils.wrap_ldf(rblf)
|
67
78
|
end
|
data/lib/polars/io/parquet.rb
CHANGED
@@ -110,7 +110,7 @@ module Polars
|
|
110
110
|
projection,
|
111
111
|
n_rows,
|
112
112
|
parallel,
|
113
|
-
Utils.
|
113
|
+
Utils.parse_row_index_args(row_count_name, row_count_offset),
|
114
114
|
low_memory,
|
115
115
|
use_statistics,
|
116
116
|
rechunk
|
@@ -158,6 +158,8 @@ module Polars
|
|
158
158
|
# Extra options that make sense for a particular storage connection.
|
159
159
|
# @param low_memory [Boolean]
|
160
160
|
# Reduce memory pressure at the expense of performance.
|
161
|
+
# @param include_file_paths [String]
|
162
|
+
# Include the path of the source file(s) as a column with this name.
|
161
163
|
#
|
162
164
|
# @return [LazyFrame]
|
163
165
|
def scan_parquet(
|
@@ -170,7 +172,8 @@ module Polars
|
|
170
172
|
row_count_name: nil,
|
171
173
|
row_count_offset: 0,
|
172
174
|
storage_options: nil,
|
173
|
-
low_memory: false
|
175
|
+
low_memory: false,
|
176
|
+
include_file_paths: nil
|
174
177
|
)
|
175
178
|
if Utils.pathlike?(source)
|
176
179
|
source = Utils.normalize_filepath(source)
|
@@ -178,7 +181,7 @@ module Polars
|
|
178
181
|
|
179
182
|
_scan_parquet_impl(
|
180
183
|
source,
|
181
|
-
n_rows:n_rows,
|
184
|
+
n_rows: n_rows,
|
182
185
|
cache: cache,
|
183
186
|
parallel: parallel,
|
184
187
|
rechunk: rechunk,
|
@@ -186,7 +189,8 @@ module Polars
|
|
186
189
|
row_count_offset: row_count_offset,
|
187
190
|
storage_options: storage_options,
|
188
191
|
low_memory: low_memory,
|
189
|
-
glob: glob
|
192
|
+
glob: glob,
|
193
|
+
include_file_paths: include_file_paths
|
190
194
|
)
|
191
195
|
end
|
192
196
|
|
@@ -202,8 +206,9 @@ module Polars
|
|
202
206
|
storage_options: nil,
|
203
207
|
low_memory: false,
|
204
208
|
use_statistics: true,
|
205
|
-
hive_partitioning:
|
206
|
-
glob: true
|
209
|
+
hive_partitioning: nil,
|
210
|
+
glob: true,
|
211
|
+
include_file_paths: nil
|
207
212
|
)
|
208
213
|
rblf =
|
209
214
|
RbLazyFrame.new_from_parquet(
|
@@ -213,12 +218,14 @@ module Polars
|
|
213
218
|
cache,
|
214
219
|
parallel,
|
215
220
|
rechunk,
|
216
|
-
Utils.
|
221
|
+
Utils.parse_row_index_args(row_count_name, row_count_offset),
|
217
222
|
low_memory,
|
218
223
|
use_statistics,
|
219
224
|
hive_partitioning,
|
220
225
|
nil,
|
221
|
-
|
226
|
+
true,
|
227
|
+
glob,
|
228
|
+
include_file_paths
|
222
229
|
)
|
223
230
|
Utils.wrap_ldf(rblf)
|
224
231
|
end
|