polars-df 0.21.0-x86_64-linux-musl → 0.22.0-x86_64-linux-musl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/Cargo.lock +55 -48
- data/Cargo.toml +3 -0
- data/LICENSE-THIRD-PARTY.txt +23 -49
- data/README.md +12 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/3.4/polars.so +0 -0
- data/lib/polars/array_expr.rb +382 -3
- data/lib/polars/array_name_space.rb +281 -0
- data/lib/polars/binary_expr.rb +67 -0
- data/lib/polars/binary_name_space.rb +43 -0
- data/lib/polars/cat_expr.rb +224 -0
- data/lib/polars/cat_name_space.rb +138 -0
- data/lib/polars/config.rb +2 -2
- data/lib/polars/convert.rb +6 -6
- data/lib/polars/data_frame.rb +794 -27
- data/lib/polars/data_type_expr.rb +52 -0
- data/lib/polars/data_types.rb +26 -5
- data/lib/polars/date_time_expr.rb +252 -1
- data/lib/polars/date_time_name_space.rb +299 -0
- data/lib/polars/expr.rb +1248 -206
- data/lib/polars/functions/business.rb +95 -0
- data/lib/polars/functions/datatype.rb +21 -0
- data/lib/polars/functions/lazy.rb +14 -1
- data/lib/polars/io/csv.rb +1 -1
- data/lib/polars/io/iceberg.rb +27 -0
- data/lib/polars/io/json.rb +4 -4
- data/lib/polars/io/ndjson.rb +4 -4
- data/lib/polars/io/parquet.rb +32 -7
- data/lib/polars/io/scan_options.rb +4 -1
- data/lib/polars/lazy_frame.rb +1028 -28
- data/lib/polars/list_expr.rb +217 -17
- data/lib/polars/list_name_space.rb +231 -22
- data/lib/polars/meta_expr.rb +89 -0
- data/lib/polars/name_expr.rb +36 -0
- data/lib/polars/query_opt_flags.rb +50 -0
- data/lib/polars/scan_cast_options.rb +20 -1
- data/lib/polars/schema.rb +79 -3
- data/lib/polars/selector.rb +72 -0
- data/lib/polars/selectors.rb +3 -3
- data/lib/polars/series.rb +1053 -54
- data/lib/polars/string_expr.rb +436 -32
- data/lib/polars/string_name_space.rb +736 -50
- data/lib/polars/struct_expr.rb +103 -0
- data/lib/polars/struct_name_space.rb +19 -1
- data/lib/polars/utils/serde.rb +17 -0
- data/lib/polars/utils/various.rb +22 -1
- data/lib/polars/utils.rb +5 -1
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +6 -0
- metadata +8 -2
data/lib/polars/functions/business.rb
ADDED
@@ -0,0 +1,95 @@
+module Polars
+  module Functions
+    # Count the number of business days between `start` and `end` (not including `end`).
+    #
+    # @note
+    #   This functionality is considered **unstable**. It may be changed
+    #   at any point without it being considered a breaking change.
+    #
+    # @param start [Object]
+    #   Start dates.
+    # @param stop [Object]
+    #   End dates.
+    # @param week_mask [Array]
+    #   Which days of the week to count. The default is Monday to Friday.
+    #   If you wanted to count only Monday to Thursday, you would pass
+    #   `[true, true, true, true, false, false, false]`.
+    # @param holidays [Array]
+    #   Holidays to exclude from the count.
+    #
+    # @return [Expr]
+    #
+    # @example
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "start" => [Date.new(2020, 1, 1), Date.new(2020, 1, 2)],
+    #       "end" => [Date.new(2020, 1, 2), Date.new(2020, 1, 10)]
+    #     }
+    #   )
+    #   df.with_columns(
+    #     business_day_count: Polars.business_day_count("start", "end")
+    #   )
+    #   # =>
+    #   # shape: (2, 3)
+    #   # ┌────────────┬────────────┬────────────────────┐
+    #   # │ start      ┆ end        ┆ business_day_count │
+    #   # │ ---        ┆ ---        ┆ ---                │
+    #   # │ date       ┆ date       ┆ i32                │
+    #   # ╞════════════╪════════════╪════════════════════╡
+    #   # │ 2020-01-01 ┆ 2020-01-02 ┆ 1                  │
+    #   # │ 2020-01-02 ┆ 2020-01-10 ┆ 6                  │
+    #   # └────────────┴────────────┴────────────────────┘
+    #
+    # @example You can pass a custom weekend - for example, if you only take Sunday off:
+    #   week_mask = [true, true, true, true, true, true, false]
+    #   df.with_columns(
+    #     business_day_count: Polars.business_day_count(
+    #       "start", "end", week_mask: week_mask
+    #     )
+    #   )
+    #   # =>
+    #   # shape: (2, 3)
+    #   # ┌────────────┬────────────┬────────────────────┐
+    #   # │ start      ┆ end        ┆ business_day_count │
+    #   # │ ---        ┆ ---        ┆ ---                │
+    #   # │ date       ┆ date       ┆ i32                │
+    #   # ╞════════════╪════════════╪════════════════════╡
+    #   # │ 2020-01-01 ┆ 2020-01-02 ┆ 1                  │
+    #   # │ 2020-01-02 ┆ 2020-01-10 ┆ 7                  │
+    #   # └────────────┴────────────┴────────────────────┘
+    #
+    # @example You can also pass a list of holidays to exclude from the count:
+    #   holidays = [Date.new(2020, 1, 1), Date.new(2020, 1, 2)]
+    #   df.with_columns(
+    #     business_day_count: Polars.business_day_count("start", "end", holidays: holidays)
+    #   )
+    #   # =>
+    #   # shape: (2, 3)
+    #   # ┌────────────┬────────────┬────────────────────┐
+    #   # │ start      ┆ end        ┆ business_day_count │
+    #   # │ ---        ┆ ---        ┆ ---                │
+    #   # │ date       ┆ date       ┆ i32                │
+    #   # ╞════════════╪════════════╪════════════════════╡
+    #   # │ 2020-01-01 ┆ 2020-01-02 ┆ 0                  │
+    #   # │ 2020-01-02 ┆ 2020-01-10 ┆ 5                  │
+    #   # └────────────┴────────────┴────────────────────┘
+    def business_day_count(
+      start,
+      stop,
+      week_mask: [true, true, true, true, true, false, false],
+      holidays: []
+    )
+      start_rbexpr = Utils.parse_into_expression(start)
+      end_rbexpr = Utils.parse_into_expression(stop)
+      unix_epoch = ::Date.new(1970, 1, 1)
+      Utils.wrap_expr(
+        Plr.business_day_count(
+          start_rbexpr,
+          end_rbexpr,
+          week_mask,
+          holidays.map { |holiday| holiday - unix_epoch }
+        )
+      )
+    end
+  end
+end
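The `holidays:` values are converted before they cross into native code: Ruby `Date` subtraction returns a `Rational` number of days, so `holiday - unix_epoch` yields the days-since-epoch offsets that `Plr.business_day_count` consumes. A minimal sketch of that conversion in plain Ruby (no Polars required):

    require "date"

    unix_epoch = Date.new(1970, 1, 1)
    holidays = [Date.new(2020, 1, 1), Date.new(2020, 1, 2)]

    # Date#- returns a Rational count of days between two dates.
    holidays.map { |h| (h - unix_epoch).to_i }
    # => [18262, 18263]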
data/lib/polars/functions/datatype.rb
ADDED
@@ -0,0 +1,21 @@
+module Polars
+  module Functions
+    # Get a lazily evaluated :class:`DataType` of a column or expression.
+    #
+    # @note
+    #   This functionality is considered **unstable**. It may be changed
+    #   at any point without it being considered a breaking change.
+    #
+    # @return [DataTypeExpr]
+    def dtype_of(col_or_expr)
+      e = nil
+      if col_or_expr.is_a?(::String)
+        e = F.col(col_or_expr)
+      else
+        e = col_or_expr
+      end
+
+      DataTypeExpr._from_rbdatatype_expr(RbDataTypeExpr.of_expr(e._rbexpr))
+    end
+  end
+end
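A hedged usage sketch for `dtype_of`: in Python Polars, the analogous `pl.dtype_of` can be passed to `cast` so one column adopts another's dtype at plan time. Assuming the Ruby port mirrors that (the `cast` interplay is an assumption; only `Polars.dtype_of` itself appears in this diff):

    df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [1.5, 2.5, 3.5]})

    # Assumed: cast "a" to whatever dtype "b" resolves to at plan time.
    df.with_columns(Polars.col("a").cast(Polars.dtype_of("b")))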
data/lib/polars/functions/lazy.rb
CHANGED
@@ -1,5 +1,18 @@
 module Polars
   module Functions
+    # Select a field in the current `struct.with_fields` scope.
+    #
+    # @param name [Object]
+    #   Name of the field(s) to select.
+    #
+    # @return [Expr]
+    def field(name)
+      if name.is_a?(::String)
+        name = [name]
+      end
+      Utils.wrap_expr(Plr.field(name))
+    end
+
     # Alias for an element in evaluated in an `eval` expression.
     #
     # @return [Expr]
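`Polars.field` is only meaningful inside a `struct.with_fields` scope; `struct_expr.rb` gains 103 lines in this release (see the file list above), which is where `with_fields` is expected to live. A hedged sketch modeled on the Python API:

    df = Polars::DataFrame.new({"coords" => [{"x" => 1, "y" => 4}, {"x" => 2, "y" => 5}]})

    # Inside with_fields, Polars.field("x") refers to the struct's own "x" field.
    df.with_columns(
      Polars.col("coords").struct.with_fields(Polars.field("x") * 2)
    )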
@@ -810,7 +823,7 @@ module Polars
     #   # ┌─────┐
     #   # │ sum │
     #   # │ --- │
-    #   # │
+    #   # │ i32 │
     #   # ╞═════╡
     #   # │ 10  │
     #   # │ 13  │
data/lib/polars/io/csv.rb
CHANGED
@@ -499,7 +499,7 @@ module Polars
     #   for instance `#`.
     # @param quote_char [String]
     #   Single byte character used for csv quoting.
-    #   Set to
+    #   Set to nil to turn off special handling and escaping of quotes.
     # @param skip_rows [Integer]
     #   Start reading after `skip_rows` lines. The header will be parsed at this
     #   offset.
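The restored sentence documents behavior that can be exercised directly: passing `quote_char: nil` disables quote handling entirely, so quote characters are read as ordinary bytes. A hedged one-liner (file name hypothetical):

    df = Polars.read_csv("messy.csv", quote_char: nil)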
data/lib/polars/io/iceberg.rb
ADDED
@@ -0,0 +1,27 @@
+module Polars
+  module IO
+    # Lazily read from an Apache Iceberg table.
+    #
+    # @param source [Object]
+    #   A Iceberg Ruby table, or a direct path to the metadata.
+    # @param snapshot_id [Integer]
+    #   The snapshot ID to scan from.
+    # @param storage_options [Hash]
+    #   Extra options for the storage backends.
+    #
+    # @return [LazyFrame]
+    def scan_iceberg(
+      source,
+      snapshot_id: nil,
+      storage_options: nil
+    )
+      require "iceberg"
+
+      unless source.is_a?(Iceberg::Table)
+        raise Todo
+      end
+
+      source.to_polars(snapshot_id:, storage_options:)
+    end
+  end
+end
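Since `scan_iceberg` raises `Todo` for anything that is not an `Iceberg::Table` and otherwise delegates to `Iceberg::Table#to_polars`, usage amounts to loading a table with the `iceberg` gem first. A hedged sketch; table construction is elided because the iceberg gem's catalog API is outside this diff:

    require "iceberg"

    # `table` is an Iceberg::Table obtained via the iceberg gem (not shown).
    lf = Polars.scan_iceberg(table, snapshot_id: nil)
    lf.select("id", "name").collect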
data/lib/polars/io/json.rb
CHANGED
@@ -7,11 +7,11 @@ module Polars
     # @param schema [Object]
     #   The DataFrame schema may be declared in several ways:
     #
-    #   * As a
-    #   * As
-    #   * As
+    #   * As a hash of \\{name:type} pairs; if type is nil, it will be auto-inferred.
+    #   * As an array of column names; in this case types are automatically inferred.
+    #   * As an array of [name,type] pairs; this is equivalent to the hash form.
     #
-    #   If you supply
+    #   If you supply an array of column names that does not match the names in the
     #   underlying data, the names given here will overwrite them. The number
     #   of names given in the schema should match the underlying data dimensions.
     # @param schema_overrides [Hash]
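The restored bullets correspond to call shapes like the following hedged sketch (file name and column names hypothetical):

    # Hash form: explicit name => type pairs; a nil type is auto-inferred.
    Polars.read_json("events.json", schema: {"id" => Polars::Int64, "name" => Polars::String})

    # Array-of-names form: dtypes are inferred from the underlying data.
    Polars.read_json("events.json", schema: ["id", "name"])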
data/lib/polars/io/ndjson.rb
CHANGED
@@ -7,11 +7,11 @@ module Polars
     # @param schema [Object]
     #   The DataFrame schema may be declared in several ways:
     #
-    #   * As a
-    #   * As
-    #   * As
+    #   * As a hash of \\{name:type} pairs; if type is nil, it will be auto-inferred.
+    #   * As an array of column names; in this case types are automatically inferred.
+    #   * As an array of [name,type] pairs; this is equivalent to the hash form.
     #
-    #   If you supply
+    #   If you supply an array of column names that does not match the names in the
    #   underlying data, the names given here will overwrite them. The number
     #   of names given in the schema should match the underlying data dimensions.
     # @param schema_overrides [Hash]
data/lib/polars/io/parquet.rb
CHANGED
@@ -43,7 +43,7 @@ module Polars
     #   Extra options that make sense for a particular storage connection.
     # @param credential_provider [Object]
     #   Provide a function that can be called to provide cloud storage
-    #   credentials. The function is expected to return a
+    #   credentials. The function is expected to return a hash of
     #   credential keys along with an optional credential expiry time.
     # @param retries [Integer]
     #   Number of retries if accessing a cloud instance fails.
@@ -117,13 +117,31 @@ module Polars
     # @param source [Object]
     #   Path to a file or a file-like object.
     #
-    # @return [
+    # @return [Schema]
     def read_parquet_schema(source)
       if Utils.pathlike?(source)
         source = Utils.normalize_filepath(source)
       end
 
-
+      scan_parquet(source).collect_schema
+    end
+
+    # Get file-level custom metadata of a Parquet file without reading data.
+    #
+    # @note
+    #   This functionality is considered **experimental**. It may be removed or
+    #   changed at any point without it being considered a breaking change.
+    #
+    # @param source [Object]
+    #   Path to a file or a file-like object.
+    #
+    # @return [Hash]
+    def read_parquet_metadata(source)
+      if Utils.pathlike?(source)
+        source = Utils.normalize_filepath(source, check_not_directory: false)
+      end
+
+      Plr.read_parquet_metadata(source)
     end
 
     # Lazily read from a parquet file or multiple files via glob patterns.
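A hedged sketch of the two readers touched above: `read_parquet_schema` now collects its result from a lazy scan, and the new `read_parquet_metadata` returns file-level key/value metadata without reading data (file name and metadata keys hypothetical):

    Polars.read_parquet_schema("data.parquet")   # => Schema of column names and dtypes
    Polars.read_parquet_metadata("data.parquet") # => e.g. {"creator" => "etl-job"}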
@@ -171,7 +189,7 @@ module Polars
     #   Extra options that make sense for a particular storage connection.
     # @param credential_provider [Object]
     #   Provide a function that can be called to provide cloud storage
-    #   credentials. The function is expected to return a
+    #   credentials. The function is expected to return a hash of
     #   credential keys along with an optional credential expiry time.
     # @param retries [Integer]
     #   Number of retries if accessing a cloud instance fails.
@@ -188,6 +206,9 @@ module Polars
     #   defined schema are encountered in the data:
     #   * `ignore`: Silently ignores.
     #   * `raise`: Raises an error.
+    # @param cast_options [Object]
+    #   Configuration for column type-casting during scans. Useful for datasets
+    #   containing files that have differing schemas.
     #
     # @return [LazyFrame]
     def scan_parquet(
@@ -210,7 +231,10 @@ module Polars
       retries: 2,
       include_file_paths: nil,
       allow_missing_columns: false,
-      extra_columns: "raise"
+      extra_columns: "raise",
+      cast_options: nil,
+      _column_mapping: nil,
+      _deletion_files: nil
     )
       missing_columns = allow_missing_columns ? "insert" : "raise"
 
@@ -247,7 +271,7 @@ module Polars
       ScanOptions.new(
         row_index: !row_index_name.nil? ? [row_index_name, row_index_offset] : nil,
         pre_slice: !n_rows.nil? ? [0, n_rows] : nil,
-
+        cast_options: cast_options,
         extra_columns: extra_columns,
         missing_columns: missing_columns,
         include_file_paths: include_file_paths,
@@ -260,7 +284,8 @@ module Polars
         storage_options: storage_options,
         # credential_provider: credential_provider_builder,
         retries: retries,
-
+        deletion_files: _deletion_files,
+        column_mapping: _column_mapping
       ),
       parallel,
       low_memory,
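The new `cast_options:` keyword is threaded straight into `ScanOptions` above, and `scan_cast_options.rb` changes in step (see the file list). A hedged sketch; the `ScanCastOptions.new` parameter name is assumed from the Python API, not from this diff:

    # Allow lossless integer upcasts when files in the glob disagree on schema.
    lf = Polars.scan_parquet(
      "data/**/*.parquet",
      cast_options: Polars::ScanCastOptions.new(integer_cast: "upcast")
    )
    lf.collect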
data/lib/polars/io/scan_options.rb
CHANGED
@@ -3,7 +3,8 @@ module Polars
   class ScanOptions
     attr_reader :row_index, :pre_slice, :cast_options, :extra_columns, :missing_columns,
       :include_file_paths, :glob, :hive_partitioning, :hive_schema, :try_parse_hive_dates,
-      :rechunk, :cache, :storage_options, :credential_provider, :retries, :column_mapping,
+      :rechunk, :cache, :storage_options, :credential_provider, :retries, :column_mapping,
+      :default_values, :deletion_files
 
     def initialize(
       row_index: nil,
@@ -22,6 +23,7 @@ module Polars
       credential_provider: nil,
       retries: 2,
       column_mapping: nil,
+      default_values: nil,
       deletion_files: nil
     )
       @row_index = row_index
@@ -40,6 +42,7 @@ module Polars
       @credential_provider = credential_provider
       @retries = retries
       @column_mapping = column_mapping
+      @default_values = default_values
       @deletion_files = deletion_files
     end
   end