polars-df 0.14.0-aarch64-linux → 0.16.0-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +35 -0
- data/Cargo.lock +1523 -378
- data/LICENSE-THIRD-PARTY.txt +23495 -12923
- data/LICENSE.txt +1 -0
- data/README.md +38 -4
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/{3.1 → 3.4}/polars.so +0 -0
- data/lib/polars/batched_csv_reader.rb +0 -2
- data/lib/polars/binary_expr.rb +133 -9
- data/lib/polars/binary_name_space.rb +101 -6
- data/lib/polars/config.rb +4 -0
- data/lib/polars/data_frame.rb +452 -101
- data/lib/polars/data_type_group.rb +28 -0
- data/lib/polars/data_types.rb +3 -1
- data/lib/polars/date_time_expr.rb +244 -0
- data/lib/polars/date_time_name_space.rb +87 -0
- data/lib/polars/expr.rb +103 -2
- data/lib/polars/functions/aggregation/horizontal.rb +10 -4
- data/lib/polars/functions/as_datatype.rb +51 -2
- data/lib/polars/functions/col.rb +1 -1
- data/lib/polars/functions/eager.rb +1 -3
- data/lib/polars/functions/lazy.rb +95 -13
- data/lib/polars/functions/range/time_range.rb +21 -21
- data/lib/polars/io/csv.rb +14 -16
- data/lib/polars/io/database.rb +2 -2
- data/lib/polars/io/delta.rb +126 -0
- data/lib/polars/io/ipc.rb +14 -4
- data/lib/polars/io/ndjson.rb +10 -0
- data/lib/polars/io/parquet.rb +168 -111
- data/lib/polars/lazy_frame.rb +684 -20
- data/lib/polars/list_name_space.rb +169 -0
- data/lib/polars/selectors.rb +1226 -0
- data/lib/polars/series.rb +465 -35
- data/lib/polars/string_cache.rb +27 -1
- data/lib/polars/string_expr.rb +0 -1
- data/lib/polars/string_name_space.rb +73 -3
- data/lib/polars/struct_name_space.rb +31 -7
- data/lib/polars/utils/various.rb +5 -1
- data/lib/polars/utils.rb +45 -10
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +17 -1
- metadata +9 -8
- data/lib/polars/functions.rb +0 -57
data/lib/polars/data_frame.rb
CHANGED
@@ -8,17 +8,49 @@ module Polars
 
 # Create a new DataFrame.
 #
-# @param data [
-# Two-dimensional data in various forms
-#
-# @param
-#
-#
-#
-#
+# @param data [Object]
+#   Two-dimensional data in various forms; hash input must contain arrays
+#   or a range. Arrays may contain Series or other arrays.
+# @param schema [Object]
+#   The schema of the resulting DataFrame. The schema may be declared in several
+#   ways:
+#
+#   * As a hash of name:type pairs; if type is nil, it will be auto-inferred.
+#   * As an array of column names; in this case types are automatically inferred.
+#   * As an array of (name,type) pairs; this is equivalent to the dictionary form.
+#
+#   If you supply a list of column names that does not match the names in the
+#   underlying data, the names given here will overwrite them. The number
+#   of names given in the schema should match the underlying data dimensions.
+#
+#   If set to `nil` (default), the schema is inferred from the data.
+# @param schema_overrides [Hash]
+#   Support type specification or override of one or more columns; note that
+#   any dtypes inferred from the schema param will be overridden.
+#
+#   The number of entries in the schema should match the underlying data
+#   dimensions, unless an array of hashes is being passed, in which case
+#   a *partial* schema can be declared to prevent specific fields from being loaded.
+# @param strict [Boolean]
+#   Throw an error if any `data` value does not exactly match the given or inferred
+#   data type for that column. If set to `false`, values that do not match the data
+#   type are cast to that data type or, if casting is not possible, set to null
+#   instead.
+# @param orient ["col", "row"]
+#   Whether to interpret two-dimensional data as columns or as rows. If nil,
 #   the orientation is inferred by matching the columns and data dimensions. If
 #   this does not yield conclusive results, column orientation is used.
-
+# @param infer_schema_length [Integer]
+#   The maximum number of rows to scan for schema inference. If set to `nil`, the
+#   full data may be scanned *(this can be slow)*. This parameter only applies if
+#   the input data is a sequence or generator of rows; other input is read as-is.
+# @param nan_to_null [Boolean]
+#   If the data comes from one or more Numo arrays, can optionally convert input
+#   data NaN values to null instead. This is a no-op for all other input data.
+def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: 100, nan_to_null: false)
+  if schema && columns
+    warn "columns is ignored when schema is passed"
+  end
   schema ||= columns
 
   if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
@@ -29,11 +61,17 @@ module Polars
   self._df = self.class.hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
 elsif data.is_a?(Hash)
   data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
-  self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
+  self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null)
 elsif data.is_a?(::Array)
-  self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
+  self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, orient: orient, infer_schema_length: infer_schema_length)
 elsif data.is_a?(Series)
-  self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
+  self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict)
+elsif data.respond_to?(:arrow_c_stream)
+  # This uses the fact that RbSeries.from_arrow_c_stream will create a
+  # struct-typed Series. Then we unpack that to a DataFrame.
+  tmp_col_name = ""
+  s = Utils.wrap_s(RbSeries.from_arrow_c_stream(data))
+  self._df = s.to_frame(tmp_col_name).unnest(tmp_col_name)._df
 else
   raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
 end
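In practice, the reworked constructor accepts an explicit schema plus the new `strict` flag. A minimal sketch based on the doc comments above (the sample data is invented):

    df = Polars::DataFrame.new(
      {"a" => [1, 2, 3], "b" => ["x", "y", nil]},
      schema: {"a" => Polars::UInt8, "b" => nil},  # nil type => auto-inferred
      strict: false  # non-matching values are cast, or set to null if uncastable
    )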
@@ -452,6 +490,11 @@ module Polars
   end
 end
 
+# @private
+def arrow_c_stream
+  _df.arrow_c_stream
+end
+
 # Return the dataframe as a scalar.
 #
 # Equivalent to `df[0,0]`, with a check that the shape is (1,1).
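Paired with the constructor's new `arrow_c_stream` branch, this should allow frames to round-trip through the Arrow C stream protocol. A hypothetical example (assuming nothing beyond the code shown above):

    df = Polars::DataFrame.new({"x" => [1, 2, 3]})
    copy = Polars::DataFrame.new(df)  # df is consumed via its arrow_c_stream method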
@@ -766,15 +809,18 @@ module Polars
 #   Compression method. Defaults to "uncompressed".
 #
 # @return [nil]
-def write_avro(file, compression = "uncompressed")
+def write_avro(file, compression = "uncompressed", name: "")
   if compression.nil?
     compression = "uncompressed"
   end
   if Utils.pathlike?(file)
     file = Utils.normalize_filepath(file)
   end
+  if name.nil?
+    name = ""
+  end
 
-  _df.write_avro(file, compression)
+  _df.write_avro(file, compression, name)
 end
 
 # Write to Arrow IPC binary stream or Feather file.
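The new `name:` keyword is passed straight through to the native writer; presumably it sets the Avro record name (an assumption, since the hunk adds no documentation for it). A sketch:

    df.write_avro("events.avro", "uncompressed", name: "events")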
@@ -785,7 +831,13 @@ module Polars
 #   Compression method. Defaults to "uncompressed".
 #
 # @return [nil]
-def write_ipc(
+def write_ipc(
+  file,
+  compression: "uncompressed",
+  compat_level: nil,
+  storage_options: nil,
+  retries: 2
+)
   return_bytes = file.nil?
   if return_bytes
     file = StringIO.new
@@ -795,11 +847,21 @@ module Polars
   file = Utils.normalize_filepath(file)
 end
 
+if compat_level.nil?
+  compat_level = true
+end
+
 if compression.nil?
   compression = "uncompressed"
 end
 
-
+if storage_options&.any?
+  storage_options = storage_options.to_a
+else
+  storage_options = nil
+end
+
+_df.write_ipc(file, compression, compat_level, storage_options, retries)
 return_bytes ? file.string : nil
 end
 
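As before, passing `nil` as the file returns the encoded bytes; the new keywords all have defaults. A sketch (the `"zstd"` codec name is assumed from Polars' usual IPC compression options):

    bytes = df.write_ipc(nil, compression: "zstd")
    df.write_ipc("frame.arrow", compat_level: true, retries: 2)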
@@ -826,7 +888,8 @@ module Polars
 #   df.write_ipc_stream("new_file.arrow")
 def write_ipc_stream(
   file,
-  compression: "uncompressed"
+  compression: "uncompressed",
+  compat_level: nil
 )
   return_bytes = file.nil?
   if return_bytes
@@ -836,11 +899,15 @@ module Polars
   file = Utils.normalize_filepath(file)
 end
 
+if compat_level.nil?
+  compat_level = true
+end
+
 if compression.nil?
   compression = "uncompressed"
 end
 
-_df.write_ipc_stream(file, compression)
+_df.write_ipc_stream(file, compression, compat_level)
 return_bytes ? file.string : nil
 end
 
@@ -906,6 +973,61 @@ module Polars
   )
 end
 
+# Write DataFrame as delta table.
+#
+# @param target [Object]
+#   URI of a table or a DeltaTable object.
+# @param mode ["error", "append", "overwrite", "ignore", "merge"]
+#   How to handle existing data.
+# @param storage_options [Hash]
+#   Extra options for the storage backends supported by `deltalake-rb`.
+# @param delta_write_options [Hash]
+#   Additional keyword arguments while writing a Delta lake Table.
+# @param delta_merge_options [Hash]
+#   Keyword arguments which are required to `MERGE` a Delta lake Table.
+#
+# @return [nil]
+def write_delta(
+  target,
+  mode: "error",
+  storage_options: nil,
+  delta_write_options: nil,
+  delta_merge_options: nil
+)
+  Polars.send(:_check_if_delta_available)
+
+  if Utils.pathlike?(target)
+    target = Polars.send(:_resolve_delta_lake_uri, target.to_s, strict: false)
+  end
+
+  data = self
+
+  if mode == "merge"
+    if delta_merge_options.nil?
+      msg = "You need to pass delta_merge_options with at least a given predicate for `MERGE` to work."
+      raise ArgumentError, msg
+    end
+    if target.is_a?(::String)
+      dt = DeltaLake::Table.new(target, storage_options: storage_options)
+    else
+      dt = target
+    end
+
+    predicate = delta_merge_options.delete(:predicate)
+    dt.merge(data, predicate, **delta_merge_options)
+  else
+    delta_write_options ||= {}
+
+    DeltaLake.write(
+      target,
+      data,
+      mode: mode,
+      storage_options: storage_options,
+      **delta_write_options
+    )
+  end
+end
+
 # Return an estimation of the total (heap) allocated size of the DataFrame.
 #
 # Estimated size is given in the specified unit (bytes by default).
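Usage might look like the following sketch (the table path and predicate are invented). Note that per the code above, `mode: "merge"` returns the object produced by `dt.merge`, to be further configured and executed through deltalake-rb's API, while the other modes delegate to `DeltaLake.write`:

    df.write_delta("./events_table", mode: "append")

    df.write_delta(
      "./events_table",
      mode: "merge",
      delta_merge_options: {
        predicate: "s.id = t.id",  # required; removed from the hash before dt.merge
        source_alias: "s",         # remaining keys are splatted into dt.merge
        target_alias: "t"
      }
    )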
@@ -1037,6 +1159,10 @@ module Polars
 #
 # @param mapping [Hash]
 #   Key value pairs that map from old name to new name.
+# @param strict [Boolean]
+#   Validate that all column names exist in the current schema,
+#   and throw an exception if any do not. (Note that this parameter
+#   is a no-op when passing a function to `mapping`).
 #
 # @return [DataFrame]
 #
@@ -1060,8 +1186,8 @@ module Polars
 #   # │ 2     ┆ 7   ┆ b   │
 #   # │ 3     ┆ 8   ┆ c   │
 #   # └───────┴─────┴─────┘
-def rename(mapping)
-  lazy.rename(mapping).collect(no_optimization: true)
+def rename(mapping, strict: true)
+  lazy.rename(mapping, strict: strict).collect(no_optimization: true)
 end
 
 # Insert a Series at a certain column index. This operation is in place.
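With the new flag, names missing from the frame can be skipped instead of raising:

    df.rename({"foo" => "apple"})                                   # raises if "foo" is absent
    df.rename({"foo" => "apple", "missing" => "x"}, strict: false)  # "missing" is ignored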
@@ -2190,6 +2316,11 @@ module Polars
 # @param force_parallel [Boolean]
 #   Force the physical plan to evaluate the computation of both DataFrames up to
 #   the join in parallel.
+# @param coalesce [Boolean]
+#   Coalescing behavior (merging of join columns).
+#   - true: -> Always coalesce join columns.
+#   - false: -> Never coalesce join columns.
+#   Note that joining on any other expressions than `col` will turn off coalescing.
 #
 # @return [DataFrame]
 #
@@ -2243,7 +2374,8 @@ module Polars
   suffix: "_right",
   tolerance: nil,
   allow_parallel: true,
-  force_parallel: false
+  force_parallel: false,
+  coalesce: true
 )
   lazy
     .join_asof(
@@ -2258,7 +2390,8 @@ module Polars
       suffix: suffix,
       tolerance: tolerance,
       allow_parallel: allow_parallel,
-      force_parallel: force_parallel
+      force_parallel: force_parallel,
+      coalesce: coalesce
     )
     .collect(no_optimization: true)
 end
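A sketch of the new keyword on an as-of join (the frames and column name are invented; `on:` and `strategy:` are the method's existing keywords):

    quotes.join_asof(trades, on: "time", strategy: "backward", coalesce: false)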
@@ -2277,8 +2410,20 @@ module Polars
 #   Join strategy.
 # @param suffix [String]
 #   Suffix to append to columns with a duplicate name.
+# @param validate ['m:m', 'm:1', '1:m', '1:1']
+#   Checks if join is of specified type.
+#   * *many_to_many* - “m:m”: default, does not result in checks
+#   * *one_to_one* - “1:1”: check if join keys are unique in both left and right datasets
+#   * *one_to_many* - “1:m”: check if join keys are unique in left dataset
+#   * *many_to_one* - “m:1”: check if join keys are unique in right dataset
 # @param join_nulls [Boolean]
 #   Join on null values. By default null values will never produce matches.
+# @param coalesce [Boolean]
+#   Coalescing behavior (merging of join columns).
+#   - nil: -> join specific.
+#   - true: -> Always coalesce join columns.
+#   - false: -> Never coalesce join columns.
+#   Note that joining on any other expressions than `col` will turn off coalescing.
 #
 # @return [DataFrame]
 #
@@ -2361,7 +2506,16 @@ module Polars
 #   # ╞═════╪═════╪═════╡
 #   # │ 3   ┆ 8.0 ┆ c   │
 #   # └─────┴─────┴─────┘
-def join(other,
+def join(other,
+  left_on: nil,
+  right_on: nil,
+  on: nil,
+  how: "inner",
+  suffix: "_right",
+  validate: "m:m",
+  join_nulls: false,
+  coalesce: nil
+)
   lazy
     .join(
       other.lazy,
@@ -2370,7 +2524,9 @@ module Polars
       on: on,
       how: how,
       suffix: suffix,
-
+      validate: validate,
+      join_nulls: join_nulls,
+      coalesce: coalesce
     )
     .collect(no_optimization: true)
 end
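The eager method now simply forwards the new options to the lazy engine, e.g. (a sketch reusing the `ham` key from the doc example above; `other_df` is invented):

    df.join(other_df, on: "ham", how: "left", validate: "1:1", coalesce: true)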
@@ -2717,10 +2873,85 @@ module Polars
 #   Column to drop.
 #
 # @return [Series]
+#
+# @example
+#   df = Polars::DataFrame.new(
+#     {
+#       "foo" => [1, 2, 3],
+#       "bar" => [6, 7, 8],
+#       "ham" => ["a", "b", "c"]
+#     }
+#   )
+#   df.delete("ham")
+#   # =>
+#   # shape: (3,)
+#   # Series: 'ham' [str]
+#   # [
+#   #   "a"
+#   #   "b"
+#   #   "c"
+#   # ]
+#
+# @example
+#   df.delete("missing")
+#   # => nil
 def delete(name)
   drop_in_place(name) if include?(name)
 end
 
+# Cast DataFrame column(s) to the specified dtype(s).
+#
+# @param dtypes [Object]
+#   Mapping of column names (or selector) to dtypes, or a single dtype
+#   to which all columns will be cast.
+# @param strict [Boolean]
+#   Throw an error if a cast could not be done (for instance, due to an
+#   overflow).
+#
+# @return [DataFrame]
+#
+# @example Cast specific frame columns to the specified dtypes:
+#   df = Polars::DataFrame.new(
+#     {
+#       "foo" => [1, 2, 3],
+#       "bar" => [6.0, 7.0, 8.0],
+#       "ham" => [Date.new(2020, 1, 2), Date.new(2021, 3, 4), Date.new(2022, 5, 6)]
+#     }
+#   )
+#   df.cast({"foo" => Polars::Float32, "bar" => Polars::UInt8})
+#   # =>
+#   # shape: (3, 3)
+#   # ┌─────┬─────┬────────────┐
+#   # │ foo ┆ bar ┆ ham        │
+#   # │ --- ┆ --- ┆ ---        │
+#   # │ f32 ┆ u8  ┆ date       │
+#   # ╞═════╪═════╪════════════╡
+#   # │ 1.0 ┆ 6   ┆ 2020-01-02 │
+#   # │ 2.0 ┆ 7   ┆ 2021-03-04 │
+#   # │ 3.0 ┆ 8   ┆ 2022-05-06 │
+#   # └─────┴─────┴────────────┘
+#
+# @example Cast all frame columns matching one dtype (or dtype group) to another dtype:
+#   df.cast({Polars::Date => Polars::Datetime})
+#   # =>
+#   # shape: (3, 3)
+#   # ┌─────┬─────┬─────────────────────┐
+#   # │ foo ┆ bar ┆ ham                 │
+#   # │ --- ┆ --- ┆ ---                 │
+#   # │ i64 ┆ f64 ┆ datetime[μs]        │
+#   # ╞═════╪═════╪═════════════════════╡
+#   # │ 1   ┆ 6.0 ┆ 2020-01-02 00:00:00 │
+#   # │ 2   ┆ 7.0 ┆ 2021-03-04 00:00:00 │
+#   # │ 3   ┆ 8.0 ┆ 2022-05-06 00:00:00 │
+#   # └─────┴─────┴─────────────────────┘
+#
+# @example Cast all frame columns to the specified dtype:
+#   df.cast(Polars::String).to_h(as_series: false)
+#   # => {"foo"=>["1", "2", "3"], "bar"=>["6.0", "7.0", "8.0"], "ham"=>["2020-01-02", "2021-03-04", "2022-05-06"]}
+def cast(dtypes, strict: true)
+  lazy.cast(dtypes, strict: strict).collect(_eager: true)
+end
+
 # Create an empty copy of the current DataFrame.
 #
 # Returns a DataFrame with identical schema but no data.
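Since the `dtypes` mapping may also be keyed by a selector (per the doc comment above), something like this should work as well (a sketch):

    df.cast({Polars.cs.numeric => Polars::String}, strict: false)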
@@ -2775,6 +3006,57 @@ module Polars
 # Get the DataFrame as a Array of Series.
 #
 # @return [Array]
+#
+# @example
+#   df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
+#   df.get_columns
+#   # =>
+#   # [shape: (3,)
+#   # Series: 'foo' [i64]
+#   # [
+#   #   1
+#   #   2
+#   #   3
+#   # ], shape: (3,)
+#   # Series: 'bar' [i64]
+#   # [
+#   #   4
+#   #   5
+#   #   6
+#   # ]]
+#
+# @example
+#   df = Polars::DataFrame.new(
+#     {
+#       "a" => [1, 2, 3, 4],
+#       "b" => [0.5, 4, 10, 13],
+#       "c" => [true, true, false, true]
+#     }
+#   )
+#   df.get_columns
+#   # =>
+#   # [shape: (4,)
+#   # Series: 'a' [i64]
+#   # [
+#   #   1
+#   #   2
+#   #   3
+#   #   4
+#   # ], shape: (4,)
+#   # Series: 'b' [f64]
+#   # [
+#   #   0.5
+#   #   4.0
+#   #   10.0
+#   #   13.0
+#   # ], shape: (4,)
+#   # Series: 'c' [bool]
+#   # [
+#   #   true
+#   #   true
+#   #   false
+#   #   true
+#   # ]]
 def get_columns
   _df.get_columns.map { |s| Utils.wrap_s(s) }
 end
@@ -3083,7 +3365,7 @@ module Polars
 #       "c" => [2, 4, 6]
 #     }
 #   )
-#   df.unpivot(Polars
+#   df.unpivot(Polars.cs.numeric, index: "a")
 #   # =>
 #   # shape: (6, 3)
 #   # ┌─────┬──────────┬───────┐
@@ -3724,14 +4006,32 @@ module Polars
 #   # ╞═════╪═════╪═════╡
 #   # │ 3   ┆ 8   ┆ c   │
 #   # └─────┴─────┴─────┘
-def max
-
-
-
-
-
-
-
+def max
+  lazy.max.collect(_eager: true)
+end
+
+# Get the maximum value horizontally across columns.
+#
+# @return [Series]
+#
+# @example
+#   df = Polars::DataFrame.new(
+#     {
+#       "foo" => [1, 2, 3],
+#       "bar" => [4.0, 5.0, 6.0]
+#     }
+#   )
+#   df.max_horizontal
+#   # =>
+#   # shape: (3,)
+#   # Series: 'max' [f64]
+#   # [
+#   #   4.0
+#   #   5.0
+#   #   6.0
+#   # ]
+def max_horizontal
+  select(max: F.max_horizontal(F.all)).to_series
 end
 
 # Aggregate the columns of this DataFrame to their minimum value.
@@ -3756,22 +4056,35 @@ module Polars
 #   # ╞═════╪═════╪═════╡
 #   # │ 1   ┆ 6   ┆ a   │
 #   # └─────┴─────┴─────┘
-def min
-
-    lazy.min.collect(_eager: true)
-  elsif axis == 1
-    Utils.wrap_s(_df.min_horizontal)
-  else
-    raise ArgumentError, "Axis should be 0 or 1."
-  end
+def min
+  lazy.min.collect(_eager: true)
 end
 
-#
+# Get the minimum value horizontally across columns.
 #
-# @
-#
-# @
-#
+# @return [Series]
+#
+# @example
+#   df = Polars::DataFrame.new(
+#     {
+#       "foo" => [1, 2, 3],
+#       "bar" => [4.0, 5.0, 6.0]
+#     }
+#   )
+#   df.min_horizontal
+#   # =>
+#   # shape: (3,)
+#   # Series: 'min' [f64]
+#   # [
+#   #   1.0
+#   #   2.0
+#   #   3.0
+#   # ]
+def min_horizontal
+  select(min: F.min_horizontal(F.all)).to_series
+end
+
+# Aggregate the columns of this DataFrame to their sum value.
 #
 # @return [DataFrame]
 #
@@ -3793,35 +4106,42 @@ module Polars
 #   # ╞═════╪═════╪══════╡
 #   # │ 6   ┆ 21  ┆ null │
 #   # └─────┴─────┴──────┘
+def sum
+  lazy.sum.collect(_eager: true)
+end
+
+# Sum all values horizontally across columns.
+#
+# @param ignore_nulls [Boolean]
+#   Ignore null values (default).
+#   If set to `false`, any null value in the input will lead to a null output.
+#
+# @return [Series]
 #
 # @example
-#   df.
+#   df = Polars::DataFrame.new(
+#     {
+#       "foo" => [1, 2, 3],
+#       "bar" => [4.0, 5.0, 6.0]
+#     }
+#   )
+#   df.sum_horizontal
 #   # =>
 #   # shape: (3,)
-#   # Series: '
+#   # Series: 'sum' [f64]
 #   # [
-#   #
-#   #
-#   #
+#   #   5.0
+#   #   7.0
+#   #   9.0
 #   # ]
-def
-
-
-
-  when 1
-    Utils.wrap_s(_df.sum_horizontal(null_strategy))
-  else
-    raise ArgumentError, "Axis should be 0 or 1."
-  end
+def sum_horizontal(ignore_nulls: true)
+  select(
+    sum: F.sum_horizontal(F.all, ignore_nulls: ignore_nulls)
+  ).to_series
 end
 
 # Aggregate the columns of this DataFrame to their mean value.
 #
-# @param axis [Integer]
-#   Either 0 or 1.
-# @param null_strategy ["ignore", "propagate"]
-#   This argument is only used if axis == 1.
-#
 # @return [DataFrame]
 #
 # @example
@@ -3842,15 +4162,38 @@ module Polars
 #   # ╞═════╪═════╪══════╡
 #   # │ 2.0 ┆ 7.0 ┆ null │
 #   # └─────┴─────┴──────┘
-def mean
-
-
-
-
-
-
-
-
+def mean
+  lazy.mean.collect(_eager: true)
+end
+
+# Take the mean of all values horizontally across columns.
+#
+# @param ignore_nulls [Boolean]
+#   Ignore null values (default).
+#   If set to `false`, any null value in the input will lead to a null output.
+#
+# @return [Series]
+#
+# @example
+#   df = Polars::DataFrame.new(
+#     {
+#       "foo" => [1, 2, 3],
+#       "bar" => [4.0, 5.0, 6.0]
+#     }
+#   )
+#   df.mean_horizontal
+#   # =>
+#   # shape: (3,)
+#   # Series: 'mean' [f64]
+#   # [
+#   #   2.5
+#   #   3.5
+#   #   4.5
+#   # ]
+def mean_horizontal(ignore_nulls: true)
+  select(
+    mean: F.mean_horizontal(F.all, ignore_nulls: ignore_nulls)
+  ).to_series
 end
 
 # Aggregate the columns of this DataFrame to their standard deviation value.
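Taken together, these hunks drop the old `axis:`/`null_strategy:` keywords from the eager aggregations in favor of dedicated horizontal methods. A migration sketch (the 0.14-style calls are reconstructed from the removed branches above):

    # 0.14: df.sum(axis: 1, null_strategy: "ignore")
    df.sum_horizontal(ignore_nulls: true)
    # 0.14: df.max(axis: 1)
    df.max_horizontal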
@@ -4296,7 +4639,7 @@ module Polars
 # @example A horizontal string concatenation:
 #   df = Polars::DataFrame.new(
 #     {
-#       "a" => ["foo", "bar",
+#       "a" => ["foo", "bar", nil],
 #       "b" => [1, 2, 3],
 #       "c" => [1.0, 2.0, 3.0]
 #     }
@@ -4327,11 +4670,11 @@ module Polars
 #   # true
 #   # true
 #   # ]
-def fold
+def fold
   acc = to_series(0)
 
   1.upto(width - 1) do |i|
-    acc =
+    acc = yield(acc, to_series(i))
   end
   acc
 end
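The repaired `fold` yields the accumulator and each successive column, so a horizontal sum reads:

    df.fold { |s1, s2| s1 + s2 }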
@@ -4843,7 +5186,7 @@ module Polars
 end
 
 # @private
-def self.expand_hash_scalars(data, schema_overrides: nil, order: nil, nan_to_null: false)
+def self.expand_hash_scalars(data, schema_overrides: nil, strict: true, order: nil, nan_to_null: false)
   updated_data = {}
   unless data.empty?
     dtypes = schema_overrides || {}
|
|
4852
5195
|
data.each do |name, val|
|
4853
5196
|
dtype = dtypes[name]
|
4854
5197
|
if val.is_a?(Hash) && dtype != Struct
|
4855
|
-
updated_data[name] = DataFrame.new(val).to_struct(name)
|
5198
|
+
updated_data[name] = DataFrame.new(val, strict: strict).to_struct(name)
|
4856
5199
|
elsif !Utils.arrlen(val).nil?
|
4857
|
-
updated_data[name] = Series.new(::String.new(name), val, dtype: dtype)
|
5200
|
+
updated_data[name] = Series.new(::String.new(name), val, dtype: dtype, strict: strict)
|
4858
5201
|
elsif val.nil? || [Integer, Float, TrueClass, FalseClass, ::String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
|
4859
5202
|
dtype = Polars::Float64 if val.nil? && dtype.nil?
|
4860
|
-
updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
|
5203
|
+
updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype, strict: strict).extend_constant(val, array_len - 1)
|
4861
5204
|
else
|
4862
5205
|
raise Todo
|
4863
5206
|
end
|
4864
5207
|
end
|
4865
5208
|
elsif data.values.all? { |val| Utils.arrlen(val) == 0 }
|
4866
5209
|
data.each do |name, val|
|
4867
|
-
updated_data[name] = Series.new(name, val, dtype: dtypes[name])
|
5210
|
+
updated_data[name] = Series.new(name, val, dtype: dtypes[name], strict: strict)
|
4868
5211
|
end
|
4869
5212
|
elsif data.values.all? { |val| Utils.arrlen(val).nil? }
|
4870
5213
|
data.each do |name, val|
|
4871
|
-
updated_data[name] = Series.new(name, [val], dtype: dtypes[name])
|
5214
|
+
updated_data[name] = Series.new(name, [val], dtype: dtypes[name], strict: strict)
|
4872
5215
|
end
|
4873
5216
|
end
|
4874
5217
|
end
|
@@ -4876,7 +5219,7 @@ module Polars
 end
 
 # @private
-def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, nan_to_null: nil)
+def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, nan_to_null: nil)
   if schema.is_a?(Hash) && !data.empty?
     if !data.all? { |col, _| schema[col] }
       raise ArgumentError, "The given column-schema names do not match the data dictionary"
@@ -4893,9 +5236,9 @@ module Polars
 end
 
 if data.empty? && !schema_overrides.empty?
-  data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], nan_to_null: nan_to_null)._s }
+  data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], strict: strict, nan_to_null: nan_to_null)._s }
 else
-  data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, nan_to_null: nan_to_null).values.map(&:_s)
+  data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null).values.map(&:_s)
 end
 
 data_series = _handle_columns_arg(data_series, columns: column_names, from_hash: true)
@@ -4969,7 +5312,7 @@ module Polars
 end
 end
 
-def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil)
+def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil, strict: true)
   rbdf_columns = rbdf.columns
   rbdf_dtypes = rbdf.dtypes
   columns, dtypes = _unpack_schema(
@@ -4985,13 +5328,13 @@ module Polars
 end
 
 column_casts = []
-columns.
+columns.each_with_index do |col, i|
   if dtypes[col] == Categorical # != rbdf_dtypes[i]
-    column_casts << Polars.col(col).cast(Categorical)._rbexpr
+    column_casts << Polars.col(col).cast(Categorical, strict: strict)._rbexpr
   elsif structs&.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
-    column_casts << Polars.col(col).cast(structs[col])._rbexpr
+    column_casts << Polars.col(col).cast(structs[col], strict: strict)._rbexpr
   elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
-    column_casts << Polars.col(col).cast(dtypes[col])._rbexpr
+    column_casts << Polars.col(col).cast(dtypes[col], strict: strict)._rbexpr
   end
 end
 
|
|
5010
5353
|
end
|
5011
5354
|
|
5012
5355
|
# @private
|
5013
|
-
def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 50)
|
5014
|
-
raise Todo if schema_overrides
|
5356
|
+
def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: 50)
|
5015
5357
|
columns = schema
|
5016
5358
|
|
5017
5359
|
if data.length == 0
|
5018
|
-
return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
|
5360
|
+
return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides, strict: strict)
|
5019
5361
|
end
|
5020
5362
|
|
5021
5363
|
if data[0].is_a?(Series)
|
@@ -5028,7 +5370,7 @@ module Polars
 elsif data[0].is_a?(Hash)
   column_names, dtypes = _unpack_schema(columns)
   schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
-  rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides,
+  rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, strict, infer_schema_length)
   if column_names
     rbdf = _post_apply_columns(rbdf, column_names)
   end
@@ -5048,7 +5390,7 @@ module Polars
   schema, schema_overrides: schema_overrides, n_expected: first_element.length
 )
 local_schema_override = (
-  schema_overrides.any? ? (
+  schema_overrides.any? ? _include_unknowns(schema_overrides, column_names) : {}
 )
 if column_names.any? && first_element.length > 0 && first_element.length != column_names.length
   raise ArgumentError, "the row data does not match the number of columns"
@@ -5056,7 +5398,11 @@ module Polars
 
 unpack_nested = false
 local_schema_override.each do |col, tp|
-
+  if [Categorical, Enum].include?(tp)
+    local_schema_override[col] = String
+  elsif !unpack_nested && [Unknown, Struct].include?(tp.base_type)
+    raise Todo
+  end
 end
 
 if unpack_nested
@@ -5070,7 +5416,7 @@ module Polars
 end
 if column_names.any? || schema_overrides.any?
   rbdf = _post_apply_columns(
-    rbdf, column_names, schema_overrides: schema_overrides
+    rbdf, column_names, schema_overrides: schema_overrides, strict: strict
   )
 end
 return rbdf
@@ -5080,7 +5426,7 @@ module Polars
 )
 data_series =
   data.map.with_index do |element, i|
-    Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]])._s
+    Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]], strict: strict)._s
   end
 return RbDataFrame.new(data_series)
 else
@@ -5093,7 +5439,12 @@ module Polars
 end
 
 # @private
-def self.
+def self._include_unknowns(schema, cols)
+  cols.to_h { |col| [col, schema[col] || Unknown] }
+end
+
+# @private
+def self.series_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true)
   data_series = [data._s]
   series_name = data_series.map(&:name)
   column_names, schema_overrides = _unpack_schema(
@@ -5102,7 +5453,7 @@ module Polars
 if schema_overrides.any?
   new_dtype = schema_overrides.values[0]
   if new_dtype != data.dtype
-    data_series[0] = data_series[0].cast(new_dtype,
+    data_series[0] = data_series[0].cast(new_dtype, strict)
   end
 end
 