polars-df 0.14.0-x64-mingw-ucrt → 0.16.0-x64-mingw-ucrt
- checksums.yaml +4 -4
- data/CHANGELOG.md +35 -0
- data/Cargo.lock +1523 -378
- data/LICENSE-THIRD-PARTY.txt +24369 -14580
- data/LICENSE.txt +1 -0
- data/README.md +38 -4
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/{3.1 → 3.4}/polars.so +0 -0
- data/lib/polars/batched_csv_reader.rb +0 -2
- data/lib/polars/binary_expr.rb +133 -9
- data/lib/polars/binary_name_space.rb +101 -6
- data/lib/polars/config.rb +4 -0
- data/lib/polars/data_frame.rb +452 -101
- data/lib/polars/data_type_group.rb +28 -0
- data/lib/polars/data_types.rb +3 -1
- data/lib/polars/date_time_expr.rb +244 -0
- data/lib/polars/date_time_name_space.rb +87 -0
- data/lib/polars/expr.rb +103 -2
- data/lib/polars/functions/aggregation/horizontal.rb +10 -4
- data/lib/polars/functions/as_datatype.rb +51 -2
- data/lib/polars/functions/col.rb +1 -1
- data/lib/polars/functions/eager.rb +1 -3
- data/lib/polars/functions/lazy.rb +95 -13
- data/lib/polars/functions/range/time_range.rb +21 -21
- data/lib/polars/io/csv.rb +14 -16
- data/lib/polars/io/database.rb +2 -2
- data/lib/polars/io/delta.rb +126 -0
- data/lib/polars/io/ipc.rb +14 -4
- data/lib/polars/io/ndjson.rb +10 -0
- data/lib/polars/io/parquet.rb +168 -111
- data/lib/polars/lazy_frame.rb +684 -20
- data/lib/polars/list_name_space.rb +169 -0
- data/lib/polars/selectors.rb +1226 -0
- data/lib/polars/series.rb +465 -35
- data/lib/polars/string_cache.rb +27 -1
- data/lib/polars/string_expr.rb +0 -1
- data/lib/polars/string_name_space.rb +73 -3
- data/lib/polars/struct_name_space.rb +31 -7
- data/lib/polars/utils/various.rb +5 -1
- data/lib/polars/utils.rb +45 -10
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +17 -1
- metadata +9 -8
- data/lib/polars/functions.rb +0 -57
data/lib/polars/data_frame.rb
CHANGED
@@ -8,17 +8,49 @@ module Polars

     # Create a new DataFrame.
     #
-    # @param data [
-    # Two-dimensional data in various forms
-    #
-    # @param
-    #
-    #
-    #
-    #
+    # @param data [Object]
+    #   Two-dimensional data in various forms; hash input must contain arrays
+    #   or a range. Arrays may contain Series or other arrays.
+    # @param schema [Object]
+    #   The schema of the resulting DataFrame. The schema may be declared in several
+    #   ways:
+    #
+    #   * As a hash of name:type pairs; if type is nil, it will be auto-inferred.
+    #   * As an array of column names; in this case types are automatically inferred.
+    #   * As an array of (name,type) pairs; this is equivalent to the dictionary form.
+    #
+    #   If you supply a list of column names that does not match the names in the
+    #   underlying data, the names given here will overwrite them. The number
+    #   of names given in the schema should match the underlying data dimensions.
+    #
+    #   If set to `nil` (default), the schema is inferred from the data.
+    # @param schema_overrides [Hash]
+    #   Support type specification or override of one or more columns; note that
+    #   any dtypes inferred from the schema param will be overridden.
+    #
+    #   The number of entries in the schema should match the underlying data
+    #   dimensions, unless an array of hashes is being passed, in which case
+    #   a *partial* schema can be declared to prevent specific fields from being loaded.
+    # @param strict [Boolean]
+    #   Throw an error if any `data` value does not exactly match the given or inferred
+    #   data type for that column. If set to `false`, values that do not match the data
+    #   type are cast to that data type or, if casting is not possible, set to null
+    #   instead.
+    # @param orient ["col", "row"]
+    #   Whether to interpret two-dimensional data as columns or as rows. If nil,
     #   the orientation is inferred by matching the columns and data dimensions. If
     #   this does not yield conclusive results, column orientation is used.
-
+    # @param infer_schema_length [Integer]
+    #   The maximum number of rows to scan for schema inference. If set to `nil`, the
+    #   full data may be scanned *(this can be slow)*. This parameter only applies if
+    #   the input data is a sequence or generator of rows; other input is read as-is.
+    # @param nan_to_null [Boolean]
+    #   If the data comes from one or more Numo arrays, can optionally convert input
+    #   data NaN values to null instead. This is a no-op for all other input data.
+    def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: 100, nan_to_null: false)
+      if schema && columns
+        warn "columns is ignored when schema is passed"
+      end
       schema ||= columns

       if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
@@ -29,11 +61,17 @@ module Polars
         self._df = self.class.hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
       elsif data.is_a?(Hash)
         data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
-        self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
+        self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null)
       elsif data.is_a?(::Array)
-        self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
+        self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, orient: orient, infer_schema_length: infer_schema_length)
       elsif data.is_a?(Series)
-        self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
+        self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict)
+      elsif data.respond_to?(:arrow_c_stream)
+        # This uses the fact that RbSeries.from_arrow_c_stream will create a
+        # struct-typed Series. Then we unpack that to a DataFrame.
+        tmp_col_name = ""
+        s = Utils.wrap_s(RbSeries.from_arrow_c_stream(data))
+        self._df = s.to_frame(tmp_col_name).unnest(tmp_col_name)._df
       else
         raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
       end
@@ -452,6 +490,11 @@ module Polars
       end
     end

+    # @private
+    def arrow_c_stream
+      _df.arrow_c_stream
+    end
+
     # Return the dataframe as a scalar.
     #
     # Equivalent to `df[0,0]`, with a check that the shape is (1,1).
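
The constructor now accepts `schema`, `schema_overrides`, and `strict` keywords, and any object exposing `arrow_c_stream` can be ingested directly. A minimal sketch (output dtypes are illustrative, not taken from the gem's test suite):

  # Declare the schema up front instead of relying on inference.
  df = Polars::DataFrame.new(
    {"id" => [1, 2, 3], "name" => ["a", "b", "c"]},
    schema: {"id" => Polars::UInt32, "name" => Polars::String}
  )

  # DataFrame itself now responds to #arrow_c_stream (see above), so a
  # frame can be round-tripped through the Arrow C stream interface.
  df2 = Polars::DataFrame.new(df)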
@@ -766,15 +809,18 @@ module Polars
     #   Compression method. Defaults to "uncompressed".
     #
     # @return [nil]
-    def write_avro(file, compression = "uncompressed")
+    def write_avro(file, compression = "uncompressed", name: "")
       if compression.nil?
         compression = "uncompressed"
       end
       if Utils.pathlike?(file)
         file = Utils.normalize_filepath(file)
       end
+      if name.nil?
+        name = ""
+      end

-      _df.write_avro(file, compression)
+      _df.write_avro(file, compression, name)
     end

     # Write to Arrow IPC binary stream or Feather file.
@@ -785,7 +831,13 @@ module Polars
     #   Compression method. Defaults to "uncompressed".
     #
     # @return [nil]
-    def write_ipc(
+    def write_ipc(
+      file,
+      compression: "uncompressed",
+      compat_level: nil,
+      storage_options: nil,
+      retries: 2
+    )
       return_bytes = file.nil?
       if return_bytes
         file = StringIO.new
@@ -795,11 +847,21 @@ module Polars
         file = Utils.normalize_filepath(file)
       end

+      if compat_level.nil?
+        compat_level = true
+      end
+
       if compression.nil?
         compression = "uncompressed"
       end

-
+      if storage_options&.any?
+        storage_options = storage_options.to_a
+      else
+        storage_options = nil
+      end
+
+      _df.write_ipc(file, compression, compat_level, storage_options, retries)
       return_bytes ? file.string : nil
     end

@@ -826,7 +888,8 @@ module Polars
     #   df.write_ipc_stream("new_file.arrow")
     def write_ipc_stream(
       file,
-      compression: "uncompressed"
+      compression: "uncompressed",
+      compat_level: nil
     )
       return_bytes = file.nil?
       if return_bytes
@@ -836,11 +899,15 @@ module Polars
         file = Utils.normalize_filepath(file)
       end

+      if compat_level.nil?
+        compat_level = true
+      end
+
       if compression.nil?
         compression = "uncompressed"
       end

-      _df.write_ipc_stream(file, compression)
+      _df.write_ipc_stream(file, compression, compat_level)
       return_bytes ? file.string : nil
     end

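
`write_ipc` and `write_ipc_stream` now accept `compat_level:` (and `write_ipc` additionally `storage_options:` and `retries:`). A minimal sketch (file name is a placeholder):

  df = Polars::DataFrame.new({"a" => [1, 2, 3]})

  # Write a compressed IPC file; compat_level: nil falls back to true.
  df.write_ipc("data.arrow", compression: "zstd")

  # Passing nil as the file returns the encoded bytes instead.
  bytes = df.write_ipc(nil)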
@@ -906,6 +973,61 @@ module Polars
       )
     end

+    # Write DataFrame as delta table.
+    #
+    # @param target [Object]
+    #   URI of a table or a DeltaTable object.
+    # @param mode ["error", "append", "overwrite", "ignore", "merge"]
+    #   How to handle existing data.
+    # @param storage_options [Hash]
+    #   Extra options for the storage backends supported by `deltalake-rb`.
+    # @param delta_write_options [Hash]
+    #   Additional keyword arguments while writing a Delta lake Table.
+    # @param delta_merge_options [Hash]
+    #   Keyword arguments which are required to `MERGE` a Delta lake Table.
+    #
+    # @return [nil]
+    def write_delta(
+      target,
+      mode: "error",
+      storage_options: nil,
+      delta_write_options: nil,
+      delta_merge_options: nil
+    )
+      Polars.send(:_check_if_delta_available)
+
+      if Utils.pathlike?(target)
+        target = Polars.send(:_resolve_delta_lake_uri, target.to_s, strict: false)
+      end
+
+      data = self
+
+      if mode == "merge"
+        if delta_merge_options.nil?
+          msg = "You need to pass delta_merge_options with at least a given predicate for `MERGE` to work."
+          raise ArgumentError, msg
+        end
+        if target.is_a?(::String)
+          dt = DeltaLake::Table.new(target, storage_options: storage_options)
+        else
+          dt = target
+        end
+
+        predicate = delta_merge_options.delete(:predicate)
+        dt.merge(data, predicate, **delta_merge_options)
+      else
+        delta_write_options ||= {}
+
+        DeltaLake.write(
+          target,
+          data,
+          mode: mode,
+          storage_options: storage_options,
+          **delta_write_options
+        )
+      end
+    end
+
     # Return an estimation of the total (heap) allocated size of the DataFrame.
     #
     # Estimated size is given in the specified unit (bytes by default).
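
A minimal sketch of the new `write_delta` (assumes the `deltalake-rb` gem is available; the path is a placeholder):

  df = Polars::DataFrame.new({"id" => [1, 2], "v" => [10, 20]})

  # Create or overwrite a Delta table at the given path.
  df.write_delta("./delta/events", mode: "overwrite")

  # Append a second batch to the same table.
  df.write_delta("./delta/events", mode: "append")

  # mode: "merge" raises ArgumentError unless delta_merge_options
  # includes at least a :predicate (see the guard above).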
@@ -1037,6 +1159,10 @@ module Polars
     #
     # @param mapping [Hash]
     #   Key value pairs that map from old name to new name.
+    # @param strict [Boolean]
+    #   Validate that all column names exist in the current schema,
+    #   and throw an exception if any do not. (Note that this parameter
+    #   is a no-op when passing a function to `mapping`).
     #
     # @return [DataFrame]
     #
@@ -1060,8 +1186,8 @@ module Polars
     #   # │ 2     ┆ 7   ┆ b   │
     #   # │ 3     ┆ 8   ┆ c   │
     #   # └───────┴─────┴─────┘
-    def rename(mapping)
-      lazy.rename(mapping).collect(no_optimization: true)
+    def rename(mapping, strict: true)
+      lazy.rename(mapping, strict: strict).collect(no_optimization: true)
     end

     # Insert a Series at a certain column index. This operation is in place.
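
With the new keyword, unknown names can be skipped instead of raising. A minimal sketch:

  df = Polars::DataFrame.new({"foo" => [1, 2, 3]})

  df.rename({"foo" => "apple"})                 # renames as before
  df.rename({"missing" => "x"}, strict: false)  # ignored instead of an error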
@@ -2190,6 +2316,11 @@ module Polars
     # @param force_parallel [Boolean]
     #   Force the physical plan to evaluate the computation of both DataFrames up to
     #   the join in parallel.
+    # @param coalesce [Boolean]
+    #   Coalescing behavior (merging of join columns).
+    #   - true: -> Always coalesce join columns.
+    #   - false: -> Never coalesce join columns.
+    #   Note that joining on any other expressions than `col` will turn off coalescing.
     #
     # @return [DataFrame]
     #
@@ -2243,7 +2374,8 @@ module Polars
       suffix: "_right",
       tolerance: nil,
       allow_parallel: true,
-      force_parallel: false
+      force_parallel: false,
+      coalesce: true
     )
       lazy
         .join_asof(
@@ -2258,7 +2390,8 @@ module Polars
           suffix: suffix,
           tolerance: tolerance,
           allow_parallel: allow_parallel,
-          force_parallel: force_parallel
+          force_parallel: force_parallel,
+          coalesce: coalesce
         )
         .collect(no_optimization: true)
     end
@@ -2277,8 +2410,20 @@ module Polars
     #   Join strategy.
     # @param suffix [String]
     #   Suffix to append to columns with a duplicate name.
+    # @param validate ['m:m', 'm:1', '1:m', '1:1']
+    #   Checks if join is of specified type.
+    #   * *many_to_many* - “m:m”: default, does not result in checks
+    #   * *one_to_one* - “1:1”: check if join keys are unique in both left and right datasets
+    #   * *one_to_many* - “1:m”: check if join keys are unique in left dataset
+    #   * *many_to_one* - “m:1”: check if join keys are unique in right dataset
     # @param join_nulls [Boolean]
     #   Join on null values. By default null values will never produce matches.
+    # @param coalesce [Boolean]
+    #   Coalescing behavior (merging of join columns).
+    #   - nil: -> join specific.
+    #   - true: -> Always coalesce join columns.
+    #   - false: -> Never coalesce join columns.
+    #   Note that joining on any other expressions than `col` will turn off coalescing.
     #
     # @return [DataFrame]
     #
@@ -2361,7 +2506,16 @@ module Polars
     #   # ╞═════╪═════╪═════╡
     #   # │ 3   ┆ 8.0 ┆ c   │
     #   # └─────┴─────┴─────┘
-    def join(other,
+    def join(other,
+      left_on: nil,
+      right_on: nil,
+      on: nil,
+      how: "inner",
+      suffix: "_right",
+      validate: "m:m",
+      join_nulls: false,
+      coalesce: nil
+    )
       lazy
         .join(
           other.lazy,
@@ -2370,7 +2524,9 @@ module Polars
           on: on,
           how: how,
           suffix: suffix,
-
+          validate: validate,
+          join_nulls: join_nulls,
+          coalesce: coalesce
         )
         .collect(no_optimization: true)
     end
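
A minimal sketch of the new `validate:` and `coalesce:` keywords (data is illustrative):

  left = Polars::DataFrame.new({"id" => [1, 2, 3], "x" => ["a", "b", "c"]})
  right = Polars::DataFrame.new({"id" => [1, 1, 2], "y" => [10, 11, 20]})

  # validate: "m:1" would raise here, since "id" is not unique on the right.
  # left.join(right, on: "id", validate: "m:1")

  # Keep both key columns instead of merging them.
  left.join(right, on: "id", coalesce: false)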
@@ -2717,10 +2873,85 @@ module Polars
     #   Column to drop.
     #
     # @return [Series]
+    #
+    # @example
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "foo" => [1, 2, 3],
+    #       "bar" => [6, 7, 8],
+    #       "ham" => ["a", "b", "c"]
+    #     }
+    #   )
+    #   df.delete("ham")
+    #   # =>
+    #   # shape: (3,)
+    #   # Series: 'ham' [str]
+    #   # [
+    #   #         "a"
+    #   #         "b"
+    #   #         "c"
+    #   # ]
+    #
+    # @example
+    #   df.delete("missing")
+    #   # => nil
     def delete(name)
       drop_in_place(name) if include?(name)
     end

+    # Cast DataFrame column(s) to the specified dtype(s).
+    #
+    # @param dtypes [Object]
+    #   Mapping of column names (or selector) to dtypes, or a single dtype
+    #   to which all columns will be cast.
+    # @param strict [Boolean]
+    #   Throw an error if a cast could not be done (for instance, due to an
+    #   overflow).
+    #
+    # @return [DataFrame]
+    #
+    # @example Cast specific frame columns to the specified dtypes:
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "foo" => [1, 2, 3],
+    #       "bar" => [6.0, 7.0, 8.0],
+    #       "ham" => [Date.new(2020, 1, 2), Date.new(2021, 3, 4), Date.new(2022, 5, 6)]
+    #     }
+    #   )
+    #   df.cast({"foo" => Polars::Float32, "bar" => Polars::UInt8})
+    #   # =>
+    #   # shape: (3, 3)
+    #   # ┌─────┬─────┬────────────┐
+    #   # │ foo ┆ bar ┆ ham        │
+    #   # │ --- ┆ --- ┆ ---        │
+    #   # │ f32 ┆ u8  ┆ date       │
+    #   # ╞═════╪═════╪════════════╡
+    #   # │ 1.0 ┆ 6   ┆ 2020-01-02 │
+    #   # │ 2.0 ┆ 7   ┆ 2021-03-04 │
+    #   # │ 3.0 ┆ 8   ┆ 2022-05-06 │
+    #   # └─────┴─────┴────────────┘
+    #
+    # @example Cast all frame columns matching one dtype (or dtype group) to another dtype:
+    #   df.cast({Polars::Date => Polars::Datetime})
+    #   # =>
+    #   # shape: (3, 3)
+    #   # ┌─────┬─────┬─────────────────────┐
+    #   # │ foo ┆ bar ┆ ham                 │
+    #   # │ --- ┆ --- ┆ ---                 │
+    #   # │ i64 ┆ f64 ┆ datetime[μs]        │
+    #   # ╞═════╪═════╪═════════════════════╡
+    #   # │ 1   ┆ 6.0 ┆ 2020-01-02 00:00:00 │
+    #   # │ 2   ┆ 7.0 ┆ 2021-03-04 00:00:00 │
+    #   # │ 3   ┆ 8.0 ┆ 2022-05-06 00:00:00 │
+    #   # └─────┴─────┴─────────────────────┘
+    #
+    # @example Cast all frame columns to the specified dtype:
+    #   df.cast(Polars::String).to_h(as_series: false)
+    #   # => {"foo"=>["1", "2", "3"], "bar"=>["6.0", "7.0", "8.0"], "ham"=>["2020-01-02", "2021-03-04", "2022-05-06"]}
+    def cast(dtypes, strict: true)
+      lazy.cast(dtypes, strict: strict).collect(_eager: true)
+    end
+
     # Create an empty copy of the current DataFrame.
     #
     # Returns a DataFrame with identical schema but no data.
@@ -2775,6 +3006,57 @@ module Polars
     # Get the DataFrame as a Array of Series.
     #
     # @return [Array]
+    #
+    # @example
+    #   df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
+    #   df.get_columns
+    #   # =>
+    #   # [shape: (3,)
+    #   # Series: 'foo' [i64]
+    #   # [
+    #   #         1
+    #   #         2
+    #   #         3
+    #   # ], shape: (3,)
+    #   # Series: 'bar' [i64]
+    #   # [
+    #   #         4
+    #   #         5
+    #   #         6
+    #   # ]]
+    #
+    # @example
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "a" => [1, 2, 3, 4],
+    #       "b" => [0.5, 4, 10, 13],
+    #       "c" => [true, true, false, true]
+    #     }
+    #   )
+    #   df.get_columns
+    #   # =>
+    #   # [shape: (4,)
+    #   # Series: 'a' [i64]
+    #   # [
+    #   #         1
+    #   #         2
+    #   #         3
+    #   #         4
+    #   # ], shape: (4,)
+    #   # Series: 'b' [f64]
+    #   # [
+    #   #         0.5
+    #   #         4.0
+    #   #         10.0
+    #   #         13.0
+    #   # ], shape: (4,)
+    #   # Series: 'c' [bool]
+    #   # [
+    #   #         true
+    #   #         true
+    #   #         false
+    #   #         true
+    #   # ]]
     def get_columns
       _df.get_columns.map { |s| Utils.wrap_s(s) }
     end
@@ -3083,7 +3365,7 @@ module Polars
     #       "c" => [2, 4, 6]
     #     }
     #   )
-    #   df.unpivot(Polars
+    #   df.unpivot(Polars.cs.numeric, index: "a")
     #   # =>
     #   # shape: (6, 3)
     #   # ┌─────┬──────────┬───────┐
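
The `Polars.cs.numeric` call above comes from the new selectors module (`selectors.rb` in the file list). A minimal sketch, assuming selectors are accepted wherever column expressions are:

  df = Polars::DataFrame.new(
    {"a" => ["x", "y", "z"], "b" => [1, 3, 5], "c" => [2, 4, 6]}
  )

  df.select(Polars.cs.numeric)               # only the numeric columns
  df.unpivot(Polars.cs.numeric, index: "a")  # same selector, as in the example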
@@ -3724,14 +4006,32 @@ module Polars
     #   # ╞═════╪═════╪═════╡
     #   # │ 3   ┆ 8   ┆ c   │
     #   # └─────┴─────┴─────┘
-    def max
-
-
-
-
-
-
-
+    def max
+      lazy.max.collect(_eager: true)
+    end
+
+    # Get the maximum value horizontally across columns.
+    #
+    # @return [Series]
+    #
+    # @example
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "foo" => [1, 2, 3],
+    #       "bar" => [4.0, 5.0, 6.0]
+    #     }
+    #   )
+    #   df.max_horizontal
+    #   # =>
+    #   # shape: (3,)
+    #   # Series: 'max' [f64]
+    #   # [
+    #   #         4.0
+    #   #         5.0
+    #   #         6.0
+    #   # ]
+    def max_horizontal
+      select(max: F.max_horizontal(F.all)).to_series
     end

     # Aggregate the columns of this DataFrame to their minimum value.
@@ -3756,22 +4056,35 @@ module Polars
     #   # ╞═════╪═════╪═════╡
     #   # │ 1   ┆ 6   ┆ a   │
     #   # └─────┴─────┴─────┘
-    def min
-
-        lazy.min.collect(_eager: true)
-      elsif axis == 1
-        Utils.wrap_s(_df.min_horizontal)
-      else
-        raise ArgumentError, "Axis should be 0 or 1."
-      end
+    def min
+      lazy.min.collect(_eager: true)
     end

-    #
+    # Get the minimum value horizontally across columns.
     #
-    # @
-    #
-    # @
-    #
+    # @return [Series]
+    #
+    # @example
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "foo" => [1, 2, 3],
+    #       "bar" => [4.0, 5.0, 6.0]
+    #     }
+    #   )
+    #   df.min_horizontal
+    #   # =>
+    #   # shape: (3,)
+    #   # Series: 'min' [f64]
+    #   # [
+    #   #         1.0
+    #   #         2.0
+    #   #         3.0
+    #   # ]
+    def min_horizontal
+      select(min: F.min_horizontal(F.all)).to_series
+    end
+
+    # Aggregate the columns of this DataFrame to their sum value.
     #
     # @return [DataFrame]
     #
@@ -3793,35 +4106,42 @@ module Polars
     #   # ╞═════╪═════╪══════╡
     #   # │ 6   ┆ 21  ┆ null │
     #   # └─────┴─────┴──────┘
+    def sum
+      lazy.sum.collect(_eager: true)
+    end
+
+    # Sum all values horizontally across columns.
+    #
+    # @param ignore_nulls [Boolean]
+    #   Ignore null values (default).
+    #   If set to `false`, any null value in the input will lead to a null output.
+    #
+    # @return [Series]
     #
     # @example
-    #   df.
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "foo" => [1, 2, 3],
+    #       "bar" => [4.0, 5.0, 6.0]
+    #     }
+    #   )
+    #   df.sum_horizontal
     #   # =>
     #   # shape: (3,)
-    #   # Series: '
+    #   # Series: 'sum' [f64]
     #   # [
-    #   #
-    #   #
-    #   #
+    #   #         5.0
+    #   #         7.0
+    #   #         9.0
     #   # ]
-    def
-
-
-
-      when 1
-        Utils.wrap_s(_df.sum_horizontal(null_strategy))
-      else
-        raise ArgumentError, "Axis should be 0 or 1."
-      end
+    def sum_horizontal(ignore_nulls: true)
+      select(
+        sum: F.sum_horizontal(F.all, ignore_nulls: ignore_nulls)
+      ).to_series
     end

     # Aggregate the columns of this DataFrame to their mean value.
     #
-    # @param axis [Integer]
-    #   Either 0 or 1.
-    # @param null_strategy ["ignore", "propagate"]
-    #   This argument is only used if axis == 1.
-    #
     # @return [DataFrame]
     #
     # @example
@@ -3842,15 +4162,38 @@ module Polars
     #   # ╞═════╪═════╪══════╡
     #   # │ 2.0 ┆ 7.0 ┆ null │
     #   # └─────┴─────┴──────┘
-    def mean
-
-
-
-
-
-
-
-
+    def mean
+      lazy.mean.collect(_eager: true)
+    end
+
+    # Take the mean of all values horizontally across columns.
+    #
+    # @param ignore_nulls [Boolean]
+    #   Ignore null values (default).
+    #   If set to `false`, any null value in the input will lead to a null output.
+    #
+    # @return [Series]
+    #
+    # @example
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "foo" => [1, 2, 3],
+    #       "bar" => [4.0, 5.0, 6.0]
+    #     }
+    #   )
+    #   df.mean_horizontal
+    #   # =>
+    #   # shape: (3,)
+    #   # Series: 'mean' [f64]
+    #   # [
+    #   #         2.5
+    #   #         3.5
+    #   #         4.5
+    #   # ]
+    def mean_horizontal(ignore_nulls: true)
+      select(
+        mean: F.mean_horizontal(F.all, ignore_nulls: ignore_nulls)
+      ).to_series
     end

     # Aggregate the columns of this DataFrame to their standard deviation value.
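
These horizontal variants replace the old `axis:`/`null_strategy:` keywords visible in the removed code. A minimal migration sketch:

  df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4.0, 5.0, nil]})

  df.sum                                  # column-wise, as before
  df.sum_horizontal                       # row-wise, was sum(axis: 1)
  df.sum_horizontal(ignore_nulls: false)  # was null_strategy: "propagate"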
@@ -4296,7 +4639,7 @@ module Polars
     # @example A horizontal string concatenation:
     #   df = Polars::DataFrame.new(
     #     {
-    #       "a" => ["foo", "bar",
+    #       "a" => ["foo", "bar", nil],
     #       "b" => [1, 2, 3],
     #       "c" => [1.0, 2.0, 3.0]
     #     }
@@ -4327,11 +4670,11 @@ module Polars
     #   # true
     #   # true
     #   # ]
-    def fold
+    def fold
       acc = to_series(0)

       1.upto(width - 1) do |i|
-        acc =
+        acc = yield(acc, to_series(i))
       end
       acc
     end
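
`fold` now yields the accumulator and each successive column to a block. A minimal sketch producing a row-wise sum:

  df = Polars::DataFrame.new({"a" => [1, 2], "b" => [10, 20], "c" => [100, 200]})

  totals = df.fold { |s1, s2| s1 + s2 }
  # => a Series containing [111, 222]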
@@ -4843,7 +5186,7 @@ module Polars
     end

     # @private
-    def self.expand_hash_scalars(data, schema_overrides: nil, order: nil, nan_to_null: false)
+    def self.expand_hash_scalars(data, schema_overrides: nil, strict: true, order: nil, nan_to_null: false)
       updated_data = {}
       unless data.empty?
         dtypes = schema_overrides || {}
@@ -4852,23 +5195,23 @@ module Polars
         data.each do |name, val|
           dtype = dtypes[name]
           if val.is_a?(Hash) && dtype != Struct
-            updated_data[name] = DataFrame.new(val).to_struct(name)
+            updated_data[name] = DataFrame.new(val, strict: strict).to_struct(name)
           elsif !Utils.arrlen(val).nil?
-            updated_data[name] = Series.new(::String.new(name), val, dtype: dtype)
+            updated_data[name] = Series.new(::String.new(name), val, dtype: dtype, strict: strict)
           elsif val.nil? || [Integer, Float, TrueClass, FalseClass, ::String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
             dtype = Polars::Float64 if val.nil? && dtype.nil?
-            updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
+            updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype, strict: strict).extend_constant(val, array_len - 1)
           else
             raise Todo
           end
         end
       elsif data.values.all? { |val| Utils.arrlen(val) == 0 }
         data.each do |name, val|
-          updated_data[name] = Series.new(name, val, dtype: dtypes[name])
+          updated_data[name] = Series.new(name, val, dtype: dtypes[name], strict: strict)
        end
       elsif data.values.all? { |val| Utils.arrlen(val).nil? }
         data.each do |name, val|
-          updated_data[name] = Series.new(name, [val], dtype: dtypes[name])
+          updated_data[name] = Series.new(name, [val], dtype: dtypes[name], strict: strict)
         end
       end
     end
@@ -4876,7 +5219,7 @@ module Polars
     end

     # @private
-    def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, nan_to_null: nil)
+    def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, nan_to_null: nil)
       if schema.is_a?(Hash) && !data.empty?
         if !data.all? { |col, _| schema[col] }
           raise ArgumentError, "The given column-schema names do not match the data dictionary"
@@ -4893,9 +5236,9 @@ module Polars
       end

       if data.empty? && !schema_overrides.empty?
-        data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], nan_to_null: nan_to_null)._s }
+        data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], strict: strict, nan_to_null: nan_to_null)._s }
       else
-        data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, nan_to_null: nan_to_null).values.map(&:_s)
+        data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null).values.map(&:_s)
       end

       data_series = _handle_columns_arg(data_series, columns: column_names, from_hash: true)
@@ -4969,7 +5312,7 @@ module Polars
       end
     end

-    def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil)
+    def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil, strict: true)
       rbdf_columns = rbdf.columns
       rbdf_dtypes = rbdf.dtypes
       columns, dtypes = _unpack_schema(
@@ -4985,13 +5328,13 @@ module Polars
       end

       column_casts = []
-      columns.
+      columns.each_with_index do |col, i|
         if dtypes[col] == Categorical # != rbdf_dtypes[i]
-          column_casts << Polars.col(col).cast(Categorical)._rbexpr
+          column_casts << Polars.col(col).cast(Categorical, strict: strict)._rbexpr
         elsif structs&.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
-          column_casts << Polars.col(col).cast(structs[col])._rbexpr
+          column_casts << Polars.col(col).cast(structs[col], strict: strict)._rbexpr
         elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
-          column_casts << Polars.col(col).cast(dtypes[col])._rbexpr
+          column_casts << Polars.col(col).cast(dtypes[col], strict: strict)._rbexpr
         end
       end

@@ -5010,12 +5353,11 @@ module Polars
     end

     # @private
-    def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 50)
-      raise Todo if schema_overrides
+    def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: 50)
       columns = schema

       if data.length == 0
-        return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
+        return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides, strict: strict)
       end

       if data[0].is_a?(Series)
@@ -5028,7 +5370,7 @@ module Polars
       elsif data[0].is_a?(Hash)
         column_names, dtypes = _unpack_schema(columns)
         schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
-        rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides,
+        rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, strict, infer_schema_length)
         if column_names
           rbdf = _post_apply_columns(rbdf, column_names)
         end
@@ -5048,7 +5390,7 @@ module Polars
           schema, schema_overrides: schema_overrides, n_expected: first_element.length
         )
         local_schema_override = (
-          schema_overrides.any? ? (
+          schema_overrides.any? ? _include_unknowns(schema_overrides, column_names) : {}
         )
         if column_names.any? && first_element.length > 0 && first_element.length != column_names.length
           raise ArgumentError, "the row data does not match the number of columns"
@@ -5056,7 +5398,11 @@ module Polars

         unpack_nested = false
         local_schema_override.each do |col, tp|
-
+          if [Categorical, Enum].include?(tp)
+            local_schema_override[col] = String
+          elsif !unpack_nested && [Unknown, Struct].include?(tp.base_type)
+            raise Todo
+          end
         end

         if unpack_nested
@@ -5070,7 +5416,7 @@ module Polars
         end
         if column_names.any? || schema_overrides.any?
           rbdf = _post_apply_columns(
-            rbdf, column_names, schema_overrides: schema_overrides
+            rbdf, column_names, schema_overrides: schema_overrides, strict: strict
           )
         end
         return rbdf
@@ -5080,7 +5426,7 @@ module Polars
         )
         data_series =
           data.map.with_index do |element, i|
-            Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]])._s
+            Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]], strict: strict)._s
           end
         return RbDataFrame.new(data_series)
       else
@@ -5093,7 +5439,12 @@ module Polars
     end

     # @private
-    def self.
+    def self._include_unknowns(schema, cols)
+      cols.to_h { |col| [col, schema[col] || Unknown] }
+    end
+
+    # @private
+    def self.series_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true)
       data_series = [data._s]
       series_name = data_series.map(&:name)
       column_names, schema_overrides = _unpack_schema(
@@ -5102,7 +5453,7 @@ module Polars
       if schema_overrides.any?
         new_dtype = schema_overrides.values[0]
         if new_dtype != data.dtype
-          data_series[0] = data_series[0].cast(new_dtype,
+          data_series[0] = data_series[0].cast(new_dtype, strict)
         end
       end

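
The `strict:` flag threaded through these private constructors implements the behavior documented on `initialize` above. A minimal sketch (the nulled-out value is illustrative):

  # strict: true (default) raises when a value does not match the schema;
  # strict: false casts it, or sets it to null when casting is impossible.
  Polars::DataFrame.new(
    {"a" => [1, 2, "three"]},
    schema: {"a" => Polars::Int64},
    strict: false
  )
  # => "a" becomes [1, 2, nil]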