polars-df 0.13.0-x86_64-linux → 0.15.0-x86_64-linux
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +30 -0
- data/Cargo.lock +1368 -319
- data/LICENSE-THIRD-PARTY.txt +24801 -13447
- data/LICENSE.txt +1 -0
- data/README.md +1 -2
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/batched_csv_reader.rb +0 -2
- data/lib/polars/binary_expr.rb +133 -9
- data/lib/polars/binary_name_space.rb +101 -6
- data/lib/polars/config.rb +4 -0
- data/lib/polars/data_frame.rb +285 -62
- data/lib/polars/data_type_group.rb +28 -0
- data/lib/polars/data_types.rb +2 -0
- data/lib/polars/date_time_expr.rb +244 -0
- data/lib/polars/date_time_name_space.rb +87 -0
- data/lib/polars/expr.rb +109 -8
- data/lib/polars/functions/as_datatype.rb +51 -2
- data/lib/polars/functions/col.rb +1 -1
- data/lib/polars/functions/eager.rb +1 -3
- data/lib/polars/functions/lazy.rb +88 -10
- data/lib/polars/functions/range/time_range.rb +21 -21
- data/lib/polars/io/csv.rb +14 -16
- data/lib/polars/io/database.rb +2 -2
- data/lib/polars/io/ipc.rb +14 -12
- data/lib/polars/io/ndjson.rb +10 -0
- data/lib/polars/io/parquet.rb +168 -111
- data/lib/polars/lazy_frame.rb +649 -15
- data/lib/polars/list_name_space.rb +169 -0
- data/lib/polars/selectors.rb +1144 -0
- data/lib/polars/series.rb +470 -40
- data/lib/polars/string_cache.rb +27 -1
- data/lib/polars/string_expr.rb +0 -1
- data/lib/polars/string_name_space.rb +73 -3
- data/lib/polars/struct_name_space.rb +31 -7
- data/lib/polars/utils/various.rb +5 -1
- data/lib/polars/utils.rb +45 -10
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -1
- metadata +4 -3
- data/lib/polars/functions.rb +0 -57
data/lib/polars/data_frame.rb
CHANGED
@@ -8,17 +8,49 @@ module Polars
|
|
8
8
|
|
9
9
|
# Create a new DataFrame.
|
10
10
|
#
|
11
|
-
# @param data [
|
12
|
-
# Two-dimensional data in various forms
|
13
|
-
#
|
14
|
-
# @param
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
11
|
+
# @param data [Object]
|
12
|
+
# Two-dimensional data in various forms; hash input must contain arrays
|
13
|
+
# or a range. Arrays may contain Series or other arrays.
|
14
|
+
# @param schema [Object]
|
15
|
+
# The schema of the resulting DataFrame. The schema may be declared in several
|
16
|
+
# ways:
|
17
|
+
#
|
18
|
+
# * As a hash of name:type pairs; if type is nil, it will be auto-inferred.
|
19
|
+
# * As an array of column names; in this case types are automatically inferred.
|
20
|
+
# * As an array of (name,type) pairs; this is equivalent to the dictionary form.
|
21
|
+
#
|
22
|
+
# If you supply a list of column names that does not match the names in the
|
23
|
+
# underlying data, the names given here will overwrite them. The number
|
24
|
+
# of names given in the schema should match the underlying data dimensions.
|
25
|
+
#
|
26
|
+
# If set to `nil` (default), the schema is inferred from the data.
|
27
|
+
# @param schema_overrides [Hash]
|
28
|
+
# Support type specification or override of one or more columns; note that
|
29
|
+
# any dtypes inferred from the schema param will be overridden.
|
30
|
+
#
|
31
|
+
# The number of entries in the schema should match the underlying data
|
32
|
+
# dimensions, unless an array of hashes is being passed, in which case
|
33
|
+
# a *partial* schema can be declared to prevent specific fields from being loaded.
|
34
|
+
# @param strict [Boolean]
|
35
|
+
# Throw an error if any `data` value does not exactly match the given or inferred
|
36
|
+
# data type for that column. If set to `false`, values that do not match the data
|
37
|
+
# type are cast to that data type or, if casting is not possible, set to null
|
38
|
+
# instead.
|
39
|
+
# @param orient ["col", "row"]
|
40
|
+
# Whether to interpret two-dimensional data as columns or as rows. If nil,
|
19
41
|
# the orientation is inferred by matching the columns and data dimensions. If
|
20
42
|
# this does not yield conclusive results, column orientation is used.
|
21
|
-
|
43
|
+
# @param infer_schema_length [Integer]
|
44
|
+
# The maximum number of rows to scan for schema inference. If set to `nil`, the
|
45
|
+
# full data may be scanned *(this can be slow)*. This parameter only applies if
|
46
|
+
# the input data is a sequence or generator of rows; other input is read as-is.
|
47
|
+
# @param nan_to_null [Boolean]
|
48
|
+
# If the data comes from one or more Numo arrays, can optionally convert input
|
49
|
+
# data NaN values to null instead. This is a no-op for all other input data.
|
50
|
+
def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: 100, nan_to_null: false)
|
51
|
+
if schema && columns
|
52
|
+
warn "columns is ignored when schema is passed"
|
53
|
+
end
|
22
54
|
schema ||= columns
|
23
55
|
|
24
56
|
if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
|
@@ -29,11 +61,17 @@ module Polars
|
|
29
61
|
self._df = self.class.hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
|
30
62
|
elsif data.is_a?(Hash)
|
31
63
|
data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
|
32
|
-
self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
|
64
|
+
self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null)
|
33
65
|
elsif data.is_a?(::Array)
|
34
|
-
self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
|
66
|
+
self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, orient: orient, infer_schema_length: infer_schema_length)
|
35
67
|
elsif data.is_a?(Series)
|
36
|
-
self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
|
68
|
+
self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict)
|
69
|
+
elsif data.respond_to?(:arrow_c_stream)
|
70
|
+
# This uses the fact that RbSeries.from_arrow_c_stream will create a
|
71
|
+
# struct-typed Series. Then we unpack that to a DataFrame.
|
72
|
+
tmp_col_name = ""
|
73
|
+
s = Utils.wrap_s(RbSeries.from_arrow_c_stream(data))
|
74
|
+
self._df = s.to_frame(tmp_col_name).unnest(tmp_col_name)._df
|
37
75
|
else
|
38
76
|
raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
|
39
77
|
end
|
@@ -452,6 +490,11 @@ module Polars
|
|
452
490
|
end
|
453
491
|
end
|
454
492
|
|
493
|
+
# @private
|
494
|
+
def arrow_c_stream
|
495
|
+
_df.arrow_c_stream
|
496
|
+
end
|
497
|
+
|
455
498
|
# Return the dataframe as a scalar.
|
456
499
|
#
|
457
500
|
# Equivalent to `df[0,0]`, with a check that the shape is (1,1).
|
@@ -766,15 +809,18 @@ module Polars
|
|
766
809
|
# Compression method. Defaults to "uncompressed".
|
767
810
|
#
|
768
811
|
# @return [nil]
|
769
|
-
def write_avro(file, compression = "uncompressed")
|
812
|
+
def write_avro(file, compression = "uncompressed", name: "")
|
770
813
|
if compression.nil?
|
771
814
|
compression = "uncompressed"
|
772
815
|
end
|
773
816
|
if Utils.pathlike?(file)
|
774
817
|
file = Utils.normalize_filepath(file)
|
775
818
|
end
|
819
|
+
if name.nil?
|
820
|
+
name = ""
|
821
|
+
end
|
776
822
|
|
777
|
-
_df.write_avro(file, compression)
|
823
|
+
_df.write_avro(file, compression, name)
|
778
824
|
end
|
779
825
|
|
780
826
|
# Write to Arrow IPC binary stream or Feather file.
|
@@ -785,7 +831,7 @@ module Polars
|
|
785
831
|
# Compression method. Defaults to "uncompressed".
|
786
832
|
#
|
787
833
|
# @return [nil]
|
788
|
-
def write_ipc(file, compression: "uncompressed")
|
834
|
+
def write_ipc(file, compression: "uncompressed", compat_level: nil)
|
789
835
|
return_bytes = file.nil?
|
790
836
|
if return_bytes
|
791
837
|
file = StringIO.new
|
@@ -795,11 +841,15 @@ module Polars
|
|
795
841
|
file = Utils.normalize_filepath(file)
|
796
842
|
end
|
797
843
|
|
844
|
+
if compat_level.nil?
|
845
|
+
compat_level = true
|
846
|
+
end
|
847
|
+
|
798
848
|
if compression.nil?
|
799
849
|
compression = "uncompressed"
|
800
850
|
end
|
801
851
|
|
802
|
-
_df.write_ipc(file, compression)
|
852
|
+
_df.write_ipc(file, compression, compat_level)
|
803
853
|
return_bytes ? file.string : nil
|
804
854
|
end
|
805
855
|
|
@@ -826,7 +876,8 @@ module Polars
|
|
826
876
|
# df.write_ipc_stream("new_file.arrow")
|
827
877
|
def write_ipc_stream(
|
828
878
|
file,
|
829
|
-
compression: "uncompressed"
|
879
|
+
compression: "uncompressed",
|
880
|
+
compat_level: nil
|
830
881
|
)
|
831
882
|
return_bytes = file.nil?
|
832
883
|
if return_bytes
|
@@ -836,11 +887,15 @@ module Polars
|
|
836
887
|
file = Utils.normalize_filepath(file)
|
837
888
|
end
|
838
889
|
|
890
|
+
if compat_level.nil?
|
891
|
+
compat_level = true
|
892
|
+
end
|
893
|
+
|
839
894
|
if compression.nil?
|
840
895
|
compression = "uncompressed"
|
841
896
|
end
|
842
897
|
|
843
|
-
_df.write_ipc_stream(file, compression)
|
898
|
+
_df.write_ipc_stream(file, compression, compat_level)
|
844
899
|
return_bytes ? file.string : nil
|
845
900
|
end
|
846
901
|
|
@@ -1037,6 +1092,10 @@ module Polars
|
|
1037
1092
|
#
|
1038
1093
|
# @param mapping [Hash]
|
1039
1094
|
# Key value pairs that map from old name to new name.
|
1095
|
+
# @param strict [Boolean]
|
1096
|
+
# Validate that all column names exist in the current schema,
|
1097
|
+
# and throw an exception if any do not. (Note that this parameter
|
1098
|
+
# is a no-op when passing a function to `mapping`).
|
1040
1099
|
#
|
1041
1100
|
# @return [DataFrame]
|
1042
1101
|
#
|
@@ -1060,8 +1119,8 @@ module Polars
|
|
1060
1119
|
# # │ 2 ┆ 7 ┆ b │
|
1061
1120
|
# # │ 3 ┆ 8 ┆ c │
|
1062
1121
|
# # └───────┴─────┴─────┘
|
1063
|
-
def rename(mapping)
|
1064
|
-
lazy.rename(mapping).collect(no_optimization: true)
|
1122
|
+
def rename(mapping, strict: true)
|
1123
|
+
lazy.rename(mapping, strict: strict).collect(no_optimization: true)
|
1065
1124
|
end
|
1066
1125
|
|
1067
1126
|
# Insert a Series at a certain column index. This operation is in place.
|
@@ -2190,6 +2249,11 @@ module Polars
|
|
2190
2249
|
# @param force_parallel [Boolean]
|
2191
2250
|
# Force the physical plan to evaluate the computation of both DataFrames up to
|
2192
2251
|
# the join in parallel.
|
2252
|
+
# @param coalesce [Boolean]
|
2253
|
+
# Coalescing behavior (merging of join columns).
|
2254
|
+
# - true: -> Always coalesce join columns.
|
2255
|
+
# - false: -> Never coalesce join columns.
|
2256
|
+
# Note that joining on any other expressions than `col` will turn off coalescing.
|
2193
2257
|
#
|
2194
2258
|
# @return [DataFrame]
|
2195
2259
|
#
|
@@ -2243,7 +2307,8 @@ module Polars
|
|
2243
2307
|
suffix: "_right",
|
2244
2308
|
tolerance: nil,
|
2245
2309
|
allow_parallel: true,
|
2246
|
-
force_parallel: false
|
2310
|
+
force_parallel: false,
|
2311
|
+
coalesce: true
|
2247
2312
|
)
|
2248
2313
|
lazy
|
2249
2314
|
.join_asof(
|
@@ -2258,7 +2323,8 @@ module Polars
|
|
2258
2323
|
suffix: suffix,
|
2259
2324
|
tolerance: tolerance,
|
2260
2325
|
allow_parallel: allow_parallel,
|
2261
|
-
force_parallel: force_parallel
|
2326
|
+
force_parallel: force_parallel,
|
2327
|
+
coalesce: coalesce
|
2262
2328
|
)
|
2263
2329
|
.collect(no_optimization: true)
|
2264
2330
|
end
|
@@ -2277,8 +2343,20 @@ module Polars
|
|
2277
2343
|
# Join strategy.
|
2278
2344
|
# @param suffix [String]
|
2279
2345
|
# Suffix to append to columns with a duplicate name.
|
2346
|
+
# @param validate ['m:m', 'm:1', '1:m', '1:1']
|
2347
|
+
# Checks if join is of specified type.
|
2348
|
+
# * *many_to_many* - “m:m”: default, does not result in checks
|
2349
|
+
# * *one_to_one* - “1:1”: check if join keys are unique in both left and right datasets
|
2350
|
+
# * *one_to_many* - “1:m”: check if join keys are unique in left dataset
|
2351
|
+
# * *many_to_one* - “m:1”: check if join keys are unique in right dataset
|
2280
2352
|
# @param join_nulls [Boolean]
|
2281
2353
|
# Join on null values. By default null values will never produce matches.
|
2354
|
+
# @param coalesce [Boolean]
|
2355
|
+
# Coalescing behavior (merging of join columns).
|
2356
|
+
# - nil: -> join specific.
|
2357
|
+
# - true: -> Always coalesce join columns.
|
2358
|
+
# - false: -> Never coalesce join columns.
|
2359
|
+
# Note that joining on any other expressions than `col` will turn off coalescing.
|
2282
2360
|
#
|
2283
2361
|
# @return [DataFrame]
|
2284
2362
|
#
|
@@ -2361,7 +2439,16 @@ module Polars
|
|
2361
2439
|
# # ╞═════╪═════╪═════╡
|
2362
2440
|
# # │ 3 ┆ 8.0 ┆ c │
|
2363
2441
|
# # └─────┴─────┴─────┘
|
2364
|
-
def join(other,
|
2442
|
+
def join(other,
|
2443
|
+
left_on: nil,
|
2444
|
+
right_on: nil,
|
2445
|
+
on: nil,
|
2446
|
+
how: "inner",
|
2447
|
+
suffix: "_right",
|
2448
|
+
validate: "m:m",
|
2449
|
+
join_nulls: false,
|
2450
|
+
coalesce: nil
|
2451
|
+
)
|
2365
2452
|
lazy
|
2366
2453
|
.join(
|
2367
2454
|
other.lazy,
|
@@ -2370,7 +2457,9 @@ module Polars
|
|
2370
2457
|
on: on,
|
2371
2458
|
how: how,
|
2372
2459
|
suffix: suffix,
|
2373
|
-
|
2460
|
+
validate: validate,
|
2461
|
+
join_nulls: join_nulls,
|
2462
|
+
coalesce: coalesce
|
2374
2463
|
)
|
2375
2464
|
.collect(no_optimization: true)
|
2376
2465
|
end
|
@@ -2426,15 +2515,15 @@ module Polars
|
|
2426
2515
|
# df.map_rows { |t| t[0] * 2 + t[1] }
|
2427
2516
|
# # =>
|
2428
2517
|
# # shape: (3, 1)
|
2429
|
-
# #
|
2430
|
-
# # │
|
2431
|
-
# # │ ---
|
2432
|
-
# # │ i64
|
2433
|
-
# #
|
2434
|
-
# # │ 1
|
2435
|
-
# # │ 9
|
2436
|
-
# # │ 14
|
2437
|
-
# #
|
2518
|
+
# # ┌─────┐
|
2519
|
+
# # │ map │
|
2520
|
+
# # │ --- │
|
2521
|
+
# # │ i64 │
|
2522
|
+
# # ╞═════╡
|
2523
|
+
# # │ 1 │
|
2524
|
+
# # │ 9 │
|
2525
|
+
# # │ 14 │
|
2526
|
+
# # └─────┘
|
2438
2527
|
def map_rows(return_dtype: nil, inference_size: 256, &f)
|
2439
2528
|
out, is_df = _df.map_rows(f, return_dtype, inference_size)
|
2440
2529
|
if is_df
|
@@ -2717,10 +2806,85 @@ module Polars
|
|
2717
2806
|
# Column to drop.
|
2718
2807
|
#
|
2719
2808
|
# @return [Series]
|
2809
|
+
#
|
2810
|
+
# @example
|
2811
|
+
# df = Polars::DataFrame.new(
|
2812
|
+
# {
|
2813
|
+
# "foo" => [1, 2, 3],
|
2814
|
+
# "bar" => [6, 7, 8],
|
2815
|
+
# "ham" => ["a", "b", "c"]
|
2816
|
+
# }
|
2817
|
+
# )
|
2818
|
+
# df.delete("ham")
|
2819
|
+
# # =>
|
2820
|
+
# # shape: (3,)
|
2821
|
+
# # Series: 'ham' [str]
|
2822
|
+
# # [
|
2823
|
+
# # "a"
|
2824
|
+
# # "b"
|
2825
|
+
# # "c"
|
2826
|
+
# # ]
|
2827
|
+
#
|
2828
|
+
# @example
|
2829
|
+
# df.delete("missing")
|
2830
|
+
# # => nil
|
2720
2831
|
def delete(name)
|
2721
2832
|
drop_in_place(name) if include?(name)
|
2722
2833
|
end
|
2723
2834
|
|
2835
|
+
# Cast DataFrame column(s) to the specified dtype(s).
|
2836
|
+
#
|
2837
|
+
# @param dtypes [Object]
|
2838
|
+
# Mapping of column names (or selector) to dtypes, or a single dtype
|
2839
|
+
# to which all columns will be cast.
|
2840
|
+
# @param strict [Boolean]
|
2841
|
+
# Throw an error if a cast could not be done (for instance, due to an
|
2842
|
+
# overflow).
|
2843
|
+
#
|
2844
|
+
# @return [DataFrame]
|
2845
|
+
#
|
2846
|
+
# @example Cast specific frame columns to the specified dtypes:
|
2847
|
+
# df = Polars::DataFrame.new(
|
2848
|
+
# {
|
2849
|
+
# "foo" => [1, 2, 3],
|
2850
|
+
# "bar" => [6.0, 7.0, 8.0],
|
2851
|
+
# "ham" => [Date.new(2020, 1, 2), Date.new(2021, 3, 4), Date.new(2022, 5, 6)]
|
2852
|
+
# }
|
2853
|
+
# )
|
2854
|
+
# df.cast({"foo" => Polars::Float32, "bar" => Polars::UInt8})
|
2855
|
+
# # =>
|
2856
|
+
# # shape: (3, 3)
|
2857
|
+
# # ┌─────┬─────┬────────────┐
|
2858
|
+
# # │ foo ┆ bar ┆ ham │
|
2859
|
+
# # │ --- ┆ --- ┆ --- │
|
2860
|
+
# # │ f32 ┆ u8 ┆ date │
|
2861
|
+
# # ╞═════╪═════╪════════════╡
|
2862
|
+
# # │ 1.0 ┆ 6 ┆ 2020-01-02 │
|
2863
|
+
# # │ 2.0 ┆ 7 ┆ 2021-03-04 │
|
2864
|
+
# # │ 3.0 ┆ 8 ┆ 2022-05-06 │
|
2865
|
+
# # └─────┴─────┴────────────┘
|
2866
|
+
#
|
2867
|
+
# @example Cast all frame columns matching one dtype (or dtype group) to another dtype:
|
2868
|
+
# df.cast({Polars::Date => Polars::Datetime})
|
2869
|
+
# # =>
|
2870
|
+
# # shape: (3, 3)
|
2871
|
+
# # ┌─────┬─────┬─────────────────────┐
|
2872
|
+
# # │ foo ┆ bar ┆ ham │
|
2873
|
+
# # │ --- ┆ --- ┆ --- │
|
2874
|
+
# # │ i64 ┆ f64 ┆ datetime[μs] │
|
2875
|
+
# # ╞═════╪═════╪═════════════════════╡
|
2876
|
+
# # │ 1 ┆ 6.0 ┆ 2020-01-02 00:00:00 │
|
2877
|
+
# # │ 2 ┆ 7.0 ┆ 2021-03-04 00:00:00 │
|
2878
|
+
# # │ 3 ┆ 8.0 ┆ 2022-05-06 00:00:00 │
|
2879
|
+
# # └─────┴─────┴─────────────────────┘
|
2880
|
+
#
|
2881
|
+
# @example Cast all frame columns to the specified dtype:
|
2882
|
+
# df.cast(Polars::String).to_h(as_series: false)
|
2883
|
+
# # => {"foo"=>["1", "2", "3"], "bar"=>["6.0", "7.0", "8.0"], "ham"=>["2020-01-02", "2021-03-04", "2022-05-06"]}
|
2884
|
+
def cast(dtypes, strict: true)
|
2885
|
+
lazy.cast(dtypes, strict: strict).collect(_eager: true)
|
2886
|
+
end
|
2887
|
+
|
2724
2888
|
# Create an empty copy of the current DataFrame.
|
2725
2889
|
#
|
2726
2890
|
# Returns a DataFrame with identical schema but no data.
|
@@ -2775,6 +2939,57 @@ module Polars
|
|
2775
2939
|
# Get the DataFrame as a Array of Series.
|
2776
2940
|
#
|
2777
2941
|
# @return [Array]
|
2942
|
+
#
|
2943
|
+
# @example
|
2944
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
|
2945
|
+
# df.get_columns
|
2946
|
+
# # =>
|
2947
|
+
# # [shape: (3,)
|
2948
|
+
# # Series: 'foo' [i64]
|
2949
|
+
# # [
|
2950
|
+
# # 1
|
2951
|
+
# # 2
|
2952
|
+
# # 3
|
2953
|
+
# # ], shape: (3,)
|
2954
|
+
# # Series: 'bar' [i64]
|
2955
|
+
# # [
|
2956
|
+
# # 4
|
2957
|
+
# # 5
|
2958
|
+
# # 6
|
2959
|
+
# # ]]
|
2960
|
+
#
|
2961
|
+
# @example
|
2962
|
+
# df = Polars::DataFrame.new(
|
2963
|
+
# {
|
2964
|
+
# "a" => [1, 2, 3, 4],
|
2965
|
+
# "b" => [0.5, 4, 10, 13],
|
2966
|
+
# "c" => [true, true, false, true]
|
2967
|
+
# }
|
2968
|
+
# )
|
2969
|
+
# df.get_columns
|
2970
|
+
# # =>
|
2971
|
+
# # [shape: (4,)
|
2972
|
+
# # Series: 'a' [i64]
|
2973
|
+
# # [
|
2974
|
+
# # 1
|
2975
|
+
# # 2
|
2976
|
+
# # 3
|
2977
|
+
# # 4
|
2978
|
+
# # ], shape: (4,)
|
2979
|
+
# # Series: 'b' [f64]
|
2980
|
+
# # [
|
2981
|
+
# # 0.5
|
2982
|
+
# # 4.0
|
2983
|
+
# # 10.0
|
2984
|
+
# # 13.0
|
2985
|
+
# # ], shape: (4,)
|
2986
|
+
# # Series: 'c' [bool]
|
2987
|
+
# # [
|
2988
|
+
# # true
|
2989
|
+
# # true
|
2990
|
+
# # false
|
2991
|
+
# # true
|
2992
|
+
# # ]]
|
2778
2993
|
def get_columns
|
2779
2994
|
_df.get_columns.map { |s| Utils.wrap_s(s) }
|
2780
2995
|
end
|
@@ -3083,7 +3298,7 @@ module Polars
|
|
3083
3298
|
# "c" => [2, 4, 6]
|
3084
3299
|
# }
|
3085
3300
|
# )
|
3086
|
-
# df.unpivot(Polars
|
3301
|
+
# df.unpivot(Polars.cs.numeric, index: "a")
|
3087
3302
|
# # =>
|
3088
3303
|
# # shape: (6, 3)
|
3089
3304
|
# # ┌─────┬──────────┬───────┐
|
@@ -4234,7 +4449,7 @@ module Polars
|
|
4234
4449
|
if n.nil? && !frac.nil?
|
4235
4450
|
frac = Series.new("frac", [frac]) unless frac.is_a?(Series)
|
4236
4451
|
|
4237
|
-
_from_rbdf(
|
4452
|
+
return _from_rbdf(
|
4238
4453
|
_df.sample_frac(frac._s, with_replacement, shuffle, seed)
|
4239
4454
|
)
|
4240
4455
|
end
|
@@ -4296,7 +4511,7 @@ module Polars
|
|
4296
4511
|
# @example A horizontal string concatenation:
|
4297
4512
|
# df = Polars::DataFrame.new(
|
4298
4513
|
# {
|
4299
|
-
# "a" => ["foo", "bar",
|
4514
|
+
# "a" => ["foo", "bar", nil],
|
4300
4515
|
# "b" => [1, 2, 3],
|
4301
4516
|
# "c" => [1.0, 2.0, 3.0]
|
4302
4517
|
# }
|
@@ -4327,11 +4542,11 @@ module Polars
|
|
4327
4542
|
# # true
|
4328
4543
|
# # true
|
4329
4544
|
# # ]
|
4330
|
-
def fold
|
4545
|
+
def fold
|
4331
4546
|
acc = to_series(0)
|
4332
4547
|
|
4333
4548
|
1.upto(width - 1) do |i|
|
4334
|
-
acc =
|
4549
|
+
acc = yield(acc, to_series(i))
|
4335
4550
|
end
|
4336
4551
|
acc
|
4337
4552
|
end
|
@@ -4843,7 +5058,7 @@ module Polars
|
|
4843
5058
|
end
|
4844
5059
|
|
4845
5060
|
# @private
|
4846
|
-
def self.expand_hash_scalars(data, schema_overrides: nil, order: nil, nan_to_null: false)
|
5061
|
+
def self.expand_hash_scalars(data, schema_overrides: nil, strict: true, order: nil, nan_to_null: false)
|
4847
5062
|
updated_data = {}
|
4848
5063
|
unless data.empty?
|
4849
5064
|
dtypes = schema_overrides || {}
|
@@ -4852,23 +5067,23 @@ module Polars
|
|
4852
5067
|
data.each do |name, val|
|
4853
5068
|
dtype = dtypes[name]
|
4854
5069
|
if val.is_a?(Hash) && dtype != Struct
|
4855
|
-
updated_data[name] = DataFrame.new(val).to_struct(name)
|
5070
|
+
updated_data[name] = DataFrame.new(val, strict: strict).to_struct(name)
|
4856
5071
|
elsif !Utils.arrlen(val).nil?
|
4857
|
-
updated_data[name] = Series.new(::String.new(name), val, dtype: dtype)
|
5072
|
+
updated_data[name] = Series.new(::String.new(name), val, dtype: dtype, strict: strict)
|
4858
5073
|
elsif val.nil? || [Integer, Float, TrueClass, FalseClass, ::String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
|
4859
5074
|
dtype = Polars::Float64 if val.nil? && dtype.nil?
|
4860
|
-
updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
|
5075
|
+
updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype, strict: strict).extend_constant(val, array_len - 1)
|
4861
5076
|
else
|
4862
5077
|
raise Todo
|
4863
5078
|
end
|
4864
5079
|
end
|
4865
5080
|
elsif data.values.all? { |val| Utils.arrlen(val) == 0 }
|
4866
5081
|
data.each do |name, val|
|
4867
|
-
updated_data[name] = Series.new(name, val, dtype: dtypes[name])
|
5082
|
+
updated_data[name] = Series.new(name, val, dtype: dtypes[name], strict: strict)
|
4868
5083
|
end
|
4869
5084
|
elsif data.values.all? { |val| Utils.arrlen(val).nil? }
|
4870
5085
|
data.each do |name, val|
|
4871
|
-
updated_data[name] = Series.new(name, [val], dtype: dtypes[name])
|
5086
|
+
updated_data[name] = Series.new(name, [val], dtype: dtypes[name], strict: strict)
|
4872
5087
|
end
|
4873
5088
|
end
|
4874
5089
|
end
|
@@ -4876,7 +5091,7 @@ module Polars
|
|
4876
5091
|
end
|
4877
5092
|
|
4878
5093
|
# @private
|
4879
|
-
def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, nan_to_null: nil)
|
5094
|
+
def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, nan_to_null: nil)
|
4880
5095
|
if schema.is_a?(Hash) && !data.empty?
|
4881
5096
|
if !data.all? { |col, _| schema[col] }
|
4882
5097
|
raise ArgumentError, "The given column-schema names do not match the data dictionary"
|
@@ -4893,9 +5108,9 @@ module Polars
|
|
4893
5108
|
end
|
4894
5109
|
|
4895
5110
|
if data.empty? && !schema_overrides.empty?
|
4896
|
-
data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], nan_to_null: nan_to_null)._s }
|
5111
|
+
data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], strict: strict, nan_to_null: nan_to_null)._s }
|
4897
5112
|
else
|
4898
|
-
data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, nan_to_null: nan_to_null).values.map(&:_s)
|
5113
|
+
data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null).values.map(&:_s)
|
4899
5114
|
end
|
4900
5115
|
|
4901
5116
|
data_series = _handle_columns_arg(data_series, columns: column_names, from_hash: true)
|
@@ -4969,7 +5184,7 @@ module Polars
|
|
4969
5184
|
end
|
4970
5185
|
end
|
4971
5186
|
|
4972
|
-
def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil)
|
5187
|
+
def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil, strict: true)
|
4973
5188
|
rbdf_columns = rbdf.columns
|
4974
5189
|
rbdf_dtypes = rbdf.dtypes
|
4975
5190
|
columns, dtypes = _unpack_schema(
|
@@ -4985,13 +5200,13 @@ module Polars
|
|
4985
5200
|
end
|
4986
5201
|
|
4987
5202
|
column_casts = []
|
4988
|
-
columns.
|
5203
|
+
columns.each_with_index do |col, i|
|
4989
5204
|
if dtypes[col] == Categorical # != rbdf_dtypes[i]
|
4990
|
-
column_casts << Polars.col(col).cast(Categorical)._rbexpr
|
5205
|
+
column_casts << Polars.col(col).cast(Categorical, strict: strict)._rbexpr
|
4991
5206
|
elsif structs&.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
|
4992
|
-
column_casts << Polars.col(col).cast(structs[col])._rbexpr
|
5207
|
+
column_casts << Polars.col(col).cast(structs[col], strict: strict)._rbexpr
|
4993
5208
|
elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
|
4994
|
-
column_casts << Polars.col(col).cast(dtypes[col])._rbexpr
|
5209
|
+
column_casts << Polars.col(col).cast(dtypes[col], strict: strict)._rbexpr
|
4995
5210
|
end
|
4996
5211
|
end
|
4997
5212
|
|
@@ -5010,12 +5225,11 @@ module Polars
|
|
5010
5225
|
end
|
5011
5226
|
|
5012
5227
|
# @private
|
5013
|
-
def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 50)
|
5014
|
-
raise Todo if schema_overrides
|
5228
|
+
def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: 50)
|
5015
5229
|
columns = schema
|
5016
5230
|
|
5017
5231
|
if data.length == 0
|
5018
|
-
return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
|
5232
|
+
return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides, strict: strict)
|
5019
5233
|
end
|
5020
5234
|
|
5021
5235
|
if data[0].is_a?(Series)
|
@@ -5028,7 +5242,7 @@ module Polars
|
|
5028
5242
|
elsif data[0].is_a?(Hash)
|
5029
5243
|
column_names, dtypes = _unpack_schema(columns)
|
5030
5244
|
schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
|
5031
|
-
rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides,
|
5245
|
+
rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, strict, infer_schema_length)
|
5032
5246
|
if column_names
|
5033
5247
|
rbdf = _post_apply_columns(rbdf, column_names)
|
5034
5248
|
end
|
@@ -5048,7 +5262,7 @@ module Polars
|
|
5048
5262
|
schema, schema_overrides: schema_overrides, n_expected: first_element.length
|
5049
5263
|
)
|
5050
5264
|
local_schema_override = (
|
5051
|
-
schema_overrides.any? ? (
|
5265
|
+
schema_overrides.any? ? _include_unknowns(schema_overrides, column_names) : {}
|
5052
5266
|
)
|
5053
5267
|
if column_names.any? && first_element.length > 0 && first_element.length != column_names.length
|
5054
5268
|
raise ArgumentError, "the row data does not match the number of columns"
|
@@ -5056,7 +5270,11 @@ module Polars
|
|
5056
5270
|
|
5057
5271
|
unpack_nested = false
|
5058
5272
|
local_schema_override.each do |col, tp|
|
5059
|
-
|
5273
|
+
if [Categorical, Enum].include?(tp)
|
5274
|
+
local_schema_override[col] = String
|
5275
|
+
elsif !unpack_nested && [Unknown, Struct].include?(tp.base_type)
|
5276
|
+
raise Todo
|
5277
|
+
end
|
5060
5278
|
end
|
5061
5279
|
|
5062
5280
|
if unpack_nested
|
@@ -5070,7 +5288,7 @@ module Polars
|
|
5070
5288
|
end
|
5071
5289
|
if column_names.any? || schema_overrides.any?
|
5072
5290
|
rbdf = _post_apply_columns(
|
5073
|
-
rbdf, column_names, schema_overrides: schema_overrides
|
5291
|
+
rbdf, column_names, schema_overrides: schema_overrides, strict: strict
|
5074
5292
|
)
|
5075
5293
|
end
|
5076
5294
|
return rbdf
|
@@ -5080,7 +5298,7 @@ module Polars
|
|
5080
5298
|
)
|
5081
5299
|
data_series =
|
5082
5300
|
data.map.with_index do |element, i|
|
5083
|
-
Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]])._s
|
5301
|
+
Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]], strict: strict)._s
|
5084
5302
|
end
|
5085
5303
|
return RbDataFrame.new(data_series)
|
5086
5304
|
else
|
@@ -5093,7 +5311,12 @@ module Polars
|
|
5093
5311
|
end
|
5094
5312
|
|
5095
5313
|
# @private
|
5096
|
-
def self.
|
5314
|
+
def self._include_unknowns(schema, cols)
|
5315
|
+
cols.to_h { |col| [col, schema[col] || Unknown] }
|
5316
|
+
end
|
5317
|
+
|
5318
|
+
# @private
|
5319
|
+
def self.series_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true)
|
5097
5320
|
data_series = [data._s]
|
5098
5321
|
series_name = data_series.map(&:name)
|
5099
5322
|
column_names, schema_overrides = _unpack_schema(
|
@@ -5102,7 +5325,7 @@ module Polars
|
|
5102
5325
|
if schema_overrides.any?
|
5103
5326
|
new_dtype = schema_overrides.values[0]
|
5104
5327
|
if new_dtype != data.dtype
|
5105
|
-
data_series[0] = data_series[0].cast(new_dtype,
|
5328
|
+
data_series[0] = data_series[0].cast(new_dtype, strict)
|
5106
5329
|
end
|
5107
5330
|
end
|
5108
5331
|
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Polars
|
2
|
+
class DataTypeGroup < Set
|
3
|
+
end
|
4
|
+
|
5
|
+
SIGNED_INTEGER_DTYPES = DataTypeGroup.new(
|
6
|
+
[
|
7
|
+
Int8,
|
8
|
+
Int16,
|
9
|
+
Int32,
|
10
|
+
Int64
|
11
|
+
]
|
12
|
+
)
|
13
|
+
UNSIGNED_INTEGER_DTYPES = DataTypeGroup.new(
|
14
|
+
[
|
15
|
+
UInt8,
|
16
|
+
UInt16,
|
17
|
+
UInt32,
|
18
|
+
UInt64
|
19
|
+
]
|
20
|
+
)
|
21
|
+
INTEGER_DTYPES = (
|
22
|
+
SIGNED_INTEGER_DTYPES | UNSIGNED_INTEGER_DTYPES
|
23
|
+
)
|
24
|
+
FLOAT_DTYPES = DataTypeGroup.new([Float32, Float64])
|
25
|
+
NUMERIC_DTYPES = DataTypeGroup.new(
|
26
|
+
FLOAT_DTYPES + INTEGER_DTYPES | [Decimal]
|
27
|
+
)
|
28
|
+
end
|