polars-df 0.13.0-x64-mingw-ucrt → 0.15.0-x64-mingw-ucrt
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +30 -0
- data/Cargo.lock +1368 -319
- data/LICENSE-THIRD-PARTY.txt +24818 -14217
- data/LICENSE.txt +1 -0
- data/README.md +1 -2
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/batched_csv_reader.rb +0 -2
- data/lib/polars/binary_expr.rb +133 -9
- data/lib/polars/binary_name_space.rb +101 -6
- data/lib/polars/config.rb +4 -0
- data/lib/polars/data_frame.rb +285 -62
- data/lib/polars/data_type_group.rb +28 -0
- data/lib/polars/data_types.rb +2 -0
- data/lib/polars/date_time_expr.rb +244 -0
- data/lib/polars/date_time_name_space.rb +87 -0
- data/lib/polars/expr.rb +109 -8
- data/lib/polars/functions/as_datatype.rb +51 -2
- data/lib/polars/functions/col.rb +1 -1
- data/lib/polars/functions/eager.rb +1 -3
- data/lib/polars/functions/lazy.rb +88 -10
- data/lib/polars/functions/range/time_range.rb +21 -21
- data/lib/polars/io/csv.rb +14 -16
- data/lib/polars/io/database.rb +2 -2
- data/lib/polars/io/ipc.rb +14 -12
- data/lib/polars/io/ndjson.rb +10 -0
- data/lib/polars/io/parquet.rb +168 -111
- data/lib/polars/lazy_frame.rb +649 -15
- data/lib/polars/list_name_space.rb +169 -0
- data/lib/polars/selectors.rb +1144 -0
- data/lib/polars/series.rb +470 -40
- data/lib/polars/string_cache.rb +27 -1
- data/lib/polars/string_expr.rb +0 -1
- data/lib/polars/string_name_space.rb +73 -3
- data/lib/polars/struct_name_space.rb +31 -7
- data/lib/polars/utils/various.rb +5 -1
- data/lib/polars/utils.rb +45 -10
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -1
- metadata +4 -3
- data/lib/polars/functions.rb +0 -57
data/lib/polars/data_frame.rb
CHANGED
@@ -8,17 +8,49 @@ module Polars
|
|
8
8
|
|
9
9
|
# Create a new DataFrame.
|
10
10
|
#
|
11
|
-
# @param data [
|
12
|
-
# Two-dimensional data in various forms
|
13
|
-
#
|
14
|
-
# @param
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
11
|
+
# @param data [Object]
|
12
|
+
# Two-dimensional data in various forms; hash input must contain arrays
|
13
|
+
# or a range. Arrays may contain Series or other arrays.
|
14
|
+
# @param schema [Object]
|
15
|
+
# The schema of the resulting DataFrame. The schema may be declared in several
|
16
|
+
# ways:
|
17
|
+
#
|
18
|
+
# * As a hash of name:type pairs; if type is nil, it will be auto-inferred.
|
19
|
+
# * As an array of column names; in this case types are automatically inferred.
|
20
|
+
# * As an array of (name,type) pairs; this is equivalent to the dictionary form.
|
21
|
+
#
|
22
|
+
# If you supply a list of column names that does not match the names in the
|
23
|
+
# underlying data, the names given here will overwrite them. The number
|
24
|
+
# of names given in the schema should match the underlying data dimensions.
|
25
|
+
#
|
26
|
+
# If set to `nil` (default), the schema is inferred from the data.
|
27
|
+
# @param schema_overrides [Hash]
|
28
|
+
# Support type specification or override of one or more columns; note that
|
29
|
+
# any dtypes inferred from the schema param will be overridden.
|
30
|
+
#
|
31
|
+
# The number of entries in the schema should match the underlying data
|
32
|
+
# dimensions, unless an array of hashes is being passed, in which case
|
33
|
+
# a *partial* schema can be declared to prevent specific fields from being loaded.
|
34
|
+
# @param strict [Boolean]
|
35
|
+
# Throw an error if any `data` value does not exactly match the given or inferred
|
36
|
+
# data type for that column. If set to `false`, values that do not match the data
|
37
|
+
# type are cast to that data type or, if casting is not possible, set to null
|
38
|
+
# instead.
|
39
|
+
# @param orient ["col", "row"]
|
40
|
+
# Whether to interpret two-dimensional data as columns or as rows. If nil,
|
19
41
|
# the orientation is inferred by matching the columns and data dimensions. If
|
20
42
|
# this does not yield conclusive results, column orientation is used.
|
21
|
-
|
43
|
+
# @param infer_schema_length [Integer]
|
44
|
+
# The maximum number of rows to scan for schema inference. If set to `nil`, the
|
45
|
+
# full data may be scanned *(this can be slow)*. This parameter only applies if
|
46
|
+
# the input data is a sequence or generator of rows; other input is read as-is.
|
47
|
+
# @param nan_to_null [Boolean]
|
48
|
+
# If the data comes from one or more Numo arrays, can optionally convert input
|
49
|
+
# data NaN values to null instead. This is a no-op for all other input data.
|
50
|
+
def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: 100, nan_to_null: false)
|
51
|
+
if schema && columns
|
52
|
+
warn "columns is ignored when schema is passed"
|
53
|
+
end
|
22
54
|
schema ||= columns
|
23
55
|
|
24
56
|
if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
|
@@ -29,11 +61,17 @@ module Polars
|
|
29
61
|
self._df = self.class.hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
|
30
62
|
elsif data.is_a?(Hash)
|
31
63
|
data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
|
32
|
-
self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
|
64
|
+
self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null)
|
33
65
|
elsif data.is_a?(::Array)
|
34
|
-
self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
|
66
|
+
self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, orient: orient, infer_schema_length: infer_schema_length)
|
35
67
|
elsif data.is_a?(Series)
|
36
|
-
self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
|
68
|
+
self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict)
|
69
|
+
elsif data.respond_to?(:arrow_c_stream)
|
70
|
+
# This uses the fact that RbSeries.from_arrow_c_stream will create a
|
71
|
+
# struct-typed Series. Then we unpack that to a DataFrame.
|
72
|
+
tmp_col_name = ""
|
73
|
+
s = Utils.wrap_s(RbSeries.from_arrow_c_stream(data))
|
74
|
+
self._df = s.to_frame(tmp_col_name).unnest(tmp_col_name)._df
|
37
75
|
else
|
38
76
|
raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
|
39
77
|
end
|
@@ -452,6 +490,11 @@ module Polars
|
|
452
490
|
end
|
453
491
|
end
|
454
492
|
|
493
|
+
# @private
|
494
|
+
def arrow_c_stream
|
495
|
+
_df.arrow_c_stream
|
496
|
+
end
|
497
|
+
|
455
498
|
# Return the dataframe as a scalar.
|
456
499
|
#
|
457
500
|
# Equivalent to `df[0,0]`, with a check that the shape is (1,1).
|
@@ -766,15 +809,18 @@ module Polars
|
|
766
809
|
# Compression method. Defaults to "uncompressed".
|
767
810
|
#
|
768
811
|
# @return [nil]
|
769
|
-
def write_avro(file, compression = "uncompressed")
|
812
|
+
def write_avro(file, compression = "uncompressed", name: "")
|
770
813
|
if compression.nil?
|
771
814
|
compression = "uncompressed"
|
772
815
|
end
|
773
816
|
if Utils.pathlike?(file)
|
774
817
|
file = Utils.normalize_filepath(file)
|
775
818
|
end
|
819
|
+
if name.nil?
|
820
|
+
name = ""
|
821
|
+
end
|
776
822
|
|
777
|
-
_df.write_avro(file, compression)
|
823
|
+
_df.write_avro(file, compression, name)
|
778
824
|
end
|
779
825
|
|
780
826
|
# Write to Arrow IPC binary stream or Feather file.
|
@@ -785,7 +831,7 @@ module Polars
|
|
785
831
|
# Compression method. Defaults to "uncompressed".
|
786
832
|
#
|
787
833
|
# @return [nil]
|
788
|
-
def write_ipc(file, compression: "uncompressed")
|
834
|
+
def write_ipc(file, compression: "uncompressed", compat_level: nil)
|
789
835
|
return_bytes = file.nil?
|
790
836
|
if return_bytes
|
791
837
|
file = StringIO.new
|
@@ -795,11 +841,15 @@ module Polars
|
|
795
841
|
file = Utils.normalize_filepath(file)
|
796
842
|
end
|
797
843
|
|
844
|
+
if compat_level.nil?
|
845
|
+
compat_level = true
|
846
|
+
end
|
847
|
+
|
798
848
|
if compression.nil?
|
799
849
|
compression = "uncompressed"
|
800
850
|
end
|
801
851
|
|
802
|
-
_df.write_ipc(file, compression)
|
852
|
+
_df.write_ipc(file, compression, compat_level)
|
803
853
|
return_bytes ? file.string : nil
|
804
854
|
end
|
805
855
|
|
@@ -826,7 +876,8 @@ module Polars
|
|
826
876
|
# df.write_ipc_stream("new_file.arrow")
|
827
877
|
def write_ipc_stream(
|
828
878
|
file,
|
829
|
-
compression: "uncompressed"
|
879
|
+
compression: "uncompressed",
|
880
|
+
compat_level: nil
|
830
881
|
)
|
831
882
|
return_bytes = file.nil?
|
832
883
|
if return_bytes
|
@@ -836,11 +887,15 @@ module Polars
|
|
836
887
|
file = Utils.normalize_filepath(file)
|
837
888
|
end
|
838
889
|
|
890
|
+
if compat_level.nil?
|
891
|
+
compat_level = true
|
892
|
+
end
|
893
|
+
|
839
894
|
if compression.nil?
|
840
895
|
compression = "uncompressed"
|
841
896
|
end
|
842
897
|
|
843
|
-
_df.write_ipc_stream(file, compression)
|
898
|
+
_df.write_ipc_stream(file, compression, compat_level)
|
844
899
|
return_bytes ? file.string : nil
|
845
900
|
end
|
846
901
|
|
@@ -1037,6 +1092,10 @@ module Polars
|
|
1037
1092
|
#
|
1038
1093
|
# @param mapping [Hash]
|
1039
1094
|
# Key value pairs that map from old name to new name.
|
1095
|
+
# @param strict [Boolean]
|
1096
|
+
# Validate that all column names exist in the current schema,
|
1097
|
+
# and throw an exception if any do not. (Note that this parameter
|
1098
|
+
# is a no-op when passing a function to `mapping`).
|
1040
1099
|
#
|
1041
1100
|
# @return [DataFrame]
|
1042
1101
|
#
|
@@ -1060,8 +1119,8 @@ module Polars
|
|
1060
1119
|
# # │ 2 ┆ 7 ┆ b │
|
1061
1120
|
# # │ 3 ┆ 8 ┆ c │
|
1062
1121
|
# # └───────┴─────┴─────┘
|
1063
|
-
def rename(mapping)
|
1064
|
-
lazy.rename(mapping).collect(no_optimization: true)
|
1122
|
+
def rename(mapping, strict: true)
|
1123
|
+
lazy.rename(mapping, strict: strict).collect(no_optimization: true)
|
1065
1124
|
end
|
1066
1125
|
|
1067
1126
|
# Insert a Series at a certain column index. This operation is in place.
|
@@ -2190,6 +2249,11 @@ module Polars
|
|
2190
2249
|
# @param force_parallel [Boolean]
|
2191
2250
|
# Force the physical plan to evaluate the computation of both DataFrames up to
|
2192
2251
|
# the join in parallel.
|
2252
|
+
# @param coalesce [Boolean]
|
2253
|
+
# Coalescing behavior (merging of join columns).
|
2254
|
+
# - true: -> Always coalesce join columns.
|
2255
|
+
# - false: -> Never coalesce join columns.
|
2256
|
+
# Note that joining on any other expressions than `col` will turn off coalescing.
|
2193
2257
|
#
|
2194
2258
|
# @return [DataFrame]
|
2195
2259
|
#
|
@@ -2243,7 +2307,8 @@ module Polars
|
|
2243
2307
|
suffix: "_right",
|
2244
2308
|
tolerance: nil,
|
2245
2309
|
allow_parallel: true,
|
2246
|
-
force_parallel: false
|
2310
|
+
force_parallel: false,
|
2311
|
+
coalesce: true
|
2247
2312
|
)
|
2248
2313
|
lazy
|
2249
2314
|
.join_asof(
|
@@ -2258,7 +2323,8 @@ module Polars
|
|
2258
2323
|
suffix: suffix,
|
2259
2324
|
tolerance: tolerance,
|
2260
2325
|
allow_parallel: allow_parallel,
|
2261
|
-
force_parallel: force_parallel
|
2326
|
+
force_parallel: force_parallel,
|
2327
|
+
coalesce: coalesce
|
2262
2328
|
)
|
2263
2329
|
.collect(no_optimization: true)
|
2264
2330
|
end
|
@@ -2277,8 +2343,20 @@ module Polars
|
|
2277
2343
|
# Join strategy.
|
2278
2344
|
# @param suffix [String]
|
2279
2345
|
# Suffix to append to columns with a duplicate name.
|
2346
|
+
# @param validate ['m:m', 'm:1', '1:m', '1:1']
|
2347
|
+
# Checks if join is of specified type.
|
2348
|
+
# * *many_to_many* - “m:m”: default, does not result in checks
|
2349
|
+
# * *one_to_one* - “1:1”: check if join keys are unique in both left and right datasets
|
2350
|
+
# * *one_to_many* - “1:m”: check if join keys are unique in left dataset
|
2351
|
+
# * *many_to_one* - “m:1”: check if join keys are unique in right dataset
|
2280
2352
|
# @param join_nulls [Boolean]
|
2281
2353
|
# Join on null values. By default null values will never produce matches.
|
2354
|
+
# @param coalesce [Boolean]
|
2355
|
+
# Coalescing behavior (merging of join columns).
|
2356
|
+
# - nil: -> join specific.
|
2357
|
+
# - true: -> Always coalesce join columns.
|
2358
|
+
# - false: -> Never coalesce join columns.
|
2359
|
+
# Note that joining on any other expressions than `col` will turn off coalescing.
|
2282
2360
|
#
|
2283
2361
|
# @return [DataFrame]
|
2284
2362
|
#
|
@@ -2361,7 +2439,16 @@ module Polars
|
|
2361
2439
|
# # ╞═════╪═════╪═════╡
|
2362
2440
|
# # │ 3 ┆ 8.0 ┆ c │
|
2363
2441
|
# # └─────┴─────┴─────┘
|
2364
|
-
def join(other,
|
2442
|
+
def join(other,
|
2443
|
+
left_on: nil,
|
2444
|
+
right_on: nil,
|
2445
|
+
on: nil,
|
2446
|
+
how: "inner",
|
2447
|
+
suffix: "_right",
|
2448
|
+
validate: "m:m",
|
2449
|
+
join_nulls: false,
|
2450
|
+
coalesce: nil
|
2451
|
+
)
|
2365
2452
|
lazy
|
2366
2453
|
.join(
|
2367
2454
|
other.lazy,
|
@@ -2370,7 +2457,9 @@ module Polars
|
|
2370
2457
|
on: on,
|
2371
2458
|
how: how,
|
2372
2459
|
suffix: suffix,
|
2373
|
-
|
2460
|
+
validate: validate,
|
2461
|
+
join_nulls: join_nulls,
|
2462
|
+
coalesce: coalesce
|
2374
2463
|
)
|
2375
2464
|
.collect(no_optimization: true)
|
2376
2465
|
end
|
@@ -2426,15 +2515,15 @@ module Polars
|
|
2426
2515
|
# df.map_rows { |t| t[0] * 2 + t[1] }
|
2427
2516
|
# # =>
|
2428
2517
|
# # shape: (3, 1)
|
2429
|
-
# #
|
2430
|
-
# # │
|
2431
|
-
# # │ ---
|
2432
|
-
# # │ i64
|
2433
|
-
# #
|
2434
|
-
# # │ 1
|
2435
|
-
# # │ 9
|
2436
|
-
# # │ 14
|
2437
|
-
# #
|
2518
|
+
# # ┌─────┐
|
2519
|
+
# # │ map │
|
2520
|
+
# # │ --- │
|
2521
|
+
# # │ i64 │
|
2522
|
+
# # ╞═════╡
|
2523
|
+
# # │ 1 │
|
2524
|
+
# # │ 9 │
|
2525
|
+
# # │ 14 │
|
2526
|
+
# # └─────┘
|
2438
2527
|
def map_rows(return_dtype: nil, inference_size: 256, &f)
|
2439
2528
|
out, is_df = _df.map_rows(f, return_dtype, inference_size)
|
2440
2529
|
if is_df
|
@@ -2717,10 +2806,85 @@ module Polars
|
|
2717
2806
|
# Column to drop.
|
2718
2807
|
#
|
2719
2808
|
# @return [Series]
|
2809
|
+
#
|
2810
|
+
# @example
|
2811
|
+
# df = Polars::DataFrame.new(
|
2812
|
+
# {
|
2813
|
+
# "foo" => [1, 2, 3],
|
2814
|
+
# "bar" => [6, 7, 8],
|
2815
|
+
# "ham" => ["a", "b", "c"]
|
2816
|
+
# }
|
2817
|
+
# )
|
2818
|
+
# df.delete("ham")
|
2819
|
+
# # =>
|
2820
|
+
# # shape: (3,)
|
2821
|
+
# # Series: 'ham' [str]
|
2822
|
+
# # [
|
2823
|
+
# # "a"
|
2824
|
+
# # "b"
|
2825
|
+
# # "c"
|
2826
|
+
# # ]
|
2827
|
+
#
|
2828
|
+
# @example
|
2829
|
+
# df.delete("missing")
|
2830
|
+
# # => nil
|
2720
2831
|
def delete(name)
|
2721
2832
|
drop_in_place(name) if include?(name)
|
2722
2833
|
end
|
2723
2834
|
|
2835
|
+
# Cast DataFrame column(s) to the specified dtype(s).
|
2836
|
+
#
|
2837
|
+
# @param dtypes [Object]
|
2838
|
+
# Mapping of column names (or selector) to dtypes, or a single dtype
|
2839
|
+
# to which all columns will be cast.
|
2840
|
+
# @param strict [Boolean]
|
2841
|
+
# Throw an error if a cast could not be done (for instance, due to an
|
2842
|
+
# overflow).
|
2843
|
+
#
|
2844
|
+
# @return [DataFrame]
|
2845
|
+
#
|
2846
|
+
# @example Cast specific frame columns to the specified dtypes:
|
2847
|
+
# df = Polars::DataFrame.new(
|
2848
|
+
# {
|
2849
|
+
# "foo" => [1, 2, 3],
|
2850
|
+
# "bar" => [6.0, 7.0, 8.0],
|
2851
|
+
# "ham" => [Date.new(2020, 1, 2), Date.new(2021, 3, 4), Date.new(2022, 5, 6)]
|
2852
|
+
# }
|
2853
|
+
# )
|
2854
|
+
# df.cast({"foo" => Polars::Float32, "bar" => Polars::UInt8})
|
2855
|
+
# # =>
|
2856
|
+
# # shape: (3, 3)
|
2857
|
+
# # ┌─────┬─────┬────────────┐
|
2858
|
+
# # │ foo ┆ bar ┆ ham │
|
2859
|
+
# # │ --- ┆ --- ┆ --- │
|
2860
|
+
# # │ f32 ┆ u8 ┆ date │
|
2861
|
+
# # ╞═════╪═════╪════════════╡
|
2862
|
+
# # │ 1.0 ┆ 6 ┆ 2020-01-02 │
|
2863
|
+
# # │ 2.0 ┆ 7 ┆ 2021-03-04 │
|
2864
|
+
# # │ 3.0 ┆ 8 ┆ 2022-05-06 │
|
2865
|
+
# # └─────┴─────┴────────────┘
|
2866
|
+
#
|
2867
|
+
# @example Cast all frame columns matching one dtype (or dtype group) to another dtype:
|
2868
|
+
# df.cast({Polars::Date => Polars::Datetime})
|
2869
|
+
# # =>
|
2870
|
+
# # shape: (3, 3)
|
2871
|
+
# # ┌─────┬─────┬─────────────────────┐
|
2872
|
+
# # │ foo ┆ bar ┆ ham │
|
2873
|
+
# # │ --- ┆ --- ┆ --- │
|
2874
|
+
# # │ i64 ┆ f64 ┆ datetime[μs] │
|
2875
|
+
# # ╞═════╪═════╪═════════════════════╡
|
2876
|
+
# # │ 1 ┆ 6.0 ┆ 2020-01-02 00:00:00 │
|
2877
|
+
# # │ 2 ┆ 7.0 ┆ 2021-03-04 00:00:00 │
|
2878
|
+
# # │ 3 ┆ 8.0 ┆ 2022-05-06 00:00:00 │
|
2879
|
+
# # └─────┴─────┴─────────────────────┘
|
2880
|
+
#
|
2881
|
+
# @example Cast all frame columns to the specified dtype:
|
2882
|
+
# df.cast(Polars::String).to_h(as_series: false)
|
2883
|
+
# # => {"foo"=>["1", "2", "3"], "bar"=>["6.0", "7.0", "8.0"], "ham"=>["2020-01-02", "2021-03-04", "2022-05-06"]}
|
2884
|
+
def cast(dtypes, strict: true)
|
2885
|
+
lazy.cast(dtypes, strict: strict).collect(_eager: true)
|
2886
|
+
end
|
2887
|
+
|
2724
2888
|
# Create an empty copy of the current DataFrame.
|
2725
2889
|
#
|
2726
2890
|
# Returns a DataFrame with identical schema but no data.
|
@@ -2775,6 +2939,57 @@ module Polars
|
|
2775
2939
|
# Get the DataFrame as a Array of Series.
|
2776
2940
|
#
|
2777
2941
|
# @return [Array]
|
2942
|
+
#
|
2943
|
+
# @example
|
2944
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
|
2945
|
+
# df.get_columns
|
2946
|
+
# # =>
|
2947
|
+
# # [shape: (3,)
|
2948
|
+
# # Series: 'foo' [i64]
|
2949
|
+
# # [
|
2950
|
+
# # 1
|
2951
|
+
# # 2
|
2952
|
+
# # 3
|
2953
|
+
# # ], shape: (3,)
|
2954
|
+
# # Series: 'bar' [i64]
|
2955
|
+
# # [
|
2956
|
+
# # 4
|
2957
|
+
# # 5
|
2958
|
+
# # 6
|
2959
|
+
# # ]]
|
2960
|
+
#
|
2961
|
+
# @example
|
2962
|
+
# df = Polars::DataFrame.new(
|
2963
|
+
# {
|
2964
|
+
# "a" => [1, 2, 3, 4],
|
2965
|
+
# "b" => [0.5, 4, 10, 13],
|
2966
|
+
# "c" => [true, true, false, true]
|
2967
|
+
# }
|
2968
|
+
# )
|
2969
|
+
# df.get_columns
|
2970
|
+
# # =>
|
2971
|
+
# # [shape: (4,)
|
2972
|
+
# # Series: 'a' [i64]
|
2973
|
+
# # [
|
2974
|
+
# # 1
|
2975
|
+
# # 2
|
2976
|
+
# # 3
|
2977
|
+
# # 4
|
2978
|
+
# # ], shape: (4,)
|
2979
|
+
# # Series: 'b' [f64]
|
2980
|
+
# # [
|
2981
|
+
# # 0.5
|
2982
|
+
# # 4.0
|
2983
|
+
# # 10.0
|
2984
|
+
# # 13.0
|
2985
|
+
# # ], shape: (4,)
|
2986
|
+
# # Series: 'c' [bool]
|
2987
|
+
# # [
|
2988
|
+
# # true
|
2989
|
+
# # true
|
2990
|
+
# # false
|
2991
|
+
# # true
|
2992
|
+
# # ]]
|
2778
2993
|
def get_columns
|
2779
2994
|
_df.get_columns.map { |s| Utils.wrap_s(s) }
|
2780
2995
|
end
|
@@ -3083,7 +3298,7 @@ module Polars
|
|
3083
3298
|
# "c" => [2, 4, 6]
|
3084
3299
|
# }
|
3085
3300
|
# )
|
3086
|
-
# df.unpivot(Polars
|
3301
|
+
# df.unpivot(Polars.cs.numeric, index: "a")
|
3087
3302
|
# # =>
|
3088
3303
|
# # shape: (6, 3)
|
3089
3304
|
# # ┌─────┬──────────┬───────┐
|
@@ -4234,7 +4449,7 @@ module Polars
|
|
4234
4449
|
if n.nil? && !frac.nil?
|
4235
4450
|
frac = Series.new("frac", [frac]) unless frac.is_a?(Series)
|
4236
4451
|
|
4237
|
-
_from_rbdf(
|
4452
|
+
return _from_rbdf(
|
4238
4453
|
_df.sample_frac(frac._s, with_replacement, shuffle, seed)
|
4239
4454
|
)
|
4240
4455
|
end
|
@@ -4296,7 +4511,7 @@ module Polars
|
|
4296
4511
|
# @example A horizontal string concatenation:
|
4297
4512
|
# df = Polars::DataFrame.new(
|
4298
4513
|
# {
|
4299
|
-
# "a" => ["foo", "bar",
|
4514
|
+
# "a" => ["foo", "bar", nil],
|
4300
4515
|
# "b" => [1, 2, 3],
|
4301
4516
|
# "c" => [1.0, 2.0, 3.0]
|
4302
4517
|
# }
|
@@ -4327,11 +4542,11 @@ module Polars
|
|
4327
4542
|
# # true
|
4328
4543
|
# # true
|
4329
4544
|
# # ]
|
4330
|
-
def fold
|
4545
|
+
def fold
|
4331
4546
|
acc = to_series(0)
|
4332
4547
|
|
4333
4548
|
1.upto(width - 1) do |i|
|
4334
|
-
acc =
|
4549
|
+
acc = yield(acc, to_series(i))
|
4335
4550
|
end
|
4336
4551
|
acc
|
4337
4552
|
end
|
@@ -4843,7 +5058,7 @@ module Polars
|
|
4843
5058
|
end
|
4844
5059
|
|
4845
5060
|
# @private
|
4846
|
-
def self.expand_hash_scalars(data, schema_overrides: nil, order: nil, nan_to_null: false)
|
5061
|
+
def self.expand_hash_scalars(data, schema_overrides: nil, strict: true, order: nil, nan_to_null: false)
|
4847
5062
|
updated_data = {}
|
4848
5063
|
unless data.empty?
|
4849
5064
|
dtypes = schema_overrides || {}
|
@@ -4852,23 +5067,23 @@ module Polars
|
|
4852
5067
|
data.each do |name, val|
|
4853
5068
|
dtype = dtypes[name]
|
4854
5069
|
if val.is_a?(Hash) && dtype != Struct
|
4855
|
-
updated_data[name] = DataFrame.new(val).to_struct(name)
|
5070
|
+
updated_data[name] = DataFrame.new(val, strict: strict).to_struct(name)
|
4856
5071
|
elsif !Utils.arrlen(val).nil?
|
4857
|
-
updated_data[name] = Series.new(::String.new(name), val, dtype: dtype)
|
5072
|
+
updated_data[name] = Series.new(::String.new(name), val, dtype: dtype, strict: strict)
|
4858
5073
|
elsif val.nil? || [Integer, Float, TrueClass, FalseClass, ::String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
|
4859
5074
|
dtype = Polars::Float64 if val.nil? && dtype.nil?
|
4860
|
-
updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
|
5075
|
+
updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype, strict: strict).extend_constant(val, array_len - 1)
|
4861
5076
|
else
|
4862
5077
|
raise Todo
|
4863
5078
|
end
|
4864
5079
|
end
|
4865
5080
|
elsif data.values.all? { |val| Utils.arrlen(val) == 0 }
|
4866
5081
|
data.each do |name, val|
|
4867
|
-
updated_data[name] = Series.new(name, val, dtype: dtypes[name])
|
5082
|
+
updated_data[name] = Series.new(name, val, dtype: dtypes[name], strict: strict)
|
4868
5083
|
end
|
4869
5084
|
elsif data.values.all? { |val| Utils.arrlen(val).nil? }
|
4870
5085
|
data.each do |name, val|
|
4871
|
-
updated_data[name] = Series.new(name, [val], dtype: dtypes[name])
|
5086
|
+
updated_data[name] = Series.new(name, [val], dtype: dtypes[name], strict: strict)
|
4872
5087
|
end
|
4873
5088
|
end
|
4874
5089
|
end
|
@@ -4876,7 +5091,7 @@ module Polars
|
|
4876
5091
|
end
|
4877
5092
|
|
4878
5093
|
# @private
|
4879
|
-
def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, nan_to_null: nil)
|
5094
|
+
def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, nan_to_null: nil)
|
4880
5095
|
if schema.is_a?(Hash) && !data.empty?
|
4881
5096
|
if !data.all? { |col, _| schema[col] }
|
4882
5097
|
raise ArgumentError, "The given column-schema names do not match the data dictionary"
|
@@ -4893,9 +5108,9 @@ module Polars
|
|
4893
5108
|
end
|
4894
5109
|
|
4895
5110
|
if data.empty? && !schema_overrides.empty?
|
4896
|
-
data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], nan_to_null: nan_to_null)._s }
|
5111
|
+
data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], strict: strict, nan_to_null: nan_to_null)._s }
|
4897
5112
|
else
|
4898
|
-
data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, nan_to_null: nan_to_null).values.map(&:_s)
|
5113
|
+
data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null).values.map(&:_s)
|
4899
5114
|
end
|
4900
5115
|
|
4901
5116
|
data_series = _handle_columns_arg(data_series, columns: column_names, from_hash: true)
|
@@ -4969,7 +5184,7 @@ module Polars
|
|
4969
5184
|
end
|
4970
5185
|
end
|
4971
5186
|
|
4972
|
-
def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil)
|
5187
|
+
def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil, strict: true)
|
4973
5188
|
rbdf_columns = rbdf.columns
|
4974
5189
|
rbdf_dtypes = rbdf.dtypes
|
4975
5190
|
columns, dtypes = _unpack_schema(
|
@@ -4985,13 +5200,13 @@ module Polars
|
|
4985
5200
|
end
|
4986
5201
|
|
4987
5202
|
column_casts = []
|
4988
|
-
columns.
|
5203
|
+
columns.each_with_index do |col, i|
|
4989
5204
|
if dtypes[col] == Categorical # != rbdf_dtypes[i]
|
4990
|
-
column_casts << Polars.col(col).cast(Categorical)._rbexpr
|
5205
|
+
column_casts << Polars.col(col).cast(Categorical, strict: strict)._rbexpr
|
4991
5206
|
elsif structs&.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
|
4992
|
-
column_casts << Polars.col(col).cast(structs[col])._rbexpr
|
5207
|
+
column_casts << Polars.col(col).cast(structs[col], strict: strict)._rbexpr
|
4993
5208
|
elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
|
4994
|
-
column_casts << Polars.col(col).cast(dtypes[col])._rbexpr
|
5209
|
+
column_casts << Polars.col(col).cast(dtypes[col], strict: strict)._rbexpr
|
4995
5210
|
end
|
4996
5211
|
end
|
4997
5212
|
|
@@ -5010,12 +5225,11 @@ module Polars
|
|
5010
5225
|
end
|
5011
5226
|
|
5012
5227
|
# @private
|
5013
|
-
def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 50)
|
5014
|
-
raise Todo if schema_overrides
|
5228
|
+
def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: 50)
|
5015
5229
|
columns = schema
|
5016
5230
|
|
5017
5231
|
if data.length == 0
|
5018
|
-
return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
|
5232
|
+
return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides, strict: strict)
|
5019
5233
|
end
|
5020
5234
|
|
5021
5235
|
if data[0].is_a?(Series)
|
@@ -5028,7 +5242,7 @@ module Polars
|
|
5028
5242
|
elsif data[0].is_a?(Hash)
|
5029
5243
|
column_names, dtypes = _unpack_schema(columns)
|
5030
5244
|
schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
|
5031
|
-
rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides,
|
5245
|
+
rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, strict, infer_schema_length)
|
5032
5246
|
if column_names
|
5033
5247
|
rbdf = _post_apply_columns(rbdf, column_names)
|
5034
5248
|
end
|
@@ -5048,7 +5262,7 @@ module Polars
|
|
5048
5262
|
schema, schema_overrides: schema_overrides, n_expected: first_element.length
|
5049
5263
|
)
|
5050
5264
|
local_schema_override = (
|
5051
|
-
schema_overrides.any? ? (
|
5265
|
+
schema_overrides.any? ? _include_unknowns(schema_overrides, column_names) : {}
|
5052
5266
|
)
|
5053
5267
|
if column_names.any? && first_element.length > 0 && first_element.length != column_names.length
|
5054
5268
|
raise ArgumentError, "the row data does not match the number of columns"
|
@@ -5056,7 +5270,11 @@ module Polars
|
|
5056
5270
|
|
5057
5271
|
unpack_nested = false
|
5058
5272
|
local_schema_override.each do |col, tp|
|
5059
|
-
|
5273
|
+
if [Categorical, Enum].include?(tp)
|
5274
|
+
local_schema_override[col] = String
|
5275
|
+
elsif !unpack_nested && [Unknown, Struct].include?(tp.base_type)
|
5276
|
+
raise Todo
|
5277
|
+
end
|
5060
5278
|
end
|
5061
5279
|
|
5062
5280
|
if unpack_nested
|
@@ -5070,7 +5288,7 @@ module Polars
|
|
5070
5288
|
end
|
5071
5289
|
if column_names.any? || schema_overrides.any?
|
5072
5290
|
rbdf = _post_apply_columns(
|
5073
|
-
rbdf, column_names, schema_overrides: schema_overrides
|
5291
|
+
rbdf, column_names, schema_overrides: schema_overrides, strict: strict
|
5074
5292
|
)
|
5075
5293
|
end
|
5076
5294
|
return rbdf
|
@@ -5080,7 +5298,7 @@ module Polars
|
|
5080
5298
|
)
|
5081
5299
|
data_series =
|
5082
5300
|
data.map.with_index do |element, i|
|
5083
|
-
Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]])._s
|
5301
|
+
Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]], strict: strict)._s
|
5084
5302
|
end
|
5085
5303
|
return RbDataFrame.new(data_series)
|
5086
5304
|
else
|
@@ -5093,7 +5311,12 @@ module Polars
|
|
5093
5311
|
end
|
5094
5312
|
|
5095
5313
|
# @private
|
5096
|
-
def self.
|
5314
|
+
def self._include_unknowns(schema, cols)
|
5315
|
+
cols.to_h { |col| [col, schema[col] || Unknown] }
|
5316
|
+
end
|
5317
|
+
|
5318
|
+
# @private
|
5319
|
+
def self.series_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true)
|
5097
5320
|
data_series = [data._s]
|
5098
5321
|
series_name = data_series.map(&:name)
|
5099
5322
|
column_names, schema_overrides = _unpack_schema(
|
@@ -5102,7 +5325,7 @@ module Polars
|
|
5102
5325
|
if schema_overrides.any?
|
5103
5326
|
new_dtype = schema_overrides.values[0]
|
5104
5327
|
if new_dtype != data.dtype
|
5105
|
-
data_series[0] = data_series[0].cast(new_dtype,
|
5328
|
+
data_series[0] = data_series[0].cast(new_dtype, strict)
|
5106
5329
|
end
|
5107
5330
|
end
|
5108
5331
|
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Polars
|
2
|
+
class DataTypeGroup < Set
|
3
|
+
end
|
4
|
+
|
5
|
+
SIGNED_INTEGER_DTYPES = DataTypeGroup.new(
|
6
|
+
[
|
7
|
+
Int8,
|
8
|
+
Int16,
|
9
|
+
Int32,
|
10
|
+
Int64
|
11
|
+
]
|
12
|
+
)
|
13
|
+
UNSIGNED_INTEGER_DTYPES = DataTypeGroup.new(
|
14
|
+
[
|
15
|
+
UInt8,
|
16
|
+
UInt16,
|
17
|
+
UInt32,
|
18
|
+
UInt64
|
19
|
+
]
|
20
|
+
)
|
21
|
+
INTEGER_DTYPES = (
|
22
|
+
SIGNED_INTEGER_DTYPES | UNSIGNED_INTEGER_DTYPES
|
23
|
+
)
|
24
|
+
FLOAT_DTYPES = DataTypeGroup.new([Float32, Float64])
|
25
|
+
NUMERIC_DTYPES = DataTypeGroup.new(
|
26
|
+
FLOAT_DTYPES + INTEGER_DTYPES | [Decimal]
|
27
|
+
)
|
28
|
+
end
|