polars-df 0.14.0-aarch64-linux-musl → 0.16.0-aarch64-linux-musl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +35 -0
  3. data/Cargo.lock +1523 -378
  4. data/LICENSE-THIRD-PARTY.txt +23495 -12923
  5. data/LICENSE.txt +1 -0
  6. data/README.md +38 -4
  7. data/lib/polars/3.2/polars.so +0 -0
  8. data/lib/polars/3.3/polars.so +0 -0
  9. data/lib/polars/{3.1 → 3.4}/polars.so +0 -0
  10. data/lib/polars/batched_csv_reader.rb +0 -2
  11. data/lib/polars/binary_expr.rb +133 -9
  12. data/lib/polars/binary_name_space.rb +101 -6
  13. data/lib/polars/config.rb +4 -0
  14. data/lib/polars/data_frame.rb +452 -101
  15. data/lib/polars/data_type_group.rb +28 -0
  16. data/lib/polars/data_types.rb +3 -1
  17. data/lib/polars/date_time_expr.rb +244 -0
  18. data/lib/polars/date_time_name_space.rb +87 -0
  19. data/lib/polars/expr.rb +103 -2
  20. data/lib/polars/functions/aggregation/horizontal.rb +10 -4
  21. data/lib/polars/functions/as_datatype.rb +51 -2
  22. data/lib/polars/functions/col.rb +1 -1
  23. data/lib/polars/functions/eager.rb +1 -3
  24. data/lib/polars/functions/lazy.rb +95 -13
  25. data/lib/polars/functions/range/time_range.rb +21 -21
  26. data/lib/polars/io/csv.rb +14 -16
  27. data/lib/polars/io/database.rb +2 -2
  28. data/lib/polars/io/delta.rb +126 -0
  29. data/lib/polars/io/ipc.rb +14 -4
  30. data/lib/polars/io/ndjson.rb +10 -0
  31. data/lib/polars/io/parquet.rb +168 -111
  32. data/lib/polars/lazy_frame.rb +684 -20
  33. data/lib/polars/list_name_space.rb +169 -0
  34. data/lib/polars/selectors.rb +1226 -0
  35. data/lib/polars/series.rb +465 -35
  36. data/lib/polars/string_cache.rb +27 -1
  37. data/lib/polars/string_expr.rb +0 -1
  38. data/lib/polars/string_name_space.rb +73 -3
  39. data/lib/polars/struct_name_space.rb +31 -7
  40. data/lib/polars/utils/various.rb +5 -1
  41. data/lib/polars/utils.rb +45 -10
  42. data/lib/polars/version.rb +1 -1
  43. data/lib/polars.rb +17 -1
  44. metadata +10 -9
  45. data/lib/polars/functions.rb +0 -57
@@ -8,17 +8,49 @@ module Polars
8
8
 
9
9
  # Create a new DataFrame.
10
10
  #
11
- # @param data [Hash, Array, Series, nil]
12
- # Two-dimensional data in various forms. Hash must contain Arrays.
13
- # Array may contain Series.
14
- # @param columns [Array, Hash, nil]
15
- # Column labels to use for resulting DataFrame. If specified, overrides any
16
- # labels already present in the data. Must match data dimensions.
17
- # @param orient ["col", "row", nil]
18
- # Whether to interpret two-dimensional data as columns or as rows. If `nil`,
11
+ # @param data [Object]
12
+ # Two-dimensional data in various forms; hash input must contain arrays
13
+ # or a range. Arrays may contain Series or other arrays.
14
+ # @param schema [Object]
15
+ # The schema of the resulting DataFrame. The schema may be declared in several
16
+ # ways:
17
+ #
18
+ # * As a hash of name:type pairs; if type is nil, it will be auto-inferred.
19
+ # * As an array of column names; in this case types are automatically inferred.
20
+ # * As an array of (name,type) pairs; this is equivalent to the hash form.
21
+ #
22
+ # If you supply an array of column names that does not match the names in the
23
+ # underlying data, the names given here will overwrite them. The number
24
+ # of names given in the schema should match the underlying data dimensions.
25
+ #
26
+ # If set to `nil` (default), the schema is inferred from the data.
27
+ # @param schema_overrides [Hash]
28
+ # Support type specification or override of one or more columns; note that
29
+ # any dtypes inferred from the schema param will be overridden.
30
+ #
31
+ # The number of entries in the schema should match the underlying data
32
+ # dimensions, unless an array of hashes is being passed, in which case
33
+ # a *partial* schema can be declared to prevent specific fields from being loaded.
34
+ # @param strict [Boolean]
35
+ # Throw an error if any `data` value does not exactly match the given or inferred
36
+ # data type for that column. If set to `false`, values that do not match the data
37
+ # type are cast to that data type or, if casting is not possible, set to null
38
+ # instead.
39
+ # @param orient ["col", "row"]
40
+ # Whether to interpret two-dimensional data as columns or as rows. If nil,
19
41
  # the orientation is inferred by matching the columns and data dimensions. If
20
42
  # this does not yield conclusive results, column orientation is used.
21
- def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
43
+ # @param infer_schema_length [Integer]
44
+ # The maximum number of rows to scan for schema inference. If set to `nil`, the
45
+ # full data may be scanned *(this can be slow)*. This parameter only applies if
46
+ # the input data is a sequence or generator of rows; other input is read as-is.
47
+ # @param nan_to_null [Boolean]
48
+ # If the data comes from one or more Numo arrays, can optionally convert input
49
+ # data NaN values to null instead. This is a no-op for all other input data.
50
+ def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: 100, nan_to_null: false)
51
+ if schema && columns
52
+ warn "columns is ignored when schema is passed"
53
+ end
22
54
  schema ||= columns
23
55
 
24
56
  if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
@@ -29,11 +61,17 @@ module Polars
29
61
  self._df = self.class.hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
30
62
  elsif data.is_a?(Hash)
31
63
  data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
32
- self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
64
+ self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null)
33
65
  elsif data.is_a?(::Array)
34
- self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
66
+ self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, orient: orient, infer_schema_length: infer_schema_length)
35
67
  elsif data.is_a?(Series)
36
- self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
68
+ self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict)
69
+ elsif data.respond_to?(:arrow_c_stream)
70
+ # This uses the fact that RbSeries.from_arrow_c_stream will create a
71
+ # struct-typed Series. Then we unpack that to a DataFrame.
72
+ tmp_col_name = ""
73
+ s = Utils.wrap_s(RbSeries.from_arrow_c_stream(data))
74
+ self._df = s.to_frame(tmp_col_name).unnest(tmp_col_name)._df
37
75
  else
38
76
  raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
39
77
  end
@@ -452,6 +490,11 @@ module Polars
452
490
  end
453
491
  end
454
492
 
493
+ # @private
494
+ def arrow_c_stream
495
+ _df.arrow_c_stream
496
+ end
497
+
455
498
  # Return the dataframe as a scalar.
456
499
  #
457
500
  # Equivalent to `df[0,0]`, with a check that the shape is (1,1).
@@ -766,15 +809,18 @@ module Polars
766
809
  # Compression method. Defaults to "uncompressed".
767
810
  #
768
811
  # @return [nil]
769
- def write_avro(file, compression = "uncompressed")
812
+ def write_avro(file, compression = "uncompressed", name: "")
770
813
  if compression.nil?
771
814
  compression = "uncompressed"
772
815
  end
773
816
  if Utils.pathlike?(file)
774
817
  file = Utils.normalize_filepath(file)
775
818
  end
819
+ if name.nil?
820
+ name = ""
821
+ end
776
822
 
777
- _df.write_avro(file, compression)
823
+ _df.write_avro(file, compression, name)
778
824
  end
779
825
 
780
826
  # Write to Arrow IPC binary stream or Feather file.
@@ -785,7 +831,13 @@ module Polars
785
831
  # Compression method. Defaults to "uncompressed".
786
832
  #
787
833
  # @return [nil]
788
- def write_ipc(file, compression: "uncompressed")
834
+ def write_ipc(
835
+ file,
836
+ compression: "uncompressed",
837
+ compat_level: nil,
838
+ storage_options: nil,
839
+ retries: 2
840
+ )
789
841
  return_bytes = file.nil?
790
842
  if return_bytes
791
843
  file = StringIO.new
@@ -795,11 +847,21 @@ module Polars
795
847
  file = Utils.normalize_filepath(file)
796
848
  end
797
849
 
850
+ if compat_level.nil?
851
+ compat_level = true
852
+ end
853
+
798
854
  if compression.nil?
799
855
  compression = "uncompressed"
800
856
  end
801
857
 
802
- _df.write_ipc(file, compression)
858
+ if storage_options&.any?
859
+ storage_options = storage_options.to_a
860
+ else
861
+ storage_options = nil
862
+ end
863
+
864
+ _df.write_ipc(file, compression, compat_level, storage_options, retries)
803
865
  return_bytes ? file.string : nil
804
866
  end
805
867
 
@@ -826,7 +888,8 @@ module Polars
826
888
  # df.write_ipc_stream("new_file.arrow")
827
889
  def write_ipc_stream(
828
890
  file,
829
- compression: "uncompressed"
891
+ compression: "uncompressed",
892
+ compat_level: nil
830
893
  )
831
894
  return_bytes = file.nil?
832
895
  if return_bytes
@@ -836,11 +899,15 @@ module Polars
836
899
  file = Utils.normalize_filepath(file)
837
900
  end
838
901
 
902
+ if compat_level.nil?
903
+ compat_level = true
904
+ end
905
+
839
906
  if compression.nil?
840
907
  compression = "uncompressed"
841
908
  end
842
909
 
843
- _df.write_ipc_stream(file, compression)
910
+ _df.write_ipc_stream(file, compression, compat_level)
844
911
  return_bytes ? file.string : nil
845
912
  end
846
913
 
@@ -906,6 +973,61 @@ module Polars
906
973
  )
907
974
  end
908
975
 
976
+ # Write DataFrame as delta table.
977
+ #
978
+ # @param target [Object]
979
+ # URI of a table or a DeltaTable object.
980
+ # @param mode ["error", "append", "overwrite", "ignore", "merge"]
981
+ # How to handle existing data.
982
+ # @param storage_options [Hash]
983
+ # Extra options for the storage backends supported by `deltalake-rb`.
984
+ # @param delta_write_options [Hash]
985
+ # Additional keyword arguments while writing a Delta lake Table.
986
+ # @param delta_merge_options [Hash]
987
+ # Keyword arguments which are required to `MERGE` a Delta lake Table.
988
+ #
989
+ # @return [nil]
990
+ def write_delta(
991
+ target,
992
+ mode: "error",
993
+ storage_options: nil,
994
+ delta_write_options: nil,
995
+ delta_merge_options: nil
996
+ )
997
+ Polars.send(:_check_if_delta_available)
998
+
999
+ if Utils.pathlike?(target)
1000
+ target = Polars.send(:_resolve_delta_lake_uri, target.to_s, strict: false)
1001
+ end
1002
+
1003
+ data = self
1004
+
1005
+ if mode == "merge"
1006
+ if delta_merge_options.nil?
1007
+ msg = "You need to pass delta_merge_options with at least a given predicate for `MERGE` to work."
1008
+ raise ArgumentError, msg
1009
+ end
1010
+ if target.is_a?(::String)
1011
+ dt = DeltaLake::Table.new(target, storage_options: storage_options)
1012
+ else
1013
+ dt = target
1014
+ end
1015
+
1016
+ predicate = delta_merge_options.delete(:predicate)
1017
+ dt.merge(data, predicate, **delta_merge_options)
1018
+ else
1019
+ delta_write_options ||= {}
1020
+
1021
+ DeltaLake.write(
1022
+ target,
1023
+ data,
1024
+ mode: mode,
1025
+ storage_options: storage_options,
1026
+ **delta_write_options
1027
+ )
1028
+ end
1029
+ end
1030
+
909
1031
  # Return an estimation of the total (heap) allocated size of the DataFrame.
910
1032
  #
911
1033
  # Estimated size is given in the specified unit (bytes by default).
@@ -1037,6 +1159,10 @@ module Polars
1037
1159
  #
1038
1160
  # @param mapping [Hash]
1039
1161
  # Key value pairs that map from old name to new name.
1162
+ # @param strict [Boolean]
1163
+ # Validate that all column names exist in the current schema,
1164
+ # and throw an exception if any do not. (Note that this parameter
1165
+ # is a no-op when passing a function to `mapping`).
1040
1166
  #
1041
1167
  # @return [DataFrame]
1042
1168
  #
@@ -1060,8 +1186,8 @@ module Polars
1060
1186
  # # │ 2 ┆ 7 ┆ b │
1061
1187
  # # │ 3 ┆ 8 ┆ c │
1062
1188
  # # └───────┴─────┴─────┘
1063
- def rename(mapping)
1064
- lazy.rename(mapping).collect(no_optimization: true)
1189
+ def rename(mapping, strict: true)
1190
+ lazy.rename(mapping, strict: strict).collect(no_optimization: true)
1065
1191
  end
1066
1192
 
1067
1193
  # Insert a Series at a certain column index. This operation is in place.
@@ -2190,6 +2316,11 @@ module Polars
2190
2316
  # @param force_parallel [Boolean]
2191
2317
  # Force the physical plan to evaluate the computation of both DataFrames up to
2192
2318
  # the join in parallel.
2319
+ # @param coalesce [Boolean]
2320
+ # Coalescing behavior (merging of join columns).
2321
+ # - true: -> Always coalesce join columns.
2322
+ # - false: -> Never coalesce join columns.
2323
+ # Note that joining on any other expressions than `col` will turn off coalescing.
2193
2324
  #
2194
2325
  # @return [DataFrame]
2195
2326
  #
@@ -2243,7 +2374,8 @@ module Polars
2243
2374
  suffix: "_right",
2244
2375
  tolerance: nil,
2245
2376
  allow_parallel: true,
2246
- force_parallel: false
2377
+ force_parallel: false,
2378
+ coalesce: true
2247
2379
  )
2248
2380
  lazy
2249
2381
  .join_asof(
@@ -2258,7 +2390,8 @@ module Polars
2258
2390
  suffix: suffix,
2259
2391
  tolerance: tolerance,
2260
2392
  allow_parallel: allow_parallel,
2261
- force_parallel: force_parallel
2393
+ force_parallel: force_parallel,
2394
+ coalesce: coalesce
2262
2395
  )
2263
2396
  .collect(no_optimization: true)
2264
2397
  end
@@ -2277,8 +2410,20 @@ module Polars
2277
2410
  # Join strategy.
2278
2411
  # @param suffix [String]
2279
2412
  # Suffix to append to columns with a duplicate name.
2413
+ # @param validate ['m:m', 'm:1', '1:m', '1:1']
2414
+ # Checks if join is of specified type.
2415
+ # * *many_to_many* - “m:m”: default, does not result in checks
2416
+ # * *one_to_one* - “1:1”: check if join keys are unique in both left and right datasets
2417
+ # * *one_to_many* - “1:m”: check if join keys are unique in left dataset
2418
+ # * *many_to_one* - “m:1”: check if join keys are unique in right dataset
2280
2419
  # @param join_nulls [Boolean]
2281
2420
  # Join on null values. By default null values will never produce matches.
2421
+ # @param coalesce [Boolean]
2422
+ # Coalescing behavior (merging of join columns).
2423
+ # - nil: -> join specific.
2424
+ # - true: -> Always coalesce join columns.
2425
+ # - false: -> Never coalesce join columns.
2426
+ # Note that joining on any other expressions than `col` will turn off coalescing.
2282
2427
  #
2283
2428
  # @return [DataFrame]
2284
2429
  #
@@ -2361,7 +2506,16 @@ module Polars
2361
2506
  # # ╞═════╪═════╪═════╡
2362
2507
  # # │ 3 ┆ 8.0 ┆ c │
2363
2508
  # # └─────┴─────┴─────┘
2364
- def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right", join_nulls: false)
2509
+ def join(other,
2510
+ left_on: nil,
2511
+ right_on: nil,
2512
+ on: nil,
2513
+ how: "inner",
2514
+ suffix: "_right",
2515
+ validate: "m:m",
2516
+ join_nulls: false,
2517
+ coalesce: nil
2518
+ )
2365
2519
  lazy
2366
2520
  .join(
2367
2521
  other.lazy,
@@ -2370,7 +2524,9 @@ module Polars
2370
2524
  on: on,
2371
2525
  how: how,
2372
2526
  suffix: suffix,
2373
- join_nulls: join_nulls
2527
+ validate: validate,
2528
+ join_nulls: join_nulls,
2529
+ coalesce: coalesce
2374
2530
  )
2375
2531
  .collect(no_optimization: true)
2376
2532
  end
@@ -2717,10 +2873,85 @@ module Polars
2717
2873
  # Column to drop.
2718
2874
  #
2719
2875
  # @return [Series]
2876
+ #
2877
+ # @example
2878
+ # df = Polars::DataFrame.new(
2879
+ # {
2880
+ # "foo" => [1, 2, 3],
2881
+ # "bar" => [6, 7, 8],
2882
+ # "ham" => ["a", "b", "c"]
2883
+ # }
2884
+ # )
2885
+ # df.delete("ham")
2886
+ # # =>
2887
+ # # shape: (3,)
2888
+ # # Series: 'ham' [str]
2889
+ # # [
2890
+ # # "a"
2891
+ # # "b"
2892
+ # # "c"
2893
+ # # ]
2894
+ #
2895
+ # @example
2896
+ # df.delete("missing")
2897
+ # # => nil
2720
2898
  def delete(name)
2721
2899
  drop_in_place(name) if include?(name)
2722
2900
  end
2723
2901
 
2902
+ # Cast DataFrame column(s) to the specified dtype(s).
2903
+ #
2904
+ # @param dtypes [Object]
2905
+ # Mapping of column names (or selector) to dtypes, or a single dtype
2906
+ # to which all columns will be cast.
2907
+ # @param strict [Boolean]
2908
+ # Throw an error if a cast could not be done (for instance, due to an
2909
+ # overflow).
2910
+ #
2911
+ # @return [DataFrame]
2912
+ #
2913
+ # @example Cast specific frame columns to the specified dtypes:
2914
+ # df = Polars::DataFrame.new(
2915
+ # {
2916
+ # "foo" => [1, 2, 3],
2917
+ # "bar" => [6.0, 7.0, 8.0],
2918
+ # "ham" => [Date.new(2020, 1, 2), Date.new(2021, 3, 4), Date.new(2022, 5, 6)]
2919
+ # }
2920
+ # )
2921
+ # df.cast({"foo" => Polars::Float32, "bar" => Polars::UInt8})
2922
+ # # =>
2923
+ # # shape: (3, 3)
2924
+ # # ┌─────┬─────┬────────────┐
2925
+ # # │ foo ┆ bar ┆ ham │
2926
+ # # │ --- ┆ --- ┆ --- │
2927
+ # # │ f32 ┆ u8 ┆ date │
2928
+ # # ╞═════╪═════╪════════════╡
2929
+ # # │ 1.0 ┆ 6 ┆ 2020-01-02 │
2930
+ # # │ 2.0 ┆ 7 ┆ 2021-03-04 │
2931
+ # # │ 3.0 ┆ 8 ┆ 2022-05-06 │
2932
+ # # └─────┴─────┴────────────┘
2933
+ #
2934
+ # @example Cast all frame columns matching one dtype (or dtype group) to another dtype:
2935
+ # df.cast({Polars::Date => Polars::Datetime})
2936
+ # # =>
2937
+ # # shape: (3, 3)
2938
+ # # ┌─────┬─────┬─────────────────────┐
2939
+ # # │ foo ┆ bar ┆ ham │
2940
+ # # │ --- ┆ --- ┆ --- │
2941
+ # # │ i64 ┆ f64 ┆ datetime[μs] │
2942
+ # # ╞═════╪═════╪═════════════════════╡
2943
+ # # │ 1 ┆ 6.0 ┆ 2020-01-02 00:00:00 │
2944
+ # # │ 2 ┆ 7.0 ┆ 2021-03-04 00:00:00 │
2945
+ # # │ 3 ┆ 8.0 ┆ 2022-05-06 00:00:00 │
2946
+ # # └─────┴─────┴─────────────────────┘
2947
+ #
2948
+ # @example Cast all frame columns to the specified dtype:
2949
+ # df.cast(Polars::String).to_h(as_series: false)
2950
+ # # => {"foo"=>["1", "2", "3"], "bar"=>["6.0", "7.0", "8.0"], "ham"=>["2020-01-02", "2021-03-04", "2022-05-06"]}
2951
+ def cast(dtypes, strict: true)
2952
+ lazy.cast(dtypes, strict: strict).collect(_eager: true)
2953
+ end
2954
+
2724
2955
  # Create an empty copy of the current DataFrame.
2725
2956
  #
2726
2957
  # Returns a DataFrame with identical schema but no data.
@@ -2775,6 +3006,57 @@ module Polars
2775
3006
  # Get the DataFrame as an Array of Series.
2776
3007
  #
2777
3008
  # @return [Array]
3009
+ #
3010
+ # @example
3011
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
3012
+ # df.get_columns
3013
+ # # =>
3014
+ # # [shape: (3,)
3015
+ # # Series: 'foo' [i64]
3016
+ # # [
3017
+ # # 1
3018
+ # # 2
3019
+ # # 3
3020
+ # # ], shape: (3,)
3021
+ # # Series: 'bar' [i64]
3022
+ # # [
3023
+ # # 4
3024
+ # # 5
3025
+ # # 6
3026
+ # # ]]
3027
+ #
3028
+ # @example
3029
+ # df = Polars::DataFrame.new(
3030
+ # {
3031
+ # "a" => [1, 2, 3, 4],
3032
+ # "b" => [0.5, 4, 10, 13],
3033
+ # "c" => [true, true, false, true]
3034
+ # }
3035
+ # )
3036
+ # df.get_columns
3037
+ # # =>
3038
+ # # [shape: (4,)
3039
+ # # Series: 'a' [i64]
3040
+ # # [
3041
+ # # 1
3042
+ # # 2
3043
+ # # 3
3044
+ # # 4
3045
+ # # ], shape: (4,)
3046
+ # # Series: 'b' [f64]
3047
+ # # [
3048
+ # # 0.5
3049
+ # # 4.0
3050
+ # # 10.0
3051
+ # # 13.0
3052
+ # # ], shape: (4,)
3053
+ # # Series: 'c' [bool]
3054
+ # # [
3055
+ # # true
3056
+ # # true
3057
+ # # false
3058
+ # # true
3059
+ # # ]]
2778
3060
  def get_columns
2779
3061
  _df.get_columns.map { |s| Utils.wrap_s(s) }
2780
3062
  end
@@ -3083,7 +3365,7 @@ module Polars
3083
3365
  # "c" => [2, 4, 6]
3084
3366
  # }
3085
3367
  # )
3086
- # df.unpivot(Polars::Selectors.numeric, index: "a")
3368
+ # df.unpivot(Polars.cs.numeric, index: "a")
3087
3369
  # # =>
3088
3370
  # # shape: (6, 3)
3089
3371
  # # ┌─────┬──────────┬───────┐
@@ -3724,14 +4006,32 @@ module Polars
3724
4006
  # # ╞═════╪═════╪═════╡
3725
4007
  # # │ 3 ┆ 8 ┆ c │
3726
4008
  # # └─────┴─────┴─────┘
3727
- def max(axis: 0)
3728
- if axis == 0
3729
- lazy.max.collect(_eager: true)
3730
- elsif axis == 1
3731
- Utils.wrap_s(_df.max_horizontal)
3732
- else
3733
- raise ArgumentError, "Axis should be 0 or 1."
3734
- end
4009
+ def max
4010
+ lazy.max.collect(_eager: true)
4011
+ end
4012
+
4013
+ # Get the maximum value horizontally across columns.
4014
+ #
4015
+ # @return [Series]
4016
+ #
4017
+ # @example
4018
+ # df = Polars::DataFrame.new(
4019
+ # {
4020
+ # "foo" => [1, 2, 3],
4021
+ # "bar" => [4.0, 5.0, 6.0]
4022
+ # }
4023
+ # )
4024
+ # df.max_horizontal
4025
+ # # =>
4026
+ # # shape: (3,)
4027
+ # # Series: 'max' [f64]
4028
+ # # [
4029
+ # # 4.0
4030
+ # # 5.0
4031
+ # # 6.0
4032
+ # # ]
4033
+ def max_horizontal
4034
+ select(max: F.max_horizontal(F.all)).to_series
3735
4035
  end
3736
4036
 
3737
4037
  # Aggregate the columns of this DataFrame to their minimum value.
@@ -3756,22 +4056,35 @@ module Polars
3756
4056
  # # ╞═════╪═════╪═════╡
3757
4057
  # # │ 1 ┆ 6 ┆ a │
3758
4058
  # # └─────┴─────┴─────┘
3759
- def min(axis: 0)
3760
- if axis == 0
3761
- lazy.min.collect(_eager: true)
3762
- elsif axis == 1
3763
- Utils.wrap_s(_df.min_horizontal)
3764
- else
3765
- raise ArgumentError, "Axis should be 0 or 1."
3766
- end
4059
+ def min
4060
+ lazy.min.collect(_eager: true)
3767
4061
  end
3768
4062
 
3769
- # Aggregate the columns of this DataFrame to their sum value.
4063
+ # Get the minimum value horizontally across columns.
3770
4064
  #
3771
- # @param axis [Integer]
3772
- # Either 0 or 1.
3773
- # @param null_strategy ["ignore", "propagate"]
3774
- # This argument is only used if axis == 1.
4065
+ # @return [Series]
4066
+ #
4067
+ # @example
4068
+ # df = Polars::DataFrame.new(
4069
+ # {
4070
+ # "foo" => [1, 2, 3],
4071
+ # "bar" => [4.0, 5.0, 6.0]
4072
+ # }
4073
+ # )
4074
+ # df.min_horizontal
4075
+ # # =>
4076
+ # # shape: (3,)
4077
+ # # Series: 'min' [f64]
4078
+ # # [
4079
+ # # 1.0
4080
+ # # 2.0
4081
+ # # 3.0
4082
+ # # ]
4083
+ def min_horizontal
4084
+ select(min: F.min_horizontal(F.all)).to_series
4085
+ end
4086
+
4087
+ # Aggregate the columns of this DataFrame to their sum value.
3775
4088
  #
3776
4089
  # @return [DataFrame]
3777
4090
  #
@@ -3793,35 +4106,42 @@ module Polars
3793
4106
  # # ╞═════╪═════╪══════╡
3794
4107
  # # │ 6 ┆ 21 ┆ null │
3795
4108
  # # └─────┴─────┴──────┘
4109
+ def sum
4110
+ lazy.sum.collect(_eager: true)
4111
+ end
4112
+
4113
+ # Sum all values horizontally across columns.
4114
+ #
4115
+ # @param ignore_nulls [Boolean]
4116
+ # Ignore null values (default).
4117
+ # If set to `false`, any null value in the input will lead to a null output.
4118
+ #
4119
+ # @return [Series]
3796
4120
  #
3797
4121
  # @example
3798
- # df.sum(axis: 1)
4122
+ # df = Polars::DataFrame.new(
4123
+ # {
4124
+ # "foo" => [1, 2, 3],
4125
+ # "bar" => [4.0, 5.0, 6.0]
4126
+ # }
4127
+ # )
4128
+ # df.sum_horizontal
3799
4129
  # # =>
3800
4130
  # # shape: (3,)
3801
- # # Series: 'foo' [str]
4131
+ # # Series: 'sum' [f64]
3802
4132
  # # [
3803
- # # "16a"
3804
- # # "27b"
3805
- # # "38c"
4133
+ # # 5.0
4134
+ # # 7.0
4135
+ # # 9.0
3806
4136
  # # ]
3807
- def sum(axis: 0, null_strategy: "ignore")
3808
- case axis
3809
- when 0
3810
- lazy.sum.collect(_eager: true)
3811
- when 1
3812
- Utils.wrap_s(_df.sum_horizontal(null_strategy))
3813
- else
3814
- raise ArgumentError, "Axis should be 0 or 1."
3815
- end
4137
+ def sum_horizontal(ignore_nulls: true)
4138
+ select(
4139
+ sum: F.sum_horizontal(F.all, ignore_nulls: ignore_nulls)
4140
+ ).to_series
3816
4141
  end
3817
4142
 
3818
4143
  # Aggregate the columns of this DataFrame to their mean value.
3819
4144
  #
3820
- # @param axis [Integer]
3821
- # Either 0 or 1.
3822
- # @param null_strategy ["ignore", "propagate"]
3823
- # This argument is only used if axis == 1.
3824
- #
3825
4145
  # @return [DataFrame]
3826
4146
  #
3827
4147
  # @example
@@ -3842,15 +4162,38 @@ module Polars
3842
4162
  # # ╞═════╪═════╪══════╡
3843
4163
  # # │ 2.0 ┆ 7.0 ┆ null │
3844
4164
  # # └─────┴─────┴──────┘
3845
- def mean(axis: 0, null_strategy: "ignore")
3846
- case axis
3847
- when 0
3848
- lazy.mean.collect(_eager: true)
3849
- when 1
3850
- Utils.wrap_s(_df.mean_horizontal(null_strategy))
3851
- else
3852
- raise ArgumentError, "Axis should be 0 or 1."
3853
- end
4165
+ def mean
4166
+ lazy.mean.collect(_eager: true)
4167
+ end
4168
+
4169
+ # Take the mean of all values horizontally across columns.
4170
+ #
4171
+ # @param ignore_nulls [Boolean]
4172
+ # Ignore null values (default).
4173
+ # If set to `false`, any null value in the input will lead to a null output.
4174
+ #
4175
+ # @return [Series]
4176
+ #
4177
+ # @example
4178
+ # df = Polars::DataFrame.new(
4179
+ # {
4180
+ # "foo" => [1, 2, 3],
4181
+ # "bar" => [4.0, 5.0, 6.0]
4182
+ # }
4183
+ # )
4184
+ # df.mean_horizontal
4185
+ # # =>
4186
+ # # shape: (3,)
4187
+ # # Series: 'mean' [f64]
4188
+ # # [
4189
+ # # 2.5
4190
+ # # 3.5
4191
+ # # 4.5
4192
+ # # ]
4193
+ def mean_horizontal(ignore_nulls: true)
4194
+ select(
4195
+ mean: F.mean_horizontal(F.all, ignore_nulls: ignore_nulls)
4196
+ ).to_series
3854
4197
  end
3855
4198
 
3856
4199
  # Aggregate the columns of this DataFrame to their standard deviation value.
@@ -4296,7 +4639,7 @@ module Polars
4296
4639
  # @example A horizontal string concatenation:
4297
4640
  # df = Polars::DataFrame.new(
4298
4641
  # {
4299
- # "a" => ["foo", "bar", 2],
4642
+ # "a" => ["foo", "bar", nil],
4300
4643
  # "b" => [1, 2, 3],
4301
4644
  # "c" => [1.0, 2.0, 3.0]
4302
4645
  # }
@@ -4327,11 +4670,11 @@ module Polars
4327
4670
  # # true
4328
4671
  # # true
4329
4672
  # # ]
4330
- def fold(&operation)
4673
+ def fold
4331
4674
  acc = to_series(0)
4332
4675
 
4333
4676
  1.upto(width - 1) do |i|
4334
- acc = operation.call(acc, to_series(i))
4677
+ acc = yield(acc, to_series(i))
4335
4678
  end
4336
4679
  acc
4337
4680
  end
@@ -4843,7 +5186,7 @@ module Polars
4843
5186
  end
4844
5187
 
4845
5188
  # @private
4846
- def self.expand_hash_scalars(data, schema_overrides: nil, order: nil, nan_to_null: false)
5189
+ def self.expand_hash_scalars(data, schema_overrides: nil, strict: true, order: nil, nan_to_null: false)
4847
5190
  updated_data = {}
4848
5191
  unless data.empty?
4849
5192
  dtypes = schema_overrides || {}
@@ -4852,23 +5195,23 @@ module Polars
4852
5195
  data.each do |name, val|
4853
5196
  dtype = dtypes[name]
4854
5197
  if val.is_a?(Hash) && dtype != Struct
4855
- updated_data[name] = DataFrame.new(val).to_struct(name)
5198
+ updated_data[name] = DataFrame.new(val, strict: strict).to_struct(name)
4856
5199
  elsif !Utils.arrlen(val).nil?
4857
- updated_data[name] = Series.new(::String.new(name), val, dtype: dtype)
5200
+ updated_data[name] = Series.new(::String.new(name), val, dtype: dtype, strict: strict)
4858
5201
  elsif val.nil? || [Integer, Float, TrueClass, FalseClass, ::String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
4859
5202
  dtype = Polars::Float64 if val.nil? && dtype.nil?
4860
- updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
5203
+ updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype, strict: strict).extend_constant(val, array_len - 1)
4861
5204
  else
4862
5205
  raise Todo
4863
5206
  end
4864
5207
  end
4865
5208
  elsif data.values.all? { |val| Utils.arrlen(val) == 0 }
4866
5209
  data.each do |name, val|
4867
- updated_data[name] = Series.new(name, val, dtype: dtypes[name])
5210
+ updated_data[name] = Series.new(name, val, dtype: dtypes[name], strict: strict)
4868
5211
  end
4869
5212
  elsif data.values.all? { |val| Utils.arrlen(val).nil? }
4870
5213
  data.each do |name, val|
4871
- updated_data[name] = Series.new(name, [val], dtype: dtypes[name])
5214
+ updated_data[name] = Series.new(name, [val], dtype: dtypes[name], strict: strict)
4872
5215
  end
4873
5216
  end
4874
5217
  end
@@ -4876,7 +5219,7 @@ module Polars
4876
5219
  end
4877
5220
 
4878
5221
  # @private
4879
- def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, nan_to_null: nil)
5222
+ def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, nan_to_null: nil)
4880
5223
  if schema.is_a?(Hash) && !data.empty?
4881
5224
  if !data.all? { |col, _| schema[col] }
4882
5225
  raise ArgumentError, "The given column-schema names do not match the data dictionary"
@@ -4893,9 +5236,9 @@ module Polars
4893
5236
  end
4894
5237
 
4895
5238
  if data.empty? && !schema_overrides.empty?
4896
- data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], nan_to_null: nan_to_null)._s }
5239
+ data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], strict: strict, nan_to_null: nan_to_null)._s }
4897
5240
  else
4898
- data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, nan_to_null: nan_to_null).values.map(&:_s)
5241
+ data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null).values.map(&:_s)
4899
5242
  end
4900
5243
 
4901
5244
  data_series = _handle_columns_arg(data_series, columns: column_names, from_hash: true)
@@ -4969,7 +5312,7 @@ module Polars
4969
5312
  end
4970
5313
  end
4971
5314
 
4972
- def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil)
5315
+ def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil, strict: true)
4973
5316
  rbdf_columns = rbdf.columns
4974
5317
  rbdf_dtypes = rbdf.dtypes
4975
5318
  columns, dtypes = _unpack_schema(
@@ -4985,13 +5328,13 @@ module Polars
4985
5328
  end
4986
5329
 
4987
5330
  column_casts = []
4988
- columns.each do |col, i|
5331
+ columns.each_with_index do |col, i|
4989
5332
  if dtypes[col] == Categorical # != rbdf_dtypes[i]
4990
- column_casts << Polars.col(col).cast(Categorical)._rbexpr
5333
+ column_casts << Polars.col(col).cast(Categorical, strict: strict)._rbexpr
4991
5334
  elsif structs&.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
4992
- column_casts << Polars.col(col).cast(structs[col])._rbexpr
5335
+ column_casts << Polars.col(col).cast(structs[col], strict: strict)._rbexpr
4993
5336
  elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
4994
- column_casts << Polars.col(col).cast(dtypes[col])._rbexpr
5337
+ column_casts << Polars.col(col).cast(dtypes[col], strict: strict)._rbexpr
4995
5338
  end
4996
5339
  end
4997
5340
 
@@ -5010,12 +5353,11 @@ module Polars
5010
5353
  end
5011
5354
 
5012
5355
  # @private
5013
- def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 50)
5014
- raise Todo if schema_overrides
5356
+ def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: 50)
5015
5357
  columns = schema
5016
5358
 
5017
5359
  if data.length == 0
5018
- return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
5360
+ return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides, strict: strict)
5019
5361
  end
5020
5362
 
5021
5363
  if data[0].is_a?(Series)
@@ -5028,7 +5370,7 @@ module Polars
5028
5370
  elsif data[0].is_a?(Hash)
5029
5371
  column_names, dtypes = _unpack_schema(columns)
5030
5372
  schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
5031
- rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, false, infer_schema_length)
5373
+ rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, strict, infer_schema_length)
5032
5374
  if column_names
5033
5375
  rbdf = _post_apply_columns(rbdf, column_names)
5034
5376
  end
@@ -5048,7 +5390,7 @@ module Polars
5048
5390
  schema, schema_overrides: schema_overrides, n_expected: first_element.length
5049
5391
  )
5050
5392
  local_schema_override = (
5051
- schema_overrides.any? ? (raise Todo) : {}
5393
+ schema_overrides.any? ? _include_unknowns(schema_overrides, column_names) : {}
5052
5394
  )
5053
5395
  if column_names.any? && first_element.length > 0 && first_element.length != column_names.length
5054
5396
  raise ArgumentError, "the row data does not match the number of columns"
@@ -5056,7 +5398,11 @@ module Polars
5056
5398
 
5057
5399
  unpack_nested = false
5058
5400
  local_schema_override.each do |col, tp|
5059
- raise Todo
5401
+ if [Categorical, Enum].include?(tp)
5402
+ local_schema_override[col] = String
5403
+ elsif !unpack_nested && [Unknown, Struct].include?(tp.base_type)
5404
+ raise Todo
5405
+ end
5060
5406
  end
5061
5407
 
5062
5408
  if unpack_nested
@@ -5070,7 +5416,7 @@ module Polars
5070
5416
  end
5071
5417
  if column_names.any? || schema_overrides.any?
5072
5418
  rbdf = _post_apply_columns(
5073
- rbdf, column_names, schema_overrides: schema_overrides
5419
+ rbdf, column_names, schema_overrides: schema_overrides, strict: strict
5074
5420
  )
5075
5421
  end
5076
5422
  return rbdf
@@ -5080,7 +5426,7 @@ module Polars
5080
5426
  )
5081
5427
  data_series =
5082
5428
  data.map.with_index do |element, i|
5083
- Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]])._s
5429
+ Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]], strict: strict)._s
5084
5430
  end
5085
5431
  return RbDataFrame.new(data_series)
5086
5432
  else
@@ -5093,7 +5439,12 @@ module Polars
5093
5439
  end
5094
5440
 
5095
5441
  # @private
5096
- def self.series_to_rbdf(data, schema: nil, schema_overrides: nil)
5442
+ def self._include_unknowns(schema, cols)
5443
+ cols.to_h { |col| [col, schema[col] || Unknown] }
5444
+ end
5445
+
5446
+ # @private
5447
+ def self.series_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true)
5097
5448
  data_series = [data._s]
5098
5449
  series_name = data_series.map(&:name)
5099
5450
  column_names, schema_overrides = _unpack_schema(
@@ -5102,7 +5453,7 @@ module Polars
5102
5453
  if schema_overrides.any?
5103
5454
  new_dtype = schema_overrides.values[0]
5104
5455
  if new_dtype != data.dtype
5105
- data_series[0] = data_series[0].cast(new_dtype, true)
5456
+ data_series[0] = data_series[0].cast(new_dtype, strict)
5106
5457
  end
5107
5458
  end
5108
5459