polars-df 0.14.0-x86_64-linux-musl → 0.16.0-x86_64-linux-musl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +35 -0
  3. data/Cargo.lock +1523 -378
  4. data/LICENSE-THIRD-PARTY.txt +23495 -12923
  5. data/LICENSE.txt +1 -0
  6. data/README.md +38 -4
  7. data/lib/polars/3.2/polars.so +0 -0
  8. data/lib/polars/3.3/polars.so +0 -0
  9. data/lib/polars/{3.1 → 3.4}/polars.so +0 -0
  10. data/lib/polars/batched_csv_reader.rb +0 -2
  11. data/lib/polars/binary_expr.rb +133 -9
  12. data/lib/polars/binary_name_space.rb +101 -6
  13. data/lib/polars/config.rb +4 -0
  14. data/lib/polars/data_frame.rb +452 -101
  15. data/lib/polars/data_type_group.rb +28 -0
  16. data/lib/polars/data_types.rb +3 -1
  17. data/lib/polars/date_time_expr.rb +244 -0
  18. data/lib/polars/date_time_name_space.rb +87 -0
  19. data/lib/polars/expr.rb +103 -2
  20. data/lib/polars/functions/aggregation/horizontal.rb +10 -4
  21. data/lib/polars/functions/as_datatype.rb +51 -2
  22. data/lib/polars/functions/col.rb +1 -1
  23. data/lib/polars/functions/eager.rb +1 -3
  24. data/lib/polars/functions/lazy.rb +95 -13
  25. data/lib/polars/functions/range/time_range.rb +21 -21
  26. data/lib/polars/io/csv.rb +14 -16
  27. data/lib/polars/io/database.rb +2 -2
  28. data/lib/polars/io/delta.rb +126 -0
  29. data/lib/polars/io/ipc.rb +14 -4
  30. data/lib/polars/io/ndjson.rb +10 -0
  31. data/lib/polars/io/parquet.rb +168 -111
  32. data/lib/polars/lazy_frame.rb +684 -20
  33. data/lib/polars/list_name_space.rb +169 -0
  34. data/lib/polars/selectors.rb +1226 -0
  35. data/lib/polars/series.rb +465 -35
  36. data/lib/polars/string_cache.rb +27 -1
  37. data/lib/polars/string_expr.rb +0 -1
  38. data/lib/polars/string_name_space.rb +73 -3
  39. data/lib/polars/struct_name_space.rb +31 -7
  40. data/lib/polars/utils/various.rb +5 -1
  41. data/lib/polars/utils.rb +45 -10
  42. data/lib/polars/version.rb +1 -1
  43. data/lib/polars.rb +17 -1
  44. metadata +10 -9
  45. data/lib/polars/functions.rb +0 -57
@@ -8,17 +8,49 @@ module Polars
 
   # Create a new DataFrame.
   #
-  # @param data [Hash, Array, Series, nil]
-  #   Two-dimensional data in various forms. Hash must contain Arrays.
-  #   Array may contain Series.
-  # @param columns [Array, Hash, nil]
-  #   Column labels to use for resulting DataFrame. If specified, overrides any
-  #   labels already present in the data. Must match data dimensions.
-  # @param orient ["col", "row", nil]
-  #   Whether to interpret two-dimensional data as columns or as rows. If `nil`,
+  # @param data [Object]
+  #   Two-dimensional data in various forms; hash input must contain arrays
+  #   or a range. Arrays may contain Series or other arrays.
+  # @param schema [Object]
+  #   The schema of the resulting DataFrame. The schema may be declared in several
+  #   ways:
+  #
+  #   * As a hash of name:type pairs; if type is nil, it will be auto-inferred.
+  #   * As an array of column names; in this case types are automatically inferred.
+  #   * As an array of (name, type) pairs; this is equivalent to the hash form.
+  #
+  #   If you supply a list of column names that does not match the names in the
+  #   underlying data, the names given here will overwrite them. The number
+  #   of names given in the schema should match the underlying data dimensions.
+  #
+  #   If set to `nil` (default), the schema is inferred from the data.
+  # @param schema_overrides [Hash]
+  #   Support type specification or override of one or more columns; note that
+  #   any dtypes inferred from the schema param will be overridden.
+  #
+  #   The number of entries in the schema should match the underlying data
+  #   dimensions, unless an array of hashes is being passed, in which case
+  #   a *partial* schema can be declared to prevent specific fields from being loaded.
+  # @param strict [Boolean]
+  #   Throw an error if any `data` value does not exactly match the given or inferred
+  #   data type for that column. If set to `false`, values that do not match the data
+  #   type are cast to that data type or, if casting is not possible, set to null
+  #   instead.
+  # @param orient ["col", "row"]
+  #   Whether to interpret two-dimensional data as columns or as rows. If `nil`,
   #   the orientation is inferred by matching the columns and data dimensions. If
   #   this does not yield conclusive results, column orientation is used.
-  def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
+  # @param infer_schema_length [Integer]
+  #   The maximum number of rows to scan for schema inference. If set to `nil`, the
+  #   full data may be scanned *(this can be slow)*. This parameter only applies if
+  #   the input data is a sequence or generator of rows; other input is read as-is.
+  # @param nan_to_null [Boolean]
+  #   If the data comes from one or more Numo arrays, can optionally convert input
+  #   data NaN values to null instead. This is a no-op for all other input data.
+  def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: 100, nan_to_null: false)
+    if schema && columns
+      warn "columns is ignored when schema is passed"
+    end
     schema ||= columns
 
  if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
@@ -29,11 +61,17 @@ module Polars
       self._df = self.class.hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
     elsif data.is_a?(Hash)
       data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
-      self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
+      self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null)
     elsif data.is_a?(::Array)
-      self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
+      self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, orient: orient, infer_schema_length: infer_schema_length)
     elsif data.is_a?(Series)
-      self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
+      self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict)
+    elsif data.respond_to?(:arrow_c_stream)
+      # This uses the fact that RbSeries.from_arrow_c_stream will create a
+      # struct-typed Series. Then we unpack that to a DataFrame.
+      tmp_col_name = ""
+      s = Utils.wrap_s(RbSeries.from_arrow_c_stream(data))
+      self._df = s.to_frame(tmp_col_name).unnest(tmp_col_name)._df
     else
       raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
  end
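The new constructor keywords in action — a minimal sketch (column name and values are illustrative; behavior per the parameter docs above). With `strict: false`, values that do not match the declared dtype are cast where possible instead of raising; any object responding to `arrow_c_stream` can now also be passed directly:

    df = Polars::DataFrame.new(
      {"a" => [1, 2, 3.5]},
      schema: {"a" => Polars::Int64},
      strict: false
    )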
@@ -452,6 +490,11 @@ module Polars
       end
     end
 
+  # @private
+  def arrow_c_stream
+    _df.arrow_c_stream
+  end
+
   # Return the dataframe as a scalar.
   #
  # Equivalent to `df[0,0]`, with a check that the shape is (1,1).
@@ -766,15 +809,18 @@ module Polars
   #   Compression method. Defaults to "uncompressed".
   #
   # @return [nil]
-  def write_avro(file, compression = "uncompressed")
+  def write_avro(file, compression = "uncompressed", name: "")
     if compression.nil?
       compression = "uncompressed"
     end
     if Utils.pathlike?(file)
       file = Utils.normalize_filepath(file)
     end
+    if name.nil?
+      name = ""
+    end
 
-    _df.write_avro(file, compression)
+    _df.write_avro(file, compression, name)
   end
 
  # Write to Arrow IPC binary stream or Feather file.
@@ -785,7 +831,13 @@ module Polars
   #   Compression method. Defaults to "uncompressed".
   #
   # @return [nil]
-  def write_ipc(file, compression: "uncompressed")
+  def write_ipc(
+    file,
+    compression: "uncompressed",
+    compat_level: nil,
+    storage_options: nil,
+    retries: 2
+  )
    return_bytes = file.nil?
    if return_bytes
  file = StringIO.new
@@ -795,11 +847,21 @@ module Polars
       file = Utils.normalize_filepath(file)
     end
 
+    if compat_level.nil?
+      compat_level = true
+    end
+
     if compression.nil?
       compression = "uncompressed"
     end
 
-    _df.write_ipc(file, compression)
+    if storage_options&.any?
+      storage_options = storage_options.to_a
+    else
+      storage_options = nil
+    end
+
+    _df.write_ipc(file, compression, compat_level, storage_options, retries)
     return_bytes ? file.string : nil
   end
 
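The IPC writer now accepts a compatibility level and cloud storage options; a `nil` compat_level resolves to `true` (the newest format) per the code above. A sketch of usage (the bucket URI and option keys are illustrative assumptions, not taken from this diff):

    df.write_ipc("data.arrow", compression: "zstd")
    df.write_ipc(
      "s3://my-bucket/data.arrow",
      storage_options: {"aws_region" => "us-east-1"},
      retries: 2
    )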
@@ -826,7 +888,8 @@ module Polars
   #   df.write_ipc_stream("new_file.arrow")
   def write_ipc_stream(
     file,
-    compression: "uncompressed"
+    compression: "uncompressed",
+    compat_level: nil
   )
     return_bytes = file.nil?
  if return_bytes
@@ -836,11 +899,15 @@ module Polars
       file = Utils.normalize_filepath(file)
     end
 
+    if compat_level.nil?
+      compat_level = true
+    end
+
     if compression.nil?
       compression = "uncompressed"
     end
 
-    _df.write_ipc_stream(file, compression)
+    _df.write_ipc_stream(file, compression, compat_level)
     return_bytes ? file.string : nil
   end
 
@@ -906,6 +973,61 @@ module Polars
     )
   end
 
+  # Write DataFrame as delta table.
+  #
+  # @param target [Object]
+  #   URI of a table or a DeltaTable object.
+  # @param mode ["error", "append", "overwrite", "ignore", "merge"]
+  #   How to handle existing data.
+  # @param storage_options [Hash]
+  #   Extra options for the storage backends supported by `deltalake-rb`.
+  # @param delta_write_options [Hash]
+  #   Additional keyword arguments while writing a Delta lake Table.
+  # @param delta_merge_options [Hash]
+  #   Keyword arguments which are required to `MERGE` a Delta lake Table.
+  #
+  # @return [nil]
+  def write_delta(
+    target,
+    mode: "error",
+    storage_options: nil,
+    delta_write_options: nil,
+    delta_merge_options: nil
+  )
+    Polars.send(:_check_if_delta_available)
+
+    if Utils.pathlike?(target)
+      target = Polars.send(:_resolve_delta_lake_uri, target.to_s, strict: false)
+    end
+
+    data = self
+
+    if mode == "merge"
+      if delta_merge_options.nil?
+        msg = "You need to pass delta_merge_options with at least a given predicate for `MERGE` to work."
+        raise ArgumentError, msg
+      end
+      if target.is_a?(::String)
+        dt = DeltaLake::Table.new(target, storage_options: storage_options)
+      else
+        dt = target
+      end
+
+      predicate = delta_merge_options.delete(:predicate)
+      dt.merge(data, predicate, **delta_merge_options)
+    else
+      delta_write_options ||= {}
+
+      DeltaLake.write(
+        target,
+        data,
+        mode: mode,
+        storage_options: storage_options,
+        **delta_write_options
+      )
+    end
+  end
+
   # Return an estimation of the total (heap) allocated size of the DataFrame.
   #
  # Estimated size is given in the specified unit (bytes by default).
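A hedged sketch of the new `write_delta` (paths, predicate, and aliases are illustrative; merge options are forwarded to the deltalake gem's merge builder as shown in the hunk above):

    df.write_delta("./events")                  # mode: "error" by default
    df.write_delta("./events", mode: "append")
    df.write_delta(
      "./events",
      mode: "merge",
      delta_merge_options: {
        predicate: "s.id = t.id",
        source_alias: "s",
        target_alias: "t"
      }
    )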
@@ -1037,6 +1159,10 @@ module Polars
   #
   # @param mapping [Hash]
   #   Key value pairs that map from old name to new name.
+  # @param strict [Boolean]
+  #   Validate that all column names exist in the current schema,
+  #   and throw an exception if any do not. (Note that this parameter
+  #   is a no-op when passing a function to `mapping`.)
   #
   # @return [DataFrame]
  #
@@ -1060,8 +1186,8 @@ module Polars
   #   # │ 2     ┆ 7   ┆ b   │
   #   # │ 3     ┆ 8   ┆ c   │
   #   # └───────┴─────┴─────┘
-  def rename(mapping)
-    lazy.rename(mapping).collect(no_optimization: true)
+  def rename(mapping, strict: true)
+    lazy.rename(mapping, strict: strict).collect(no_optimization: true)
   end
 
  # Insert a Series at a certain column index. This operation is in place.
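`rename` now validates the mapping against the current schema by default; `strict: false` skips that check (column names are illustrative):

    df.rename({"foo" => "apple"})                # raises if "foo" is absent
    df.rename({"missing" => "x"}, strict: false) # silently ignored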
@@ -2190,6 +2316,11 @@ module Polars
   # @param force_parallel [Boolean]
   #   Force the physical plan to evaluate the computation of both DataFrames up to
   #   the join in parallel.
+  # @param coalesce [Boolean]
+  #   Coalescing behavior (merging of join columns).
+  #   - true: Always coalesce join columns.
+  #   - false: Never coalesce join columns.
+  #   Note that joining on any expression other than `col` will turn off coalescing.
   #
   # @return [DataFrame]
  #
@@ -2243,7 +2374,8 @@ module Polars
     suffix: "_right",
     tolerance: nil,
     allow_parallel: true,
-    force_parallel: false
+    force_parallel: false,
+    coalesce: true
   )
     lazy
  .join_asof(
@@ -2258,7 +2390,8 @@ module Polars
         suffix: suffix,
         tolerance: tolerance,
         allow_parallel: allow_parallel,
-        force_parallel: force_parallel
+        force_parallel: force_parallel,
+        coalesce: coalesce
       )
       .collect(no_optimization: true)
  end
@@ -2277,8 +2410,20 @@ module Polars
   #   Join strategy.
   # @param suffix [String]
   #   Suffix to append to columns with a duplicate name.
+  # @param validate ["m:m", "m:1", "1:m", "1:1"]
+  #   Checks whether the join is of the specified type.
+  #   * *many_to_many* - "m:m": default, does not result in checks
+  #   * *one_to_one* - "1:1": check if join keys are unique in both left and right datasets
+  #   * *one_to_many* - "1:m": check if join keys are unique in left dataset
+  #   * *many_to_one* - "m:1": check if join keys are unique in right dataset
   # @param join_nulls [Boolean]
   #   Join on null values. By default null values will never produce matches.
+  # @param coalesce [Boolean]
+  #   Coalescing behavior (merging of join columns).
+  #   - nil: join specific.
+  #   - true: Always coalesce join columns.
+  #   - false: Never coalesce join columns.
+  #   Note that joining on any expression other than `col` will turn off coalescing.
   #
   # @return [DataFrame]
  #
@@ -2361,7 +2506,16 @@ module Polars
   #   # ╞═════╪═════╪═════╡
   #   # │ 3   ┆ 8.0 ┆ c   │
   #   # └─────┴─────┴─────┘
-  def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right", join_nulls: false)
+  def join(other,
+    left_on: nil,
+    right_on: nil,
+    on: nil,
+    how: "inner",
+    suffix: "_right",
+    validate: "m:m",
+    join_nulls: false,
+    coalesce: nil
+  )
     lazy
       .join(
  other.lazy,
@@ -2370,7 +2524,9 @@ module Polars
         on: on,
         how: how,
         suffix: suffix,
-        join_nulls: join_nulls
+        validate: validate,
+        join_nulls: join_nulls,
+        coalesce: coalesce
       )
       .collect(no_optimization: true)
  end
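The eager `join` now forwards `validate` and `coalesce` to the lazy engine. A sketch (`other_df` is assumed to share the join key):

    df.join(other_df, on: "ham", how: "left", validate: "1:1", coalesce: true)
    # validate: "1:1" raises if "ham" is not unique on both sides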
@@ -2717,10 +2873,85 @@ module Polars
   #   Column to drop.
   #
   # @return [Series]
+  #
+  # @example
+  #   df = Polars::DataFrame.new(
+  #     {
+  #       "foo" => [1, 2, 3],
+  #       "bar" => [6, 7, 8],
+  #       "ham" => ["a", "b", "c"]
+  #     }
+  #   )
+  #   df.delete("ham")
+  #   # =>
+  #   # shape: (3,)
+  #   # Series: 'ham' [str]
+  #   # [
+  #   #         "a"
+  #   #         "b"
+  #   #         "c"
+  #   # ]
+  #
+  # @example
+  #   df.delete("missing")
+  #   # => nil
   def delete(name)
     drop_in_place(name) if include?(name)
   end
 
+  # Cast DataFrame column(s) to the specified dtype(s).
+  #
+  # @param dtypes [Object]
+  #   Mapping of column names (or selector) to dtypes, or a single dtype
+  #   to which all columns will be cast.
+  # @param strict [Boolean]
+  #   Throw an error if a cast could not be done (for instance, due to an
+  #   overflow).
+  #
+  # @return [DataFrame]
+  #
+  # @example Cast specific frame columns to the specified dtypes:
+  #   df = Polars::DataFrame.new(
+  #     {
+  #       "foo" => [1, 2, 3],
+  #       "bar" => [6.0, 7.0, 8.0],
+  #       "ham" => [Date.new(2020, 1, 2), Date.new(2021, 3, 4), Date.new(2022, 5, 6)]
+  #     }
+  #   )
+  #   df.cast({"foo" => Polars::Float32, "bar" => Polars::UInt8})
+  #   # =>
+  #   # shape: (3, 3)
+  #   # ┌─────┬─────┬────────────┐
+  #   # │ foo ┆ bar ┆ ham        │
+  #   # │ --- ┆ --- ┆ ---        │
+  #   # │ f32 ┆ u8  ┆ date       │
+  #   # ╞═════╪═════╪════════════╡
+  #   # │ 1.0 ┆ 6   ┆ 2020-01-02 │
+  #   # │ 2.0 ┆ 7   ┆ 2021-03-04 │
+  #   # │ 3.0 ┆ 8   ┆ 2022-05-06 │
+  #   # └─────┴─────┴────────────┘
+  #
+  # @example Cast all frame columns matching one dtype (or dtype group) to another dtype:
+  #   df.cast({Polars::Date => Polars::Datetime})
+  #   # =>
+  #   # shape: (3, 3)
+  #   # ┌─────┬─────┬─────────────────────┐
+  #   # │ foo ┆ bar ┆ ham                 │
+  #   # │ --- ┆ --- ┆ ---                 │
+  #   # │ i64 ┆ f64 ┆ datetime[μs]        │
+  #   # ╞═════╪═════╪═════════════════════╡
+  #   # │ 1   ┆ 6.0 ┆ 2020-01-02 00:00:00 │
+  #   # │ 2   ┆ 7.0 ┆ 2021-03-04 00:00:00 │
+  #   # │ 3   ┆ 8.0 ┆ 2022-05-06 00:00:00 │
+  #   # └─────┴─────┴─────────────────────┘
+  #
+  # @example Cast all frame columns to the specified dtype:
+  #   df.cast(Polars::String).to_h(as_series: false)
+  #   # => {"foo"=>["1", "2", "3"], "bar"=>["6.0", "7.0", "8.0"], "ham"=>["2020-01-02", "2021-03-04", "2022-05-06"]}
+  def cast(dtypes, strict: true)
+    lazy.cast(dtypes, strict: strict).collect(_eager: true)
+  end
+
   # Create an empty copy of the current DataFrame.
   #
  # Returns a DataFrame with identical schema but no data.
@@ -2775,6 +3006,57 @@ module Polars
   # Get the DataFrame as an Array of Series.
   #
   # @return [Array]
+  #
+  # @example
+  #   df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
+  #   df.get_columns
+  #   # =>
+  #   # [shape: (3,)
+  #   # Series: 'foo' [i64]
+  #   # [
+  #   #         1
+  #   #         2
+  #   #         3
+  #   # ], shape: (3,)
+  #   # Series: 'bar' [i64]
+  #   # [
+  #   #         4
+  #   #         5
+  #   #         6
+  #   # ]]
+  #
+  # @example
+  #   df = Polars::DataFrame.new(
+  #     {
+  #       "a" => [1, 2, 3, 4],
+  #       "b" => [0.5, 4, 10, 13],
+  #       "c" => [true, true, false, true]
+  #     }
+  #   )
+  #   df.get_columns
+  #   # =>
+  #   # [shape: (4,)
+  #   # Series: 'a' [i64]
+  #   # [
+  #   #         1
+  #   #         2
+  #   #         3
+  #   #         4
+  #   # ], shape: (4,)
+  #   # Series: 'b' [f64]
+  #   # [
+  #   #         0.5
+  #   #         4.0
+  #   #         10.0
+  #   #         13.0
+  #   # ], shape: (4,)
+  #   # Series: 'c' [bool]
+  #   # [
+  #   #         true
+  #   #         true
+  #   #         false
+  #   #         true
+  #   # ]]
   def get_columns
     _df.get_columns.map { |s| Utils.wrap_s(s) }
  end
@@ -3083,7 +3365,7 @@ module Polars
   #       "c" => [2, 4, 6]
   #     }
   #   )
-  #   df.unpivot(Polars::Selectors.numeric, index: "a")
+  #   df.unpivot(Polars.cs.numeric, index: "a")
   #   # =>
   #   # shape: (6, 3)
  # # ┌─────┬──────────┬───────┐
@@ -3724,14 +4006,32 @@ module Polars
   #   # ╞═════╪═════╪═════╡
   #   # │ 3   ┆ 8   ┆ c   │
   #   # └─────┴─────┴─────┘
-  def max(axis: 0)
-    if axis == 0
-      lazy.max.collect(_eager: true)
-    elsif axis == 1
-      Utils.wrap_s(_df.max_horizontal)
-    else
-      raise ArgumentError, "Axis should be 0 or 1."
-    end
+  def max
+    lazy.max.collect(_eager: true)
+  end
+
+  # Get the maximum value horizontally across columns.
+  #
+  # @return [Series]
+  #
+  # @example
+  #   df = Polars::DataFrame.new(
+  #     {
+  #       "foo" => [1, 2, 3],
+  #       "bar" => [4.0, 5.0, 6.0]
+  #     }
+  #   )
+  #   df.max_horizontal
+  #   # =>
+  #   # shape: (3,)
+  #   # Series: 'max' [f64]
+  #   # [
+  #   #         4.0
+  #   #         5.0
+  #   #         6.0
+  #   # ]
+  def max_horizontal
+    select(max: F.max_horizontal(F.all)).to_series
   end
 
  # Aggregate the columns of this DataFrame to their minimum value.
@@ -3756,22 +4056,35 @@ module Polars
   #   # ╞═════╪═════╪═════╡
   #   # │ 1   ┆ 6   ┆ a   │
   #   # └─────┴─────┴─────┘
-  def min(axis: 0)
-    if axis == 0
-      lazy.min.collect(_eager: true)
-    elsif axis == 1
-      Utils.wrap_s(_df.min_horizontal)
-    else
-      raise ArgumentError, "Axis should be 0 or 1."
-    end
+  def min
+    lazy.min.collect(_eager: true)
   end
 
-  # Aggregate the columns of this DataFrame to their sum value.
+  # Get the minimum value horizontally across columns.
   #
-  # @param axis [Integer]
-  #   Either 0 or 1.
-  # @param null_strategy ["ignore", "propagate"]
-  #   This argument is only used if axis == 1.
+  # @return [Series]
+  #
+  # @example
+  #   df = Polars::DataFrame.new(
+  #     {
+  #       "foo" => [1, 2, 3],
+  #       "bar" => [4.0, 5.0, 6.0]
+  #     }
+  #   )
+  #   df.min_horizontal
+  #   # =>
+  #   # shape: (3,)
+  #   # Series: 'min' [f64]
+  #   # [
+  #   #         1.0
+  #   #         2.0
+  #   #         3.0
+  #   # ]
+  def min_horizontal
+    select(min: F.min_horizontal(F.all)).to_series
+  end
+
+  # Aggregate the columns of this DataFrame to their sum value.
   #
   # @return [DataFrame]
  #
@@ -3793,35 +4106,42 @@ module Polars
   #   # ╞═════╪═════╪══════╡
   #   # │ 6   ┆ 21  ┆ null │
   #   # └─────┴─────┴──────┘
+  def sum
+    lazy.sum.collect(_eager: true)
+  end
+
+  # Sum all values horizontally across columns.
+  #
+  # @param ignore_nulls [Boolean]
+  #   Ignore null values (default).
+  #   If set to `false`, any null value in the input will lead to a null output.
+  #
+  # @return [Series]
   #
   # @example
-  #   df.sum(axis: 1)
+  #   df = Polars::DataFrame.new(
+  #     {
+  #       "foo" => [1, 2, 3],
+  #       "bar" => [4.0, 5.0, 6.0]
+  #     }
+  #   )
+  #   df.sum_horizontal
   #   # =>
   #   # shape: (3,)
-  #   # Series: 'foo' [str]
+  #   # Series: 'sum' [f64]
   #   # [
-  #   #         "16a"
-  #   #         "27b"
-  #   #         "38c"
+  #   #         5.0
+  #   #         7.0
+  #   #         9.0
   #   # ]
-  def sum(axis: 0, null_strategy: "ignore")
-    case axis
-    when 0
-      lazy.sum.collect(_eager: true)
-    when 1
-      Utils.wrap_s(_df.sum_horizontal(null_strategy))
-    else
-      raise ArgumentError, "Axis should be 0 or 1."
-    end
+  def sum_horizontal(ignore_nulls: true)
+    select(
+      sum: F.sum_horizontal(F.all, ignore_nulls: ignore_nulls)
+    ).to_series
   end
 
   # Aggregate the columns of this DataFrame to their mean value.
   #
-  # @param axis [Integer]
-  #   Either 0 or 1.
-  # @param null_strategy ["ignore", "propagate"]
-  #   This argument is only used if axis == 1.
-  #
   # @return [DataFrame]
   #
  # @example
@@ -3842,15 +4162,38 @@ module Polars
   #   # ╞═════╪═════╪══════╡
   #   # │ 2.0 ┆ 7.0 ┆ null │
   #   # └─────┴─────┴──────┘
-  def mean(axis: 0, null_strategy: "ignore")
-    case axis
-    when 0
-      lazy.mean.collect(_eager: true)
-    when 1
-      Utils.wrap_s(_df.mean_horizontal(null_strategy))
-    else
-      raise ArgumentError, "Axis should be 0 or 1."
-    end
+  def mean
+    lazy.mean.collect(_eager: true)
+  end
+
+  # Take the mean of all values horizontally across columns.
+  #
+  # @param ignore_nulls [Boolean]
+  #   Ignore null values (default).
+  #   If set to `false`, any null value in the input will lead to a null output.
+  #
+  # @return [Series]
+  #
+  # @example
+  #   df = Polars::DataFrame.new(
+  #     {
+  #       "foo" => [1, 2, 3],
+  #       "bar" => [4.0, 5.0, 6.0]
+  #     }
+  #   )
+  #   df.mean_horizontal
+  #   # =>
+  #   # shape: (3,)
+  #   # Series: 'mean' [f64]
+  #   # [
+  #   #         2.5
+  #   #         3.5
+  #   #         4.5
+  #   # ]
+  def mean_horizontal(ignore_nulls: true)
+    select(
+      mean: F.mean_horizontal(F.all, ignore_nulls: ignore_nulls)
+    ).to_series
   end
 
  # Aggregate the columns of this DataFrame to their standard deviation value.
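The `axis:`/`null_strategy:` keywords are gone in 0.16: the bare method names keep their columnwise meaning, while row-wise aggregation moves to dedicated `*_horizontal` methods. A sketch of the migration (frame contents illustrative):

    df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4.0, 5.0, 6.0]})
    df.sum                                   # was df.sum(axis: 0)
    df.sum_horizontal                        # was df.sum(axis: 1)
    df.sum_horizontal(ignore_nulls: false)   # was null_strategy: "propagate"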
@@ -4296,7 +4639,7 @@ module Polars
   # @example A horizontal string concatenation:
   #   df = Polars::DataFrame.new(
   #     {
-  #       "a" => ["foo", "bar", 2],
+  #       "a" => ["foo", "bar", nil],
   #       "b" => [1, 2, 3],
   #       "c" => [1.0, 2.0, 3.0]
  # }
@@ -4327,11 +4670,11 @@ module Polars
   #   # true
   #   # true
   #   # ]
-  def fold(&operation)
+  def fold
     acc = to_series(0)
 
     1.upto(width - 1) do |i|
-      acc = operation.call(acc, to_series(i))
+      acc = yield(acc, to_series(i))
    end
    acc
  end
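`fold` now receives its block implicitly via `yield` rather than a captured proc; callers are unaffected:

    df.fold { |s1, s2| s1 + s2 }   # row-wise reduction across columns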
@@ -4843,7 +5186,7 @@ module Polars
   end
 
   # @private
-  def self.expand_hash_scalars(data, schema_overrides: nil, order: nil, nan_to_null: false)
+  def self.expand_hash_scalars(data, schema_overrides: nil, strict: true, order: nil, nan_to_null: false)
    updated_data = {}
    unless data.empty?
  dtypes = schema_overrides || {}
@@ -4852,23 +5195,23 @@ module Polars
       data.each do |name, val|
         dtype = dtypes[name]
         if val.is_a?(Hash) && dtype != Struct
-          updated_data[name] = DataFrame.new(val).to_struct(name)
+          updated_data[name] = DataFrame.new(val, strict: strict).to_struct(name)
         elsif !Utils.arrlen(val).nil?
-          updated_data[name] = Series.new(::String.new(name), val, dtype: dtype)
+          updated_data[name] = Series.new(::String.new(name), val, dtype: dtype, strict: strict)
         elsif val.nil? || [Integer, Float, TrueClass, FalseClass, ::String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
           dtype = Polars::Float64 if val.nil? && dtype.nil?
-          updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
+          updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype, strict: strict).extend_constant(val, array_len - 1)
         else
           raise Todo
         end
       end
     elsif data.values.all? { |val| Utils.arrlen(val) == 0 }
       data.each do |name, val|
-        updated_data[name] = Series.new(name, val, dtype: dtypes[name])
+        updated_data[name] = Series.new(name, val, dtype: dtypes[name], strict: strict)
       end
     elsif data.values.all? { |val| Utils.arrlen(val).nil? }
       data.each do |name, val|
-        updated_data[name] = Series.new(name, [val], dtype: dtypes[name])
+        updated_data[name] = Series.new(name, [val], dtype: dtypes[name], strict: strict)
       end
     end
  end
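These internal constructors now thread `strict:` through to `Series.new`. The observable effect, per the constructor docs above (a sketch; the exact coercion result depends on the target dtype):

    Polars::Series.new("a", [1.5, 2, 3], dtype: Polars::Int64, strict: false)
    # non-matching values are cast where possible, otherwise set to null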
@@ -4876,7 +5219,7 @@ module Polars
   end
 
   # @private
-  def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, nan_to_null: nil)
+  def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, nan_to_null: nil)
     if schema.is_a?(Hash) && !data.empty?
       if !data.all? { |col, _| schema[col] }
  raise ArgumentError, "The given column-schema names do not match the data dictionary"
@@ -4893,9 +5236,9 @@ module Polars
     end
 
     if data.empty? && !schema_overrides.empty?
-      data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], nan_to_null: nan_to_null)._s }
+      data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], strict: strict, nan_to_null: nan_to_null)._s }
     else
-      data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, nan_to_null: nan_to_null).values.map(&:_s)
+      data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null).values.map(&:_s)
     end
 
  data_series = _handle_columns_arg(data_series, columns: column_names, from_hash: true)
@@ -4969,7 +5312,7 @@ module Polars
       end
     end
 
-  def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil)
+  def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil, strict: true)
     rbdf_columns = rbdf.columns
     rbdf_dtypes = rbdf.dtypes
  columns, dtypes = _unpack_schema(
@@ -4985,13 +5328,13 @@ module Polars
     end
 
     column_casts = []
-    columns.each do |col, i|
+    columns.each_with_index do |col, i|
       if dtypes[col] == Categorical # != rbdf_dtypes[i]
-        column_casts << Polars.col(col).cast(Categorical)._rbexpr
+        column_casts << Polars.col(col).cast(Categorical, strict: strict)._rbexpr
       elsif structs&.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
-        column_casts << Polars.col(col).cast(structs[col])._rbexpr
+        column_casts << Polars.col(col).cast(structs[col], strict: strict)._rbexpr
       elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
-        column_casts << Polars.col(col).cast(dtypes[col])._rbexpr
+        column_casts << Polars.col(col).cast(dtypes[col], strict: strict)._rbexpr
       end
     end
 
 
@@ -5010,12 +5353,11 @@ module Polars
       end
     end
 
   # @private
-  def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 50)
-    raise Todo if schema_overrides
+  def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: 50)
     columns = schema
     if data.length == 0
-      return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
+      return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides, strict: strict)
     end
 
  if data[0].is_a?(Series)
@@ -5048,7 +5390,7 @@ module Polars
         schema, schema_overrides: schema_overrides, n_expected: first_element.length
       )
       local_schema_override = (
-        schema_overrides.any? ? (raise Todo) : {}
+        schema_overrides.any? ? _include_unknowns(schema_overrides, column_names) : {}
      )
      if column_names.any? && first_element.length > 0 && first_element.length != column_names.length
  end
@@ -5056,7 +5398,11 @@ module Polars
 
       unpack_nested = false
       local_schema_override.each do |col, tp|
-        raise Todo
+        if [Categorical, Enum].include?(tp)
+          local_schema_override[col] = String
+        elsif !unpack_nested && [Unknown, Struct].include?(tp.base_type)
+          raise Todo
+        end
       end
 
5058
5400
  local_schema_override.each do |col, tp|
5059
- raise Todo
5401
+ if [Categorical, Enum].include?(tp)
5402
+ local_schema_override[col] = String
5403
+ elsif !unpack_nested && [Unknown, Struct].include?(tp.base_type)
5404
+ raise Todo
5405
+ end
5060
5406
  end
5061
5407
 
5062
5408
  if unpack_nested
@@ -5070,7 +5416,7 @@ module Polars
       end
       if column_names.any? || schema_overrides.any?
         rbdf = _post_apply_columns(
-          rbdf, column_names, schema_overrides: schema_overrides
+          rbdf, column_names, schema_overrides: schema_overrides, strict: strict
         )
       end
  return rbdf
@@ -5080,7 +5426,7 @@ module Polars
       )
       data_series =
         data.map.with_index do |element, i|
-          Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]])._s
+          Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]], strict: strict)._s
         end
       return RbDataFrame.new(data_series)
  else
@@ -5093,7 +5439,12 @@ module Polars
     end
 
   # @private
-  def self.series_to_rbdf(data, schema: nil, schema_overrides: nil)
+  def self._include_unknowns(schema, cols)
+    cols.to_h { |col| [col, schema[col] || Unknown] }
+  end
+
+  # @private
+  def self.series_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true)
    data_series = [data._s]
    series_name = data_series.map(&:name)
  column_names, schema_overrides = _unpack_schema(
@@ -5102,7 +5453,7 @@ module Polars
     if schema_overrides.any?
       new_dtype = schema_overrides.values[0]
       if new_dtype != data.dtype
-        data_series[0] = data_series[0].cast(new_dtype, true)
+        data_series[0] = data_series[0].cast(new_dtype, strict)
       end
     end