polars-df 0.13.0-x64-mingw-ucrt → 0.15.0-x64-mingw-ucrt

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +30 -0
  3. data/Cargo.lock +1368 -319
  4. data/LICENSE-THIRD-PARTY.txt +24818 -14217
  5. data/LICENSE.txt +1 -0
  6. data/README.md +1 -2
  7. data/lib/polars/3.1/polars.so +0 -0
  8. data/lib/polars/3.2/polars.so +0 -0
  9. data/lib/polars/3.3/polars.so +0 -0
  10. data/lib/polars/batched_csv_reader.rb +0 -2
  11. data/lib/polars/binary_expr.rb +133 -9
  12. data/lib/polars/binary_name_space.rb +101 -6
  13. data/lib/polars/config.rb +4 -0
  14. data/lib/polars/data_frame.rb +285 -62
  15. data/lib/polars/data_type_group.rb +28 -0
  16. data/lib/polars/data_types.rb +2 -0
  17. data/lib/polars/date_time_expr.rb +244 -0
  18. data/lib/polars/date_time_name_space.rb +87 -0
  19. data/lib/polars/expr.rb +109 -8
  20. data/lib/polars/functions/as_datatype.rb +51 -2
  21. data/lib/polars/functions/col.rb +1 -1
  22. data/lib/polars/functions/eager.rb +1 -3
  23. data/lib/polars/functions/lazy.rb +88 -10
  24. data/lib/polars/functions/range/time_range.rb +21 -21
  25. data/lib/polars/io/csv.rb +14 -16
  26. data/lib/polars/io/database.rb +2 -2
  27. data/lib/polars/io/ipc.rb +14 -12
  28. data/lib/polars/io/ndjson.rb +10 -0
  29. data/lib/polars/io/parquet.rb +168 -111
  30. data/lib/polars/lazy_frame.rb +649 -15
  31. data/lib/polars/list_name_space.rb +169 -0
  32. data/lib/polars/selectors.rb +1144 -0
  33. data/lib/polars/series.rb +470 -40
  34. data/lib/polars/string_cache.rb +27 -1
  35. data/lib/polars/string_expr.rb +0 -1
  36. data/lib/polars/string_name_space.rb +73 -3
  37. data/lib/polars/struct_name_space.rb +31 -7
  38. data/lib/polars/utils/various.rb +5 -1
  39. data/lib/polars/utils.rb +45 -10
  40. data/lib/polars/version.rb +1 -1
  41. data/lib/polars.rb +2 -1
  42. metadata +4 -3
  43. data/lib/polars/functions.rb +0 -57
data/lib/polars/data_frame.rb
@@ -8,17 +8,49 @@ module Polars

  # Create a new DataFrame.
  #
- # @param data [Hash, Array, Series, nil]
- #   Two-dimensional data in various forms. Hash must contain Arrays.
- #   Array may contain Series.
- # @param columns [Array, Hash, nil]
- #   Column labels to use for resulting DataFrame. If specified, overrides any
- #   labels already present in the data. Must match data dimensions.
- # @param orient ["col", "row", nil]
- #   Whether to interpret two-dimensional data as columns or as rows. If `nil`,
+ # @param data [Object]
+ #   Two-dimensional data in various forms; hash input must contain arrays
+ #   or a range. Arrays may contain Series or other arrays.
+ # @param schema [Object]
+ #   The schema of the resulting DataFrame. The schema may be declared in several
+ #   ways:
+ #
+ #   * As a hash of name:type pairs; if type is nil, it will be auto-inferred.
+ #   * As an array of column names; in this case types are automatically inferred.
+ #   * As an array of (name, type) pairs; this is equivalent to the hash form.
+ #
+ #   If you supply an array of column names that does not match the names in the
+ #   underlying data, the names given here will overwrite them. The number
+ #   of names given in the schema should match the underlying data dimensions.
+ #
+ #   If set to `nil` (default), the schema is inferred from the data.
+ # @param schema_overrides [Hash]
+ #   Supports type specification or override of one or more columns; note that
+ #   any dtypes inferred from the schema param will be overridden.
+ #
+ #   The number of entries in the schema should match the underlying data
+ #   dimensions, unless an array of hashes is being passed, in which case
+ #   a *partial* schema can be declared to prevent specific fields from being loaded.
+ # @param strict [Boolean]
+ #   Throw an error if any `data` value does not exactly match the given or inferred
+ #   data type for that column. If set to `false`, values that do not match the data
+ #   type are cast to that data type or, if casting is not possible, set to null
+ #   instead.
+ # @param orient ["col", "row", nil]
+ #   Whether to interpret two-dimensional data as columns or as rows. If `nil`,
  #   the orientation is inferred by matching the columns and data dimensions. If
  #   this does not yield conclusive results, column orientation is used.
- def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
+ # @param infer_schema_length [Integer]
+ #   The maximum number of rows to scan for schema inference. If set to `nil`, the
+ #   full data may be scanned *(this can be slow)*. This parameter only applies if
+ #   the input data is a sequence or generator of rows; other input is read as-is.
+ # @param nan_to_null [Boolean]
+ #   If the data comes from one or more Numo arrays, can optionally convert input
+ #   data NaN values to null instead. This is a no-op for all other input data.
+ def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: 100, nan_to_null: false)
+   if schema && columns
+     warn "columns is ignored when schema is passed"
+   end
    schema ||= columns

    if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
@@ -29,11 +61,17 @@ module Polars
      self._df = self.class.hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
    elsif data.is_a?(Hash)
      data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
-     self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
+     self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null)
    elsif data.is_a?(::Array)
-     self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
+     self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, orient: orient, infer_schema_length: infer_schema_length)
    elsif data.is_a?(Series)
-     self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
+     self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict)
+   elsif data.respond_to?(:arrow_c_stream)
+     # This uses the fact that RbSeries.from_arrow_c_stream will create a
+     # struct-typed Series. Then we unpack that to a DataFrame.
+     tmp_col_name = ""
+     s = Utils.wrap_s(RbSeries.from_arrow_c_stream(data))
+     self._df = s.to_frame(tmp_col_name).unnest(tmp_col_name)._df
    else
      raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
    end
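A quick sketch of the new constructor keywords (values are illustrative, not from the diff):

# Passing both schema and columns now warns and prefers schema. With the
# default strict: true, values must exactly match the given or inferred
# dtype; strict: false casts them where possible and falls back to null.
df = Polars::DataFrame.new(
  {"id" => [1, 2, 3]},
  schema: {"id" => Polars::Float64},
  strict: false
)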
@@ -452,6 +490,11 @@ module Polars
    end
  end

+ # @private
+ def arrow_c_stream
+   _df.arrow_c_stream
+ end
+
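Because the constructor above now dispatches on `respond_to?(:arrow_c_stream)`, this private reader gives Arrow-aware objects a standard way in and out of Polars. A sketch of the public side of that path:

df = Polars::DataFrame.new({"a" => [1, 2, 3]})
# Anything exposing arrow_c_stream (including another DataFrame) can be passed
# straight to the constructor; it is read as a struct-typed Series and unnested.
df2 = Polars::DataFrame.new(df)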
  # Return the dataframe as a scalar.
  #
  # Equivalent to `df[0,0]`, with a check that the shape is (1,1).
@@ -766,15 +809,18 @@ module Polars
  #   Compression method. Defaults to "uncompressed".
  #
  # @return [nil]
- def write_avro(file, compression = "uncompressed")
+ def write_avro(file, compression = "uncompressed", name: "")
    if compression.nil?
      compression = "uncompressed"
    end
    if Utils.pathlike?(file)
      file = Utils.normalize_filepath(file)
    end
+   if name.nil?
+     name = ""
+   end

-   _df.write_avro(file, compression)
+   _df.write_avro(file, compression, name)
  end
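The new trailing argument is forwarded to the native Avro writer; by analogy with upstream Polars it appears to name the root Avro record (an assumption, since the diff does not document it):

df = Polars::DataFrame.new({"foo" => [1, 2, 3]})
df.write_avro("data.avro", "snappy", name: "my_record") # record name: assumed semantics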

  # Write to Arrow IPC binary stream or Feather file.
@@ -785,7 +831,7 @@ module Polars
  #   Compression method. Defaults to "uncompressed".
  #
  # @return [nil]
- def write_ipc(file, compression: "uncompressed")
+ def write_ipc(file, compression: "uncompressed", compat_level: nil)
    return_bytes = file.nil?
    if return_bytes
      file = StringIO.new
@@ -795,11 +841,15 @@ module Polars
      file = Utils.normalize_filepath(file)
    end

+   if compat_level.nil?
+     compat_level = true
+   end
+
    if compression.nil?
      compression = "uncompressed"
    end

-   _df.write_ipc(file, compression)
+   _df.write_ipc(file, compression, compat_level)
    return_bytes ? file.string : nil
  end
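`compat_level: nil` is normalized to `true` before reaching the native writer; judging by upstream Polars it selects how modern an Arrow feature set the IPC writer may emit (an assumption, the diff does not say). The same keyword is threaded through `write_ipc_stream` below. A usage sketch:

df = Polars::DataFrame.new({"a" => [1, 2, 3]})
df.write_ipc("data.arrow", compression: "zstd") # compat_level: nil -> true by default
bytes = df.write_ipc(nil)                       # a nil file returns the encoded bytes as a String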

@@ -826,7 +876,8 @@ module Polars
  #   df.write_ipc_stream("new_file.arrow")
  def write_ipc_stream(
    file,
-   compression: "uncompressed"
+   compression: "uncompressed",
+   compat_level: nil
  )
    return_bytes = file.nil?
    if return_bytes
@@ -836,11 +887,15 @@ module Polars
      file = Utils.normalize_filepath(file)
    end

+   if compat_level.nil?
+     compat_level = true
+   end
+
    if compression.nil?
      compression = "uncompressed"
    end

-   _df.write_ipc_stream(file, compression)
+   _df.write_ipc_stream(file, compression, compat_level)
    return_bytes ? file.string : nil
  end

@@ -1037,6 +1092,10 @@ module Polars
  #
  # @param mapping [Hash]
  #   Key value pairs that map from old name to new name.
+ # @param strict [Boolean]
+ #   Validate that all column names exist in the current schema,
+ #   and throw an exception if any do not. (Note that this parameter
+ #   is a no-op when passing a function to `mapping`).
  #
  # @return [DataFrame]
  #
@@ -1060,8 +1119,8 @@ module Polars
  #   # │ 2     ┆ 7   ┆ b   │
  #   # │ 3     ┆ 8   ┆ c   │
  #   # └───────┴─────┴─────┘
- def rename(mapping)
-   lazy.rename(mapping).collect(no_optimization: true)
+ def rename(mapping, strict: true)
+   lazy.rename(mapping, strict: strict).collect(no_optimization: true)
  end
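Per the new `strict` doc above, `strict: false` skips validating that every key in `mapping` exists in the schema, so unknown names are ignored instead of raising. A sketch:

df = Polars::DataFrame.new({"foo" => [1, 2, 3]})
df.rename({"foo" => "apple"})                # renames, as before
df.rename({"missing" => "x"}, strict: false) # no exception; nothing to rename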

  # Insert a Series at a certain column index. This operation is in place.
@@ -2190,6 +2249,11 @@ module Polars
  # @param force_parallel [Boolean]
  #   Force the physical plan to evaluate the computation of both DataFrames up to
  #   the join in parallel.
+ # @param coalesce [Boolean]
+ #   Coalescing behavior (merging of join columns).
+ #   - true: Always coalesce join columns.
+ #   - false: Never coalesce join columns.
+ #   Note that joining on expressions other than `col` will turn off coalescing.
  #
  # @return [DataFrame]
  #
@@ -2243,7 +2307,8 @@ module Polars
    suffix: "_right",
    tolerance: nil,
    allow_parallel: true,
-   force_parallel: false
+   force_parallel: false,
+   coalesce: true
  )
    lazy
      .join_asof(
@@ -2258,7 +2323,8 @@ module Polars
        suffix: suffix,
        tolerance: tolerance,
        allow_parallel: allow_parallel,
-       force_parallel: force_parallel
+       force_parallel: force_parallel,
+       coalesce: coalesce
      )
      .collect(no_optimization: true)
  end
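A sketch of the new `coalesce` flag on `join_asof` (illustrative frames; asof joins still require sorted keys):

gdp = Polars::DataFrame.new({"date" => [Date.new(2020, 1, 1), Date.new(2020, 4, 1)], "gdp" => [100, 110]})
pop = Polars::DataFrame.new({"date" => [Date.new(2020, 2, 15)], "population" => [82.1]})
pop.join_asof(gdp, on: "date", strategy: "backward")                  # single "date" column (coalesce: true default)
pop.join_asof(gdp, on: "date", strategy: "backward", coalesce: false) # keeps the right-hand key as "date_right"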
@@ -2277,8 +2343,20 @@ module Polars
  #   Join strategy.
  # @param suffix [String]
  #   Suffix to append to columns with a duplicate name.
+ # @param validate ["m:m", "m:1", "1:m", "1:1"]
+ #   Checks that the join is of the specified type.
+ #   * *many_to_many* - "m:m": default, does not result in checks
+ #   * *one_to_one* - "1:1": check if join keys are unique in both left and right datasets
+ #   * *one_to_many* - "1:m": check if join keys are unique in left dataset
+ #   * *many_to_one* - "m:1": check if join keys are unique in right dataset
  # @param join_nulls [Boolean]
  #   Join on null values. By default null values will never produce matches.
+ # @param coalesce [Boolean]
+ #   Coalescing behavior (merging of join columns).
+ #   - nil: join-specific default.
+ #   - true: Always coalesce join columns.
+ #   - false: Never coalesce join columns.
+ #   Note that joining on expressions other than `col` will turn off coalescing.
  #
  # @return [DataFrame]
  #
@@ -2361,7 +2439,16 @@ module Polars
  #   # ╞═════╪═════╪═════╡
  #   # │ 3   ┆ 8.0 ┆ c   │
  #   # └─────┴─────┴─────┘
- def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right", join_nulls: false)
+ def join(other,
+   left_on: nil,
+   right_on: nil,
+   on: nil,
+   how: "inner",
+   suffix: "_right",
+   validate: "m:m",
+   join_nulls: false,
+   coalesce: nil
+ )
    lazy
      .join(
        other.lazy,
@@ -2370,7 +2457,9 @@ module Polars
        on: on,
        how: how,
        suffix: suffix,
-       join_nulls: join_nulls
+       validate: validate,
+       join_nulls: join_nulls,
+       coalesce: coalesce
      )
      .collect(no_optimization: true)
  end
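`validate` adds relationship checks on the join keys, while `coalesce: nil` keeps the join-specific default for merging them. A sketch (the second call is expected to raise, since "id" is not unique on the right):

df1 = Polars::DataFrame.new({"id" => [1, 2, 3], "x" => ["a", "b", "c"]})
df2 = Polars::DataFrame.new({"id" => [1, 1, 2], "y" => [10, 11, 12]})
df1.join(df2, on: "id", validate: "1:m") # passes: keys unique on the left
df1.join(df2, on: "id", validate: "1:1") # raises: keys must be unique on both sides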
@@ -2426,15 +2515,15 @@ module Polars
  #   df.map_rows { |t| t[0] * 2 + t[1] }
  #   # =>
  #   # shape: (3, 1)
- #   # ┌───────┐
- #   # │ apply │
- #   # │ ---   │
- #   # │ i64   │
- #   # ╞═══════╡
- #   # │ 1     │
- #   # │ 9     │
- #   # │ 14    │
- #   # └───────┘
+ #   # ┌─────┐
+ #   # │ map │
+ #   # │ --- │
+ #   # │ i64 │
+ #   # ╞═════╡
+ #   # │ 1   │
+ #   # │ 9   │
+ #   # │ 14  │
+ #   # └─────┘
  def map_rows(return_dtype: nil, inference_size: 256, &f)
    out, is_df = _df.map_rows(f, return_dtype, inference_size)
    if is_df
@@ -2717,10 +2806,85 @@ module Polars
  #   Column to drop.
  #
  # @return [Series]
+ #
+ # @example
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "foo" => [1, 2, 3],
+ #       "bar" => [6, 7, 8],
+ #       "ham" => ["a", "b", "c"]
+ #     }
+ #   )
+ #   df.delete("ham")
+ #   # =>
+ #   # shape: (3,)
+ #   # Series: 'ham' [str]
+ #   # [
+ #   #    "a"
+ #   #    "b"
+ #   #    "c"
+ #   # ]
+ #
+ # @example
+ #   df.delete("missing")
+ #   # => nil
  def delete(name)
    drop_in_place(name) if include?(name)
  end

+ # Cast DataFrame column(s) to the specified dtype(s).
+ #
+ # @param dtypes [Object]
+ #   Mapping of column names (or selector) to dtypes, or a single dtype
+ #   to which all columns will be cast.
+ # @param strict [Boolean]
+ #   Throw an error if a cast could not be done (for instance, due to an
+ #   overflow).
+ #
+ # @return [DataFrame]
+ #
+ # @example Cast specific frame columns to the specified dtypes:
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "foo" => [1, 2, 3],
+ #       "bar" => [6.0, 7.0, 8.0],
+ #       "ham" => [Date.new(2020, 1, 2), Date.new(2021, 3, 4), Date.new(2022, 5, 6)]
+ #     }
+ #   )
+ #   df.cast({"foo" => Polars::Float32, "bar" => Polars::UInt8})
+ #   # =>
+ #   # shape: (3, 3)
+ #   # ┌─────┬─────┬────────────┐
+ #   # │ foo ┆ bar ┆ ham        │
+ #   # │ --- ┆ --- ┆ ---        │
+ #   # │ f32 ┆ u8  ┆ date       │
+ #   # ╞═════╪═════╪════════════╡
+ #   # │ 1.0 ┆ 6   ┆ 2020-01-02 │
+ #   # │ 2.0 ┆ 7   ┆ 2021-03-04 │
+ #   # │ 3.0 ┆ 8   ┆ 2022-05-06 │
+ #   # └─────┴─────┴────────────┘
+ #
+ # @example Cast all frame columns matching one dtype (or dtype group) to another dtype:
+ #   df.cast({Polars::Date => Polars::Datetime})
+ #   # =>
+ #   # shape: (3, 3)
+ #   # ┌─────┬─────┬─────────────────────┐
+ #   # │ foo ┆ bar ┆ ham                 │
+ #   # │ --- ┆ --- ┆ ---                 │
+ #   # │ i64 ┆ f64 ┆ datetime[μs]        │
+ #   # ╞═════╪═════╪═════════════════════╡
+ #   # │ 1   ┆ 6.0 ┆ 2020-01-02 00:00:00 │
+ #   # │ 2   ┆ 7.0 ┆ 2021-03-04 00:00:00 │
+ #   # │ 3   ┆ 8.0 ┆ 2022-05-06 00:00:00 │
+ #   # └─────┴─────┴─────────────────────┘
+ #
+ # @example Cast all frame columns to the specified dtype:
+ #   df.cast(Polars::String).to_h(as_series: false)
+ #   # => {"foo"=>["1", "2", "3"], "bar"=>["6.0", "7.0", "8.0"], "ham"=>["2020-01-02", "2021-03-04", "2022-05-06"]}
+ def cast(dtypes, strict: true)
+   lazy.cast(dtypes, strict: strict).collect(_eager: true)
+ end
+
  # Create an empty copy of the current DataFrame.
  #
  # Returns a DataFrame with identical schema but no data.
@@ -2775,6 +2939,57 @@ module Polars
  # Get the DataFrame as an Array of Series.
  #
  # @return [Array]
+ #
+ # @example
+ #   df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
+ #   df.get_columns
+ #   # =>
+ #   # [shape: (3,)
+ #   # Series: 'foo' [i64]
+ #   # [
+ #   #    1
+ #   #    2
+ #   #    3
+ #   # ], shape: (3,)
+ #   # Series: 'bar' [i64]
+ #   # [
+ #   #    4
+ #   #    5
+ #   #    6
+ #   # ]]
+ #
+ # @example
+ #   df = Polars::DataFrame.new(
+ #     {
+ #       "a" => [1, 2, 3, 4],
+ #       "b" => [0.5, 4, 10, 13],
+ #       "c" => [true, true, false, true]
+ #     }
+ #   )
+ #   df.get_columns
+ #   # =>
+ #   # [shape: (4,)
+ #   # Series: 'a' [i64]
+ #   # [
+ #   #    1
+ #   #    2
+ #   #    3
+ #   #    4
+ #   # ], shape: (4,)
+ #   # Series: 'b' [f64]
+ #   # [
+ #   #    0.5
+ #   #    4.0
+ #   #    10.0
+ #   #    13.0
+ #   # ], shape: (4,)
+ #   # Series: 'c' [bool]
+ #   # [
+ #   #    true
+ #   #    true
+ #   #    false
+ #   #    true
+ #   # ]]
  def get_columns
    _df.get_columns.map { |s| Utils.wrap_s(s) }
  end
@@ -3083,7 +3298,7 @@ module Polars
  #       "c" => [2, 4, 6]
  #     }
  #   )
- #   df.unpivot(Polars::Selectors.numeric, index: "a")
+ #   df.unpivot(Polars.cs.numeric, index: "a")
  #   # =>
  #   # shape: (6, 3)
  #   # ┌─────┬──────────┬───────┐
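The old `Polars::Selectors` entry point gives way to the `Polars.cs` namespace backed by the new 1,144-line `selectors.rb` (item 32 above). A sketch, assuming the selector names mirror upstream Polars:

df = Polars::DataFrame.new({"a" => ["x", "y"], "b" => [1, 2], "c" => [2.5, 3.5]})
df.select(Polars.cs.numeric)              # columns "b" and "c"
df.unpivot(Polars.cs.numeric, index: "a") # as in the example above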
@@ -4234,7 +4449,7 @@ module Polars
    if n.nil? && !frac.nil?
      frac = Series.new("frac", [frac]) unless frac.is_a?(Series)

-     _from_rbdf(
+     return _from_rbdf(
        _df.sample_frac(frac._s, with_replacement, shuffle, seed)
      )
    end
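The added `return` is a genuine bug fix: previously the `frac` branch built the sampled frame, discarded it, and fell through to the `n` logic below. A sketch of the now-working path (keyword names assumed from the surrounding method):

df = Polars::DataFrame.new({"a" => 1..10})
df.sample(frac: 0.5, seed: 42) # ~5 rows, returned from the frac branch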
@@ -4296,7 +4511,7 @@ module Polars
  # @example A horizontal string concatenation:
  #   df = Polars::DataFrame.new(
  #     {
- #       "a" => ["foo", "bar", 2],
+ #       "a" => ["foo", "bar", nil],
  #       "b" => [1, 2, 3],
  #       "c" => [1.0, 2.0, 3.0]
  #     }
@@ -4327,11 +4542,11 @@ module Polars
  #   #    true
  #   #    true
  #   # ]
- def fold(&operation)
+ def fold
    acc = to_series(0)

    1.upto(width - 1) do |i|
-     acc = operation.call(acc, to_series(i))
+     acc = yield(acc, to_series(i))
    end
    acc
  end
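`fold` now uses an implicit block (`yield`) instead of capturing `&operation`, saving a Proc allocation per call; behavior is unchanged. A usage sketch reducing horizontally across columns:

df = Polars::DataFrame.new({"a" => [1, 2], "b" => [10, 20], "c" => [100, 200]})
df.fold { |acc, s| acc + s } # => Series [111, 222]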
@@ -4843,7 +5058,7 @@ module Polars
  end

  # @private
- def self.expand_hash_scalars(data, schema_overrides: nil, order: nil, nan_to_null: false)
+ def self.expand_hash_scalars(data, schema_overrides: nil, strict: true, order: nil, nan_to_null: false)
    updated_data = {}
    unless data.empty?
      dtypes = schema_overrides || {}
@@ -4852,23 +5067,23 @@ module Polars
      data.each do |name, val|
        dtype = dtypes[name]
        if val.is_a?(Hash) && dtype != Struct
-         updated_data[name] = DataFrame.new(val).to_struct(name)
+         updated_data[name] = DataFrame.new(val, strict: strict).to_struct(name)
        elsif !Utils.arrlen(val).nil?
-         updated_data[name] = Series.new(::String.new(name), val, dtype: dtype)
+         updated_data[name] = Series.new(::String.new(name), val, dtype: dtype, strict: strict)
        elsif val.nil? || [Integer, Float, TrueClass, FalseClass, ::String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
          dtype = Polars::Float64 if val.nil? && dtype.nil?
-         updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
+         updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype, strict: strict).extend_constant(val, array_len - 1)
        else
          raise Todo
        end
      end
    elsif data.values.all? { |val| Utils.arrlen(val) == 0 }
      data.each do |name, val|
-       updated_data[name] = Series.new(name, val, dtype: dtypes[name])
+       updated_data[name] = Series.new(name, val, dtype: dtypes[name], strict: strict)
      end
    elsif data.values.all? { |val| Utils.arrlen(val).nil? }
      data.each do |name, val|
-       updated_data[name] = Series.new(name, [val], dtype: dtypes[name])
+       updated_data[name] = Series.new(name, [val], dtype: dtypes[name], strict: strict)
      end
    end
  end
@@ -4876,7 +5091,7 @@ module Polars
  end

  # @private
- def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, nan_to_null: nil)
+ def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, nan_to_null: nil)
    if schema.is_a?(Hash) && !data.empty?
      if !data.all? { |col, _| schema[col] }
        raise ArgumentError, "The given column-schema names do not match the data dictionary"
@@ -4893,9 +5108,9 @@ module Polars
    end

    if data.empty? && !schema_overrides.empty?
-     data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], nan_to_null: nan_to_null)._s }
+     data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], strict: strict, nan_to_null: nan_to_null)._s }
    else
-     data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, nan_to_null: nan_to_null).values.map(&:_s)
+     data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null).values.map(&:_s)
    end

    data_series = _handle_columns_arg(data_series, columns: column_names, from_hash: true)
@@ -4969,7 +5184,7 @@ module Polars
    end
  end

- def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil)
+ def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil, strict: true)
    rbdf_columns = rbdf.columns
    rbdf_dtypes = rbdf.dtypes
    columns, dtypes = _unpack_schema(
@@ -4985,13 +5200,13 @@ module Polars
    end

    column_casts = []
-   columns.each do |col, i|
+   columns.each_with_index do |col, i|
      if dtypes[col] == Categorical # != rbdf_dtypes[i]
-       column_casts << Polars.col(col).cast(Categorical)._rbexpr
+       column_casts << Polars.col(col).cast(Categorical, strict: strict)._rbexpr
      elsif structs&.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
-       column_casts << Polars.col(col).cast(structs[col])._rbexpr
+       column_casts << Polars.col(col).cast(structs[col], strict: strict)._rbexpr
      elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
-       column_casts << Polars.col(col).cast(dtypes[col])._rbexpr
+       column_casts << Polars.col(col).cast(dtypes[col], strict: strict)._rbexpr
      end
    end

@@ -5010,12 +5225,11 @@ module Polars
  end

  # @private
- def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 50)
-   raise Todo if schema_overrides
+ def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: 50)
    columns = schema

    if data.length == 0
-     return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
+     return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides, strict: strict)
    end

    if data[0].is_a?(Series)
@@ -5028,7 +5242,7 @@ module Polars
    elsif data[0].is_a?(Hash)
      column_names, dtypes = _unpack_schema(columns)
      schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
-     rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, false, infer_schema_length)
+     rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, strict, infer_schema_length)
      if column_names
        rbdf = _post_apply_columns(rbdf, column_names)
      end
@@ -5048,7 +5262,7 @@ module Polars
        schema, schema_overrides: schema_overrides, n_expected: first_element.length
      )
      local_schema_override = (
-       schema_overrides.any? ? (raise Todo) : {}
+       schema_overrides.any? ? _include_unknowns(schema_overrides, column_names) : {}
      )
      if column_names.any? && first_element.length > 0 && first_element.length != column_names.length
        raise ArgumentError, "the row data does not match the number of columns"
@@ -5056,7 +5270,11 @@ module Polars

      unpack_nested = false
      local_schema_override.each do |col, tp|
-       raise Todo
+       if [Categorical, Enum].include?(tp)
+         local_schema_override[col] = String
+       elsif !unpack_nested && [Unknown, Struct].include?(tp.base_type)
+         raise Todo
+       end
      end

      if unpack_nested
@@ -5070,7 +5288,7 @@ module Polars
      end
      if column_names.any? || schema_overrides.any?
        rbdf = _post_apply_columns(
-         rbdf, column_names, schema_overrides: schema_overrides
+         rbdf, column_names, schema_overrides: schema_overrides, strict: strict
        )
      end
      return rbdf
@@ -5080,7 +5298,7 @@ module Polars
      )
      data_series =
        data.map.with_index do |element, i|
-         Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]])._s
+         Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]], strict: strict)._s
        end
      return RbDataFrame.new(data_series)
    else
@@ -5093,7 +5311,12 @@ module Polars
  end

  # @private
- def self.series_to_rbdf(data, schema: nil, schema_overrides: nil)
+ def self._include_unknowns(schema, cols)
+   cols.to_h { |col| [col, schema[col] || Unknown] }
+ end
+
+ # @private
+ def self.series_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true)
    data_series = [data._s]
    series_name = data_series.map(&:name)
    column_names, schema_overrides = _unpack_schema(
@@ -5102,7 +5325,7 @@ module Polars
    if schema_overrides.any?
      new_dtype = schema_overrides.values[0]
      if new_dtype != data.dtype
-       data_series[0] = data_series[0].cast(new_dtype, true)
+       data_series[0] = data_series[0].cast(new_dtype, strict)
      end
    end

data/lib/polars/data_type_group.rb (new file)
@@ -0,0 +1,28 @@
+ module Polars
+   class DataTypeGroup < Set
+   end
+
+   SIGNED_INTEGER_DTYPES = DataTypeGroup.new(
+     [
+       Int8,
+       Int16,
+       Int32,
+       Int64
+     ]
+   )
+   UNSIGNED_INTEGER_DTYPES = DataTypeGroup.new(
+     [
+       UInt8,
+       UInt16,
+       UInt32,
+       UInt64
+     ]
+   )
+   INTEGER_DTYPES = (
+     SIGNED_INTEGER_DTYPES | UNSIGNED_INTEGER_DTYPES
+   )
+   FLOAT_DTYPES = DataTypeGroup.new([Float32, Float64])
+   NUMERIC_DTYPES = DataTypeGroup.new(
+     FLOAT_DTYPES + INTEGER_DTYPES | [Decimal]
+   )
+ end
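Because `DataTypeGroup` is a plain `Set` subclass, the groups compose with ordinary set algebra (as `INTEGER_DTYPES` and `NUMERIC_DTYPES` above already show), and membership is a plain `include?`:

Polars::INTEGER_DTYPES.size                    # => 8 (signed | unsigned)
Polars::NUMERIC_DTYPES.include?(Polars::Int32) # => true
Polars::FLOAT_DTYPES.include?(Polars::Decimal) # => false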
data/lib/polars/data_types.rb
@@ -292,6 +292,8 @@ module Polars

  # A categorical encoding of a set of strings.
  class Categorical < DataType
+   attr_reader :ordering
+
    def initialize(ordering = "physical")
      @ordering = ordering
    end
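With the new reader, the ordering chosen at construction is introspectable:

Polars::Categorical.new.ordering            # => "physical" (default)
Polars::Categorical.new("lexical").ordering # => "lexical"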