polars-df 0.13.0-x86_64-linux → 0.15.0-x86_64-linux

Files changed (43)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +30 -0
  3. data/Cargo.lock +1368 -319
  4. data/LICENSE-THIRD-PARTY.txt +24801 -13447
  5. data/LICENSE.txt +1 -0
  6. data/README.md +1 -2
  7. data/lib/polars/3.1/polars.so +0 -0
  8. data/lib/polars/3.2/polars.so +0 -0
  9. data/lib/polars/3.3/polars.so +0 -0
  10. data/lib/polars/batched_csv_reader.rb +0 -2
  11. data/lib/polars/binary_expr.rb +133 -9
  12. data/lib/polars/binary_name_space.rb +101 -6
  13. data/lib/polars/config.rb +4 -0
  14. data/lib/polars/data_frame.rb +285 -62
  15. data/lib/polars/data_type_group.rb +28 -0
  16. data/lib/polars/data_types.rb +2 -0
  17. data/lib/polars/date_time_expr.rb +244 -0
  18. data/lib/polars/date_time_name_space.rb +87 -0
  19. data/lib/polars/expr.rb +109 -8
  20. data/lib/polars/functions/as_datatype.rb +51 -2
  21. data/lib/polars/functions/col.rb +1 -1
  22. data/lib/polars/functions/eager.rb +1 -3
  23. data/lib/polars/functions/lazy.rb +88 -10
  24. data/lib/polars/functions/range/time_range.rb +21 -21
  25. data/lib/polars/io/csv.rb +14 -16
  26. data/lib/polars/io/database.rb +2 -2
  27. data/lib/polars/io/ipc.rb +14 -12
  28. data/lib/polars/io/ndjson.rb +10 -0
  29. data/lib/polars/io/parquet.rb +168 -111
  30. data/lib/polars/lazy_frame.rb +649 -15
  31. data/lib/polars/list_name_space.rb +169 -0
  32. data/lib/polars/selectors.rb +1144 -0
  33. data/lib/polars/series.rb +470 -40
  34. data/lib/polars/string_cache.rb +27 -1
  35. data/lib/polars/string_expr.rb +0 -1
  36. data/lib/polars/string_name_space.rb +73 -3
  37. data/lib/polars/struct_name_space.rb +31 -7
  38. data/lib/polars/utils/various.rb +5 -1
  39. data/lib/polars/utils.rb +45 -10
  40. data/lib/polars/version.rb +1 -1
  41. data/lib/polars.rb +2 -1
  42. metadata +4 -3
  43. data/lib/polars/functions.rb +0 -57
data/lib/polars/data_frame.rb

@@ -8,17 +8,49 @@ module Polars
 
     # Create a new DataFrame.
     #
-    # @param data [Hash, Array, Series, nil]
-    #   Two-dimensional data in various forms. Hash must contain Arrays.
-    #   Array may contain Series.
-    # @param columns [Array, Hash, nil]
-    #   Column labels to use for resulting DataFrame. If specified, overrides any
-    #   labels already present in the data. Must match data dimensions.
-    # @param orient ["col", "row", nil]
-    #   Whether to interpret two-dimensional data as columns or as rows. If `nil`,
+    # @param data [Object]
+    #   Two-dimensional data in various forms; hash input must contain arrays
+    #   or a range. Arrays may contain Series or other arrays.
+    # @param schema [Object]
+    #   The schema of the resulting DataFrame. The schema may be declared in several
+    #   ways:
+    #
+    #   * As a hash of name:type pairs; if type is nil, it will be auto-inferred.
+    #   * As an array of column names; in this case types are automatically inferred.
+    #   * As an array of (name,type) pairs; this is equivalent to the hash form.
+    #
+    #   If you supply a list of column names that does not match the names in the
+    #   underlying data, the names given here will overwrite them. The number
+    #   of names given in the schema should match the underlying data dimensions.
+    #
+    #   If set to `nil` (default), the schema is inferred from the data.
+    # @param schema_overrides [Hash]
+    #   Support type specification or override of one or more columns; note that
+    #   any dtypes inferred from the schema param will be overridden.
+    #
+    #   The number of entries in the schema should match the underlying data
+    #   dimensions, unless an array of hashes is being passed, in which case
+    #   a *partial* schema can be declared to prevent specific fields from being loaded.
+    # @param strict [Boolean]
+    #   Throw an error if any `data` value does not exactly match the given or inferred
+    #   data type for that column. If set to `false`, values that do not match the data
+    #   type are cast to that data type or, if casting is not possible, set to null
+    #   instead.
+    # @param orient ["col", "row"]
+    #   Whether to interpret two-dimensional data as columns or as rows. If `nil`,
     #   the orientation is inferred by matching the columns and data dimensions. If
     #   this does not yield conclusive results, column orientation is used.
-    def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
+    # @param infer_schema_length [Integer]
+    #   The maximum number of rows to scan for schema inference. If set to `nil`, the
+    #   full data may be scanned *(this can be slow)*. This parameter only applies if
+    #   the input data is a sequence or generator of rows; other input is read as-is.
+    # @param nan_to_null [Boolean]
+    #   If the data comes from one or more Numo arrays, can optionally convert input
+    #   data NaN values to null instead. This is a no-op for all other input data.
+    def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: 100, nan_to_null: false)
+      if schema && columns
+        warn "columns is ignored when schema is passed"
+      end
       schema ||= columns
 
       if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
@@ -29,11 +61,17 @@ module Polars
         self._df = self.class.hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
       elsif data.is_a?(Hash)
         data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
-        self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
+        self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null)
       elsif data.is_a?(::Array)
-        self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
+        self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict, orient: orient, infer_schema_length: infer_schema_length)
       elsif data.is_a?(Series)
-        self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
+        self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, strict: strict)
+      elsif data.respond_to?(:arrow_c_stream)
+        # This uses the fact that RbSeries.from_arrow_c_stream will create a
+        # struct-typed Series. Then we unpack that to a DataFrame.
+        tmp_col_name = ""
+        s = Utils.wrap_s(RbSeries.from_arrow_c_stream(data))
+        self._df = s.to_frame(tmp_col_name).unnest(tmp_col_name)._df
       else
         raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
       end
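For reference, the schema forms and the `strict:` flag documented in this hunk can be exercised as below; a sketch based on the doc text above, with illustrative literal values:

```ruby
require "polars-df"

# Hash form: name => dtype pairs; a nil dtype is auto-inferred.
Polars::DataFrame.new(
  {"a" => [1, 2], "b" => ["x", "y"]},
  schema: {"a" => Polars::Int16, "b" => nil}
)

# Array-of-names form: dtypes are inferred, and names that differ from the
# keys in the data overwrite them.
Polars::DataFrame.new({"a" => [1, 2], "b" => ["x", "y"]}, schema: ["c", "d"])

# strict: false casts mismatched values where possible and nulls the rest,
# instead of raising.
Polars::DataFrame.new({"a" => [1, "2", 3.5]}, schema: {"a" => Polars::Int64}, strict: false)
```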
@@ -452,6 +490,11 @@ module Polars
       end
     end
 
+    # @private
+    def arrow_c_stream
+      _df.arrow_c_stream
+    end
+
     # Return the dataframe as a scalar.
     #
     # Equivalent to `df[0,0]`, with a check that the shape is (1,1).
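Because `DataFrame` itself now exposes `arrow_c_stream`, any such object (including another frame) can be fed to the constructor's `respond_to?(:arrow_c_stream)` branch shown earlier. A minimal round-trip sketch:

```ruby
require "polars-df"

df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => ["x", "y", "z"]})

# The frame is read back via the Arrow C stream interface: it arrives as a
# struct-typed Series and is unnested into columns again.
copy = Polars::DataFrame.new(df)
```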
@@ -766,15 +809,18 @@ module Polars
     #   Compression method. Defaults to "uncompressed".
     #
     # @return [nil]
-    def write_avro(file, compression = "uncompressed")
+    def write_avro(file, compression = "uncompressed", name: "")
       if compression.nil?
         compression = "uncompressed"
       end
       if Utils.pathlike?(file)
         file = Utils.normalize_filepath(file)
       end
+      if name.nil?
+        name = ""
+      end
 
-      _df.write_avro(file, compression)
+      _df.write_avro(file, compression, name)
     end
 
     # Write to Arrow IPC binary stream or Feather file.
@@ -785,7 +831,7 @@ module Polars
     #   Compression method. Defaults to "uncompressed".
     #
     # @return [nil]
-    def write_ipc(file, compression: "uncompressed")
+    def write_ipc(file, compression: "uncompressed", compat_level: nil)
       return_bytes = file.nil?
       if return_bytes
         file = StringIO.new
@@ -795,11 +841,15 @@ module Polars
         file = Utils.normalize_filepath(file)
       end
 
+      if compat_level.nil?
+        compat_level = true
+      end
+
       if compression.nil?
         compression = "uncompressed"
       end
 
-      _df.write_ipc(file, compression)
+      _df.write_ipc(file, compression, compat_level)
       return_bytes ? file.string : nil
     end
 
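A `nil` `compat_level` is normalized to `true` before reaching the native writer (what `true` maps to on the Rust side is not visible in this diff). Passing `nil` as the file still returns the encoded bytes, as before:

```ruby
require "polars-df"

df = Polars::DataFrame.new({"a" => [1, 2, 3]})

# Write an IPC file with the default compatibility level.
df.write_ipc("frame.arrow")

# With file = nil the method returns the IPC payload as a String.
bytes = df.write_ipc(nil, compression: "zstd")
```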
@@ -826,7 +876,8 @@ module Polars
     #   df.write_ipc_stream("new_file.arrow")
     def write_ipc_stream(
       file,
-      compression: "uncompressed"
+      compression: "uncompressed",
+      compat_level: nil
     )
       return_bytes = file.nil?
       if return_bytes
@@ -836,11 +887,15 @@ module Polars
         file = Utils.normalize_filepath(file)
       end
 
+      if compat_level.nil?
+        compat_level = true
+      end
+
       if compression.nil?
         compression = "uncompressed"
       end
 
-      _df.write_ipc_stream(file, compression)
+      _df.write_ipc_stream(file, compression, compat_level)
       return_bytes ? file.string : nil
     end
 
@@ -1037,6 +1092,10 @@ module Polars
     #
     # @param mapping [Hash]
     #   Key value pairs that map from old name to new name.
+    # @param strict [Boolean]
+    #   Validate that all column names exist in the current schema,
+    #   and throw an exception if any do not. (Note that this parameter
+    #   is a no-op when passing a function to `mapping`.)
     #
     # @return [DataFrame]
     #
@@ -1060,8 +1119,8 @@ module Polars
     #   # │ 2     ┆ 7   ┆ b   │
     #   # │ 3     ┆ 8   ┆ c   │
     #   # └───────┴─────┴─────┘
-    def rename(mapping)
-      lazy.rename(mapping).collect(no_optimization: true)
+    def rename(mapping, strict: true)
+      lazy.rename(mapping, strict: strict).collect(no_optimization: true)
     end
 
    # Insert a Series at a certain column index. This operation is in place.
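With the new flag, unknown keys in `mapping` can be ignored instead of raising. A sketch:

```ruby
require "polars-df"

df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [6, 7, 8]})

df.rename({"foo" => "apple"})                # renames foo -> apple
df.rename({"missing" => "x"}, strict: false) # silently skips the unknown key
# df.rename({"missing" => "x"})              # raises, since strict defaults to true
```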
@@ -2190,6 +2249,11 @@ module Polars
     # @param force_parallel [Boolean]
     #   Force the physical plan to evaluate the computation of both DataFrames up to
     #   the join in parallel.
+    # @param coalesce [Boolean]
+    #   Coalescing behavior (merging of join columns).
+    #   - true: -> Always coalesce join columns.
+    #   - false: -> Never coalesce join columns.
+    #   Note that joining on any other expressions than `col` will turn off coalescing.
     #
     # @return [DataFrame]
     #
@@ -2243,7 +2307,8 @@ module Polars
       suffix: "_right",
       tolerance: nil,
       allow_parallel: true,
-      force_parallel: false
+      force_parallel: false,
+      coalesce: true
     )
       lazy
         .join_asof(
@@ -2258,7 +2323,8 @@ module Polars
           suffix: suffix,
           tolerance: tolerance,
           allow_parallel: allow_parallel,
-          force_parallel: force_parallel
+          force_parallel: force_parallel,
+          coalesce: coalesce
         )
         .collect(no_optimization: true)
     end
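`join_asof` now coalesces the join key by default; `coalesce: false` keeps both sides' keys. A sketch (assuming the usual `strategy:` keyword, which is outside this hunk):

```ruby
require "polars-df"

gdp = Polars::DataFrame.new({"date" => [2016, 2017, 2018], "gdp" => [4164, 4411, 4566]})
pop = Polars::DataFrame.new({"date" => [2016, 2018], "population" => [82.19, 83.12]})

# With coalesce: false both `date` columns survive, the right one suffixed
# with "_right"; with the default (true) they are merged into one column.
pop.join_asof(gdp, on: "date", strategy: "backward", coalesce: false)
```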
@@ -2277,8 +2343,20 @@ module Polars
     #   Join strategy.
     # @param suffix [String]
     #   Suffix to append to columns with a duplicate name.
+    # @param validate ['m:m', 'm:1', '1:m', '1:1']
+    #   Checks if join is of specified type.
+    #   * *many_to_many* - “m:m”: default, does not result in checks
+    #   * *one_to_one* - “1:1”: check if join keys are unique in both left and right datasets
+    #   * *one_to_many* - “1:m”: check if join keys are unique in left dataset
+    #   * *many_to_one* - “m:1”: check if join keys are unique in right dataset
     # @param join_nulls [Boolean]
     #   Join on null values. By default null values will never produce matches.
+    # @param coalesce [Boolean]
+    #   Coalescing behavior (merging of join columns).
+    #   - nil: -> join specific.
+    #   - true: -> Always coalesce join columns.
+    #   - false: -> Never coalesce join columns.
+    #   Note that joining on any other expressions than `col` will turn off coalescing.
     #
     # @return [DataFrame]
     #
@@ -2361,7 +2439,16 @@ module Polars
     #   # ╞═════╪═════╪═════╡
     #   # │ 3   ┆ 8.0 ┆ c   │
     #   # └─────┴─────┴─────┘
-    def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right", join_nulls: false)
+    def join(other,
+      left_on: nil,
+      right_on: nil,
+      on: nil,
+      how: "inner",
+      suffix: "_right",
+      validate: "m:m",
+      join_nulls: false,
+      coalesce: nil
+    )
       lazy
         .join(
           other.lazy,
@@ -2370,7 +2457,9 @@ module Polars
           on: on,
           how: how,
           suffix: suffix,
-          join_nulls: join_nulls
+          validate: validate,
+          join_nulls: join_nulls,
+          coalesce: coalesce
         )
         .collect(no_optimization: true)
     end
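`validate` turns key-uniqueness assumptions into hard checks, and `coalesce: nil` defers the merge behavior to the join strategy. A sketch:

```ruby
require "polars-df"

left = Polars::DataFrame.new({"id" => [1, 2, 3], "l" => ["a", "b", "c"]})
right = Polars::DataFrame.new({"id" => [1, 2, 3], "r" => [10, 20, 30]})

# Raises if the join keys are not unique on both sides.
left.join(right, on: "id", how: "inner", validate: "1:1")

# Keeps both key columns ("id" and "id_right") instead of merging them.
left.join(right, on: "id", coalesce: false)
```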
@@ -2426,15 +2515,15 @@ module Polars
     #   df.map_rows { |t| t[0] * 2 + t[1] }
     #   # =>
     #   # shape: (3, 1)
-    #   # ┌───────┐
-    #   # │ apply │
-    #   # │ ---   │
-    #   # │ i64   │
-    #   # ╞═══════╡
-    #   # │ 1     │
-    #   # │ 9     │
-    #   # │ 14    │
-    #   # └───────┘
+    #   # ┌─────┐
+    #   # │ map │
+    #   # │ --- │
+    #   # │ i64 │
+    #   # ╞═════╡
+    #   # │ 1   │
+    #   # │ 9   │
+    #   # │ 14  │
+    #   # └─────┘
     def map_rows(return_dtype: nil, inference_size: 256, &f)
       out, is_df = _df.map_rows(f, return_dtype, inference_size)
       if is_df
@@ -2717,10 +2806,85 @@ module Polars
     #   Column to drop.
     #
     # @return [Series]
+    #
+    # @example
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "foo" => [1, 2, 3],
+    #       "bar" => [6, 7, 8],
+    #       "ham" => ["a", "b", "c"]
+    #     }
+    #   )
+    #   df.delete("ham")
+    #   # =>
+    #   # shape: (3,)
+    #   # Series: 'ham' [str]
+    #   # [
+    #   #         "a"
+    #   #         "b"
+    #   #         "c"
+    #   # ]
+    #
+    # @example
+    #   df.delete("missing")
+    #   # => nil
     def delete(name)
       drop_in_place(name) if include?(name)
     end
 
+    # Cast DataFrame column(s) to the specified dtype(s).
+    #
+    # @param dtypes [Object]
+    #   Mapping of column names (or selector) to dtypes, or a single dtype
+    #   to which all columns will be cast.
+    # @param strict [Boolean]
+    #   Throw an error if a cast could not be done (for instance, due to an
+    #   overflow).
+    #
+    # @return [DataFrame]
+    #
+    # @example Cast specific frame columns to the specified dtypes:
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "foo" => [1, 2, 3],
+    #       "bar" => [6.0, 7.0, 8.0],
+    #       "ham" => [Date.new(2020, 1, 2), Date.new(2021, 3, 4), Date.new(2022, 5, 6)]
+    #     }
+    #   )
+    #   df.cast({"foo" => Polars::Float32, "bar" => Polars::UInt8})
+    #   # =>
+    #   # shape: (3, 3)
+    #   # ┌─────┬─────┬────────────┐
+    #   # │ foo ┆ bar ┆ ham        │
+    #   # │ --- ┆ --- ┆ ---        │
+    #   # │ f32 ┆ u8  ┆ date       │
+    #   # ╞═════╪═════╪════════════╡
+    #   # │ 1.0 ┆ 6   ┆ 2020-01-02 │
+    #   # │ 2.0 ┆ 7   ┆ 2021-03-04 │
+    #   # │ 3.0 ┆ 8   ┆ 2022-05-06 │
+    #   # └─────┴─────┴────────────┘
+    #
+    # @example Cast all frame columns matching one dtype (or dtype group) to another dtype:
+    #   df.cast({Polars::Date => Polars::Datetime})
+    #   # =>
+    #   # shape: (3, 3)
+    #   # ┌─────┬─────┬─────────────────────┐
+    #   # │ foo ┆ bar ┆ ham                 │
+    #   # │ --- ┆ --- ┆ ---                 │
+    #   # │ i64 ┆ f64 ┆ datetime[μs]        │
+    #   # ╞═════╪═════╪═════════════════════╡
+    #   # │ 1   ┆ 6.0 ┆ 2020-01-02 00:00:00 │
+    #   # │ 2   ┆ 7.0 ┆ 2021-03-04 00:00:00 │
+    #   # │ 3   ┆ 8.0 ┆ 2022-05-06 00:00:00 │
+    #   # └─────┴─────┴─────────────────────┘
+    #
+    # @example Cast all frame columns to the specified dtype:
+    #   df.cast(Polars::String).to_h(as_series: false)
+    #   # => {"foo"=>["1", "2", "3"], "bar"=>["6.0", "7.0", "8.0"], "ham"=>["2020-01-02", "2021-03-04", "2022-05-06"]}
+    def cast(dtypes, strict: true)
+      lazy.cast(dtypes, strict: strict).collect(_eager: true)
+    end
+
     # Create an empty copy of the current DataFrame.
     #
     # Returns a DataFrame with identical schema but no data.
@@ -2775,6 +2939,57 @@ module Polars
     # Get the DataFrame as an Array of Series.
     #
     # @return [Array]
+    #
+    # @example
+    #   df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
+    #   df.get_columns
+    #   # =>
+    #   # [shape: (3,)
+    #   # Series: 'foo' [i64]
+    #   # [
+    #   #         1
+    #   #         2
+    #   #         3
+    #   # ], shape: (3,)
+    #   # Series: 'bar' [i64]
+    #   # [
+    #   #         4
+    #   #         5
+    #   #         6
+    #   # ]]
+    #
+    # @example
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "a" => [1, 2, 3, 4],
+    #       "b" => [0.5, 4, 10, 13],
+    #       "c" => [true, true, false, true]
+    #     }
+    #   )
+    #   df.get_columns
+    #   # =>
+    #   # [shape: (4,)
+    #   # Series: 'a' [i64]
+    #   # [
+    #   #         1
+    #   #         2
+    #   #         3
+    #   #         4
+    #   # ], shape: (4,)
+    #   # Series: 'b' [f64]
+    #   # [
+    #   #         0.5
+    #   #         4.0
+    #   #         10.0
+    #   #         13.0
+    #   # ], shape: (4,)
+    #   # Series: 'c' [bool]
+    #   # [
+    #   #         true
+    #   #         true
+    #   #         false
+    #   #         true
+    #   # ]]
     def get_columns
       _df.get_columns.map { |s| Utils.wrap_s(s) }
     end
@@ -3083,7 +3298,7 @@ module Polars
     #       "c" => [2, 4, 6]
     #     }
     #   )
-    #   df.unpivot(Polars::Selectors.numeric, index: "a")
+    #   df.unpivot(Polars.cs.numeric, index: "a")
     #   # =>
     #   # shape: (6, 3)
     #   # ┌─────┬──────────┬───────┐
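This release also ships the new selectors module (`selectors.rb`, +1144 lines in the file list), with `Polars.cs` as the shorthand entry point used above. A sketch, assuming `numeric` selects all numeric columns as in the unpivot example:

```ruby
require "polars-df"

df = Polars::DataFrame.new(
  {
    "a" => ["x", "y", "z"],
    "b" => [1, 3, 5],
    "c" => [2, 4, 6]
  }
)

# Pick out the numeric columns via the column-selector namespace.
df.select(Polars.cs.numeric)

# Selectors compose with other operations, as in the docs above:
df.unpivot(Polars.cs.numeric, index: "a")
```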
@@ -4234,7 +4449,7 @@ module Polars
       if n.nil? && !frac.nil?
         frac = Series.new("frac", [frac]) unless frac.is_a?(Series)
 
-        _from_rbdf(
+        return _from_rbdf(
           _df.sample_frac(frac._s, with_replacement, shuffle, seed)
         )
       end
@@ -4296,7 +4511,7 @@ module Polars
     # @example A horizontal string concatenation:
     #   df = Polars::DataFrame.new(
     #     {
-    #       "a" => ["foo", "bar", 2],
+    #       "a" => ["foo", "bar", nil],
     #       "b" => [1, 2, 3],
     #       "c" => [1.0, 2.0, 3.0]
     #     }
@@ -4327,11 +4542,11 @@ module Polars
     #   #         true
     #   #         true
     #   # ]
-    def fold(&operation)
+    def fold
       acc = to_series(0)
 
       1.upto(width - 1) do |i|
-        acc = operation.call(acc, to_series(i))
+        acc = yield(acc, to_series(i))
       end
       acc
     end
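Swapping the captured `&operation` block for `yield` avoids allocating a `Proc` on every `fold` call; behavior is unchanged. For reference, a horizontal sum:

```ruby
require "polars-df"

df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [10, 20, 30]})

# acc starts as the first column, then each remaining column is folded in.
df.fold { |acc, s| acc + s }
# => a Series containing [11, 22, 33]
```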
@@ -4843,7 +5058,7 @@ module Polars
     end
 
     # @private
-    def self.expand_hash_scalars(data, schema_overrides: nil, order: nil, nan_to_null: false)
+    def self.expand_hash_scalars(data, schema_overrides: nil, strict: true, order: nil, nan_to_null: false)
       updated_data = {}
       unless data.empty?
         dtypes = schema_overrides || {}
@@ -4852,23 +5067,23 @@ module Polars
          data.each do |name, val|
            dtype = dtypes[name]
            if val.is_a?(Hash) && dtype != Struct
-              updated_data[name] = DataFrame.new(val).to_struct(name)
+              updated_data[name] = DataFrame.new(val, strict: strict).to_struct(name)
            elsif !Utils.arrlen(val).nil?
-              updated_data[name] = Series.new(::String.new(name), val, dtype: dtype)
+              updated_data[name] = Series.new(::String.new(name), val, dtype: dtype, strict: strict)
            elsif val.nil? || [Integer, Float, TrueClass, FalseClass, ::String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
              dtype = Polars::Float64 if val.nil? && dtype.nil?
-              updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
+              updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype, strict: strict).extend_constant(val, array_len - 1)
            else
              raise Todo
            end
          end
        elsif data.values.all? { |val| Utils.arrlen(val) == 0 }
          data.each do |name, val|
-            updated_data[name] = Series.new(name, val, dtype: dtypes[name])
+            updated_data[name] = Series.new(name, val, dtype: dtypes[name], strict: strict)
          end
        elsif data.values.all? { |val| Utils.arrlen(val).nil? }
          data.each do |name, val|
-            updated_data[name] = Series.new(name, [val], dtype: dtypes[name])
+            updated_data[name] = Series.new(name, [val], dtype: dtypes[name], strict: strict)
          end
        end
      end
@@ -4876,7 +5091,7 @@ module Polars
     end
 
     # @private
-    def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, nan_to_null: nil)
+    def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, nan_to_null: nil)
       if schema.is_a?(Hash) && !data.empty?
         if !data.all? { |col, _| schema[col] }
           raise ArgumentError, "The given column-schema names do not match the data dictionary"
@@ -4893,9 +5108,9 @@ module Polars
       end
 
       if data.empty? && !schema_overrides.empty?
-        data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], nan_to_null: nan_to_null)._s }
+        data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], strict: strict, nan_to_null: nan_to_null)._s }
       else
-        data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, nan_to_null: nan_to_null).values.map(&:_s)
+        data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null).values.map(&:_s)
       end
 
       data_series = _handle_columns_arg(data_series, columns: column_names, from_hash: true)
@@ -4969,7 +5184,7 @@ module Polars
       end
     end
 
-    def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil)
+    def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil, strict: true)
      rbdf_columns = rbdf.columns
      rbdf_dtypes = rbdf.dtypes
      columns, dtypes = _unpack_schema(
@@ -4985,13 +5200,13 @@ module Polars
      end
 
      column_casts = []
-      columns.each do |col, i|
+      columns.each_with_index do |col, i|
        if dtypes[col] == Categorical # != rbdf_dtypes[i]
-          column_casts << Polars.col(col).cast(Categorical)._rbexpr
+          column_casts << Polars.col(col).cast(Categorical, strict: strict)._rbexpr
        elsif structs&.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
-          column_casts << Polars.col(col).cast(structs[col])._rbexpr
+          column_casts << Polars.col(col).cast(structs[col], strict: strict)._rbexpr
        elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
-          column_casts << Polars.col(col).cast(dtypes[col])._rbexpr
+          column_casts << Polars.col(col).cast(dtypes[col], strict: strict)._rbexpr
        end
      end
@@ -5010,12 +5225,11 @@ module Polars
     end
 
     # @private
-    def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 50)
-      raise Todo if schema_overrides
+    def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, orient: nil, infer_schema_length: 50)
       columns = schema
 
       if data.length == 0
-        return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
+        return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides, strict: strict)
       end
 
       if data[0].is_a?(Series)
@@ -5028,7 +5242,7 @@ module Polars
       elsif data[0].is_a?(Hash)
         column_names, dtypes = _unpack_schema(columns)
         schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
-        rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, false, infer_schema_length)
+        rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, strict, infer_schema_length)
         if column_names
           rbdf = _post_apply_columns(rbdf, column_names)
         end
@@ -5048,7 +5262,7 @@ module Polars
         schema, schema_overrides: schema_overrides, n_expected: first_element.length
       )
       local_schema_override = (
-        schema_overrides.any? ? (raise Todo) : {}
+        schema_overrides.any? ? _include_unknowns(schema_overrides, column_names) : {}
       )
       if column_names.any? && first_element.length > 0 && first_element.length != column_names.length
         raise ArgumentError, "the row data does not match the number of columns"
@@ -5056,7 +5270,11 @@ module Polars
 
       unpack_nested = false
       local_schema_override.each do |col, tp|
-        raise Todo
+        if [Categorical, Enum].include?(tp)
+          local_schema_override[col] = String
+        elsif !unpack_nested && [Unknown, Struct].include?(tp.base_type)
+          raise Todo
+        end
       end
 
       if unpack_nested
@@ -5070,7 +5288,7 @@ module Polars
       end
       if column_names.any? || schema_overrides.any?
         rbdf = _post_apply_columns(
-          rbdf, column_names, schema_overrides: schema_overrides
+          rbdf, column_names, schema_overrides: schema_overrides, strict: strict
         )
       end
       return rbdf
@@ -5080,7 +5298,7 @@ module Polars
       )
       data_series =
         data.map.with_index do |element, i|
-          Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]])._s
+          Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]], strict: strict)._s
         end
       return RbDataFrame.new(data_series)
     else
@@ -5093,7 +5311,12 @@ module Polars
     end
 
     # @private
-    def self.series_to_rbdf(data, schema: nil, schema_overrides: nil)
+    def self._include_unknowns(schema, cols)
+      cols.to_h { |col| [col, schema[col] || Unknown] }
+    end
+
+    # @private
+    def self.series_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true)
       data_series = [data._s]
       series_name = data_series.map(&:name)
       column_names, schema_overrides = _unpack_schema(
@@ -5102,7 +5325,7 @@ module Polars
       if schema_overrides.any?
         new_dtype = schema_overrides.values[0]
         if new_dtype != data.dtype
-          data_series[0] = data_series[0].cast(new_dtype, true)
+          data_series[0] = data_series[0].cast(new_dtype, strict)
         end
       end
 
data/lib/polars/data_type_group.rb (new file)

@@ -0,0 +1,28 @@
+module Polars
+  class DataTypeGroup < Set
+  end
+
+  SIGNED_INTEGER_DTYPES = DataTypeGroup.new(
+    [
+      Int8,
+      Int16,
+      Int32,
+      Int64
+    ]
+  )
+  UNSIGNED_INTEGER_DTYPES = DataTypeGroup.new(
+    [
+      UInt8,
+      UInt16,
+      UInt32,
+      UInt64
+    ]
+  )
+  INTEGER_DTYPES = (
+    SIGNED_INTEGER_DTYPES | UNSIGNED_INTEGER_DTYPES
+  )
+  FLOAT_DTYPES = DataTypeGroup.new([Float32, Float64])
+  NUMERIC_DTYPES = DataTypeGroup.new(
+    FLOAT_DTYPES + INTEGER_DTYPES | [Decimal]
+  )
+end
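`DataTypeGroup` subclasses `Set`, so groups support membership tests and set algebra, and the `cast` docs above mention that a dtype group can stand in for a single dtype as a mapping key. A sketch (the group-keyed cast follows that doc wording; it is not shown directly in this diff):

```ruby
require "polars-df"

Polars::NUMERIC_DTYPES.include?(Polars::Float32) # => true

df = Polars::DataFrame.new({"i" => [1, 2], "f" => [1.5, 2.5]})

# Cast every integer column to Float64 in one go, using a group as the key.
df.cast({Polars::INTEGER_DTYPES => Polars::Float64})
```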
data/lib/polars/data_types.rb

@@ -292,6 +292,8 @@ module Polars
 
   # A categorical encoding of a set of strings.
   class Categorical < DataType
+    attr_reader :ordering
+
     def initialize(ordering = "physical")
       @ordering = ordering
     end
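The new reader simply exposes the value set in the constructor:

```ruby
require "polars-df"

Polars::Categorical.new.ordering            # => "physical"
Polars::Categorical.new("lexical").ordering # => "lexical"
```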