polars-df 0.4.0-x86_64-linux → 0.6.0-x86_64-linux

Sign up to get free protection for your applications and to get access to all the features.
@@ -18,7 +18,10 @@ module Polars
18
18
  # Whether to interpret two-dimensional data as columns or as rows. If `nil`,
19
19
  # the orientation is inferred by matching the columns and data dimensions. If
20
20
  # this does not yield conclusive results, column orientation is used.
21
- def initialize(data = nil, columns: nil, orient: nil)
21
+ def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
22
+ schema ||= columns
23
+ raise Todo if schema_overrides
24
+
22
25
  # TODO deprecate in favor of read_sql
23
26
  if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
24
27
  result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
@@ -29,14 +32,14 @@ module Polars
29
32
  end
30
33
 
31
34
  if data.nil?
32
- self._df = self.class.hash_to_rbdf({}, columns: columns)
35
+ self._df = self.class.hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
33
36
  elsif data.is_a?(Hash)
34
37
  data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
35
- self._df = self.class.hash_to_rbdf(data, columns: columns)
36
- elsif data.is_a?(Array)
37
- self._df = self.class.sequence_to_rbdf(data, columns: columns, orient: orient)
38
+ self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
39
+ elsif data.is_a?(::Array)
40
+ self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
38
41
  elsif data.is_a?(Series)
39
- self._df = self.class.series_to_rbdf(data, columns: columns)
42
+ self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
40
43
  else
41
44
  raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
42
45
  end
@@ -56,8 +59,8 @@ module Polars
56
59
  end
57
60
 
58
61
  # @private
59
- def self._from_hash(data, columns: nil)
60
- _from_rbdf(hash_to_rbdf(data, columns: columns))
62
+ def self._from_hash(data, schema: nil, schema_overrides: nil)
63
+ _from_rbdf(hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides))
61
64
  end
62
65
 
63
66
  # def self._from_records
@@ -113,7 +116,7 @@ module Polars
113
116
  dtypes.each do|k, v|
114
117
  dtype_list << [k, Utils.rb_type_to_dtype(v)]
115
118
  end
116
- elsif dtypes.is_a?(Array)
119
+ elsif dtypes.is_a?(::Array)
117
120
  dtype_slice = dtypes
118
121
  else
119
122
  raise ArgumentError, "dtype arg should be list or dict"
@@ -336,6 +339,7 @@ module Polars
336
339
  end
337
340
  alias_method :count, :height
338
341
  alias_method :length, :height
342
+ alias_method :size, :height
339
343
 
340
344
  # Get the width of the DataFrame.
341
345
  #
@@ -546,6 +550,13 @@ module Polars
546
550
  end
547
551
  alias_method :inspect, :to_s
548
552
 
553
+ # Returns an array representing the DataFrame
554
+ #
555
+ # @return [Array]
556
+ def to_a
557
+ rows(named: true)
558
+ end
559
+
549
560
  # Check if DataFrame includes column.
550
561
  #
551
562
  # @return [Boolean]
@@ -579,7 +590,7 @@ module Polars
579
590
 
580
591
  # df[2, ..] (select row as df)
581
592
  if row_selection.is_a?(Integer)
582
- if col_selection.is_a?(Array)
593
+ if col_selection.is_a?(::Array)
583
594
  df = self[0.., col_selection]
584
595
  return df.slice(row_selection, 1)
585
596
  end
@@ -600,7 +611,7 @@ module Polars
600
611
  return series[row_selection]
601
612
  end
602
613
 
603
- if col_selection.is_a?(Array)
614
+ if col_selection.is_a?(::Array)
604
615
  # df[.., [1, 2]]
605
616
  if Utils.is_int_sequence(col_selection)
606
617
  series_list = col_selection.map { |i| to_series(i) }
@@ -630,7 +641,7 @@ module Polars
630
641
  return Slice.new(self).apply(item)
631
642
  end
632
643
 
633
- if item.is_a?(Array) && item.all? { |v| Utils.strlike?(v) }
644
+ if item.is_a?(::Array) && item.all? { |v| Utils.strlike?(v) }
634
645
  # select multiple columns
635
646
  # df[["foo", "bar"]]
636
647
  return _from_rbdf(_df.select(item.map(&:to_s)))
@@ -655,7 +666,7 @@ module Polars
655
666
  end
656
667
 
657
668
  # Ruby-specific
658
- if item.is_a?(Expr)
669
+ if item.is_a?(Expr) || item.is_a?(Series)
659
670
  return filter(item)
660
671
  end
661
672
 
@@ -665,15 +676,42 @@ module Polars
665
676
  # Set item.
666
677
  #
667
678
  # @return [Object]
668
- #
669
- # def []=(key, value)
670
- # if key.is_a?(String)
671
- # raise TypeError, "'DataFrame' object does not support 'Series' assignment by index. Use 'DataFrame.with_columns'"
672
- # end
679
+ def []=(*key, value)
680
+ if key.length == 1
681
+ key = key.first
682
+ elsif key.length != 2
683
+ raise ArgumentError, "wrong number of arguments (given #{key.length + 1}, expected 2..3)"
684
+ end
673
685
 
674
- # raise Todo
675
- # end
686
+ if Utils.strlike?(key)
687
+ if value.is_a?(::Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
688
+ value = Series.new(value)
689
+ elsif !value.is_a?(Series)
690
+ value = Polars.lit(value)
691
+ end
692
+ self._df = with_column(value.alias(key.to_s))._df
693
+ elsif key.is_a?(::Array)
694
+ row_selection, col_selection = key
695
+
696
+ if Utils.strlike?(col_selection)
697
+ s = self[col_selection]
698
+ elsif col_selection.is_a?(Integer)
699
+ raise Todo
700
+ else
701
+ raise ArgumentError, "column selection not understood: #{col_selection}"
702
+ end
676
703
 
704
+ s[row_selection] = value
705
+
706
+ if col_selection.is_a?(Integer)
707
+ replace_at_idx(col_selection, s)
708
+ elsif Utils.strlike?(col_selection)
709
+ replace(col_selection, s)
710
+ end
711
+ else
712
+ raise Todo
713
+ end
714
+ end
677
715
 
678
716
  # Return the dataframe as a scalar.
679
717
  #
@@ -956,14 +994,21 @@ module Polars
956
994
  #
957
995
  # @return [nil]
958
996
  def write_ipc(file, compression: "uncompressed")
959
- if compression.nil?
960
- compression = "uncompressed"
997
+ return_bytes = file.nil?
998
+ if return_bytes
999
+ file = StringIO.new
1000
+ file.set_encoding(Encoding::BINARY)
961
1001
  end
962
1002
  if Utils.pathlike?(file)
963
1003
  file = Utils.normalise_filepath(file)
964
1004
  end
965
1005
 
1006
+ if compression.nil?
1007
+ compression = "uncompressed"
1008
+ end
1009
+
966
1010
  _df.write_ipc(file, compression)
1011
+ return_bytes ? file.string : nil
967
1012
  end
968
1013
 
969
1014
  # Write to Apache Parquet file.
@@ -1453,13 +1498,23 @@ module Polars
1453
1498
  # # │ 1 ┆ 6.0 ┆ a │
1454
1499
  # # └─────┴─────┴─────┘
1455
1500
  def sort(by, reverse: false, nulls_last: false)
1456
- if by.is_a?(Array) || by.is_a?(Expr)
1457
- lazy
1458
- .sort(by, reverse: reverse, nulls_last: nulls_last)
1459
- .collect(no_optimization: true, string_cache: false)
1460
- else
1461
- _from_rbdf(_df.sort(by, reverse, nulls_last))
1462
- end
1501
+ lazy
1502
+ .sort(by, reverse: reverse, nulls_last: nulls_last)
1503
+ .collect(no_optimization: true)
1504
+ end
1505
+
1506
+ # Sort the DataFrame by column in-place.
1507
+ #
1508
+ # @param by [String]
1509
+ # By which column to sort.
1510
+ # @param reverse [Boolean]
1511
+ # Reverse/descending sort.
1512
+ # @param nulls_last [Boolean]
1513
+ # Place null values last. Can only be used if sorted by a single column.
1514
+ #
1515
+ # @return [DataFrame]
1516
+ def sort!(by, reverse: false, nulls_last: false)
1517
+ self._df = sort(by, reverse: reverse, nulls_last: nulls_last)._df
1463
1518
  end
1464
1519
 
1465
1520
  # Check if DataFrame is equal to other.
@@ -1519,7 +1574,7 @@ module Polars
1519
1574
  # # │ 30 ┆ 6 │
1520
1575
  # # └─────┴─────┘
1521
1576
  def replace(column, new_col)
1522
- _df.replace(column, new_col._s)
1577
+ _df.replace(column.to_s, new_col._s)
1523
1578
  self
1524
1579
  end
1525
1580
 
@@ -1847,6 +1902,12 @@ module Polars
1847
1902
  # Define whether the temporal window interval is closed or not.
1848
1903
  # @param by [Object]
1849
1904
  # Also group by this column/these columns.
1905
+ # @param check_sorted [Boolean]
1906
+ # When the `by` argument is given, polars can not check sortedness
1907
+ # by the metadata and has to do a full scan on the index column to
1908
+ # verify data is sorted. This is expensive. If you are sure the
1909
+ # data within the by groups is sorted, you can set this to `false`.
1910
+ # Doing so incorrectly will lead to incorrect output.
1850
1911
  #
1851
1912
  # @return [RollingGroupBy]
1852
1913
  #
@@ -1860,7 +1921,7 @@ module Polars
1860
1921
  # "2020-01-08 23:16:43"
1861
1922
  # ]
1862
1923
  # df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
1863
- # Polars.col("dt").str.strptime(:datetime)
1924
+ # Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
1864
1925
  # )
1865
1926
  # df.groupby_rolling(index_column: "dt", period: "2d").agg(
1866
1927
  # [
@@ -1888,9 +1949,10 @@ module Polars
1888
1949
  period:,
1889
1950
  offset: nil,
1890
1951
  closed: "right",
1891
- by: nil
1952
+ by: nil,
1953
+ check_sorted: true
1892
1954
  )
1893
- RollingGroupBy.new(self, index_column, period, offset, closed, by)
1955
+ RollingGroupBy.new(self, index_column, period, offset, closed, by, check_sorted)
1894
1956
  end
1895
1957
 
1896
1958
  # Group based on a time value (or index value of type `:i32`, `:i64`).
@@ -2026,21 +2088,21 @@ module Polars
2026
2088
  # df.groupby_dynamic("time", every: "1h", closed: "left").agg(
2027
2089
  # [
2028
2090
  # Polars.col("time").count.alias("time_count"),
2029
- # Polars.col("time").list.alias("time_agg_list")
2091
+ # Polars.col("time").alias("time_agg_list")
2030
2092
  # ]
2031
2093
  # )
2032
2094
  # # =>
2033
2095
  # # shape: (4, 3)
2034
- # # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
2035
- # # │ time ┆ time_count ┆ time_agg_list │
2036
- # # │ --- ┆ --- ┆ --- │
2037
- # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
2038
- # # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
2039
- # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16... │
2040
- # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16... │
2041
- # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16... │
2042
- # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
2043
- # # └─────────────────────┴────────────┴─────────────────────────────────────┘
2096
+ # # ┌─────────────────────┬────────────┬───────────────────────────────────┐
2097
+ # # │ time ┆ time_count ┆ time_agg_list │
2098
+ # # │ --- ┆ --- ┆ --- │
2099
+ # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
2100
+ # # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
2101
+ # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │
2102
+ # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │
2103
+ # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │
2104
+ # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
2105
+ # # └─────────────────────┴────────────┴───────────────────────────────────┘
2044
2106
  #
2045
2107
  # @example When closed="both" the time values at the window boundaries belong to 2 groups.
2046
2108
  # df.groupby_dynamic("time", every: "1h", closed: "both").agg(
@@ -2107,7 +2169,7 @@ module Polars
2107
2169
  # period: "3i",
2108
2170
  # include_boundaries: true,
2109
2171
  # closed: "right"
2110
- # ).agg(Polars.col("A").list.alias("A_agg_list"))
2172
+ # ).agg(Polars.col("A").alias("A_agg_list"))
2111
2173
  # # =>
2112
2174
  # # shape: (3, 4)
2113
2175
  # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
@@ -2190,7 +2252,7 @@ module Polars
2190
2252
  # "groups" => ["A", "B", "A", "B"],
2191
2253
  # "values" => [0, 1, 2, 3]
2192
2254
  # }
2193
- # )
2255
+ # ).set_sorted("time")
2194
2256
  # df.upsample(
2195
2257
  # time_column: "time", every: "1mo", by: "groups", maintain_order: true
2196
2258
  # ).select(Polars.all.forward_fill)
@@ -2308,7 +2370,7 @@ module Polars
2308
2370
  # ], # note record date: Jan 1st (sorted!)
2309
2371
  # "gdp" => [4164, 4411, 4566, 4696]
2310
2372
  # }
2311
- # )
2373
+ # ).set_sorted("date")
2312
2374
  # population = Polars::DataFrame.new(
2313
2375
  # {
2314
2376
  # "date" => [
@@ -2319,7 +2381,7 @@ module Polars
2319
2381
  # ], # note record date: May 12th (sorted!)
2320
2382
  # "population" => [82.19, 82.66, 83.12, 83.52]
2321
2383
  # }
2322
- # )
2384
+ # ).set_sorted("date")
2323
2385
  # population.join_asof(
2324
2386
  # gdp, left_on: "date", right_on: "date", strategy: "backward"
2325
2387
  # )
@@ -2622,7 +2684,7 @@ module Polars
2622
2684
  # # │ 3 ┆ 8 ┆ c ┆ 30 │
2623
2685
  # # └─────┴─────┴─────┴───────┘
2624
2686
  def hstack(columns, in_place: false)
2625
- if !columns.is_a?(Array)
2687
+ if !columns.is_a?(::Array)
2626
2688
  columns = columns.get_columns
2627
2689
  end
2628
2690
  if in_place
@@ -2752,7 +2814,7 @@ module Polars
2752
2814
  # # │ 3 ┆ 8.0 │
2753
2815
  # # └─────┴─────┘
2754
2816
  def drop(columns)
2755
- if columns.is_a?(Array)
2817
+ if columns.is_a?(::Array)
2756
2818
  df = clone
2757
2819
  columns.each do |n|
2758
2820
  df._df.drop_in_place(n)
@@ -2791,6 +2853,16 @@ module Polars
2791
2853
  Utils.wrap_s(_df.drop_in_place(name))
2792
2854
  end
2793
2855
 
2856
+ # Drop a column in place if it exists.
2857
+ #
2858
+ # @param name [Object]
2859
+ # Column to drop.
2860
+ #
2861
+ # @return [Series, nil]
2862
+ def delete(name)
2863
+ drop_in_place(name) if include?(name)
2864
+ end
2865
+
2794
2866
  # Create an empty copy of the current DataFrame.
2795
2867
  #
2796
2868
  # Returns a DataFrame with identical schema but no data.
@@ -3202,7 +3274,7 @@ module Polars
3202
3274
  # # │ B ┆ 1 │
3203
3275
  # # │ C ┆ 2 │
3204
3276
  # # │ D ┆ 3 │
3205
- # # │
3277
+ # # │ E ┆ 4 │
3206
3278
  # # │ F ┆ 5 │
3207
3279
  # # │ G ┆ 6 │
3208
3280
  # # │ H ┆ 7 │
@@ -3255,7 +3327,7 @@ module Polars
3255
3327
  n_fill = n_cols * n_rows - height
3256
3328
 
3257
3329
  if n_fill > 0
3258
- if !fill_values.is_a?(Array)
3330
+ if !fill_values.is_a?(::Array)
3259
3331
  fill_values = [fill_values] * df.width
3260
3332
  end
3261
3333
 
@@ -3364,29 +3436,29 @@ module Polars
3364
3436
  # # ╞═════╪═════╪═════╡
3365
3437
  # # │ C ┆ 2 ┆ l │
3366
3438
  # # └─────┴─────┴─────┘}
3367
- def partition_by(groups, maintain_order: true, as_dict: false)
3439
+ def partition_by(groups, maintain_order: true, include_key: true, as_dict: false)
3368
3440
  if groups.is_a?(String)
3369
3441
  groups = [groups]
3370
- elsif !groups.is_a?(Array)
3442
+ elsif !groups.is_a?(::Array)
3371
3443
  groups = Array(groups)
3372
3444
  end
3373
3445
 
3374
3446
  if as_dict
3375
3447
  out = {}
3376
3448
  if groups.length == 1
3377
- _df.partition_by(groups, maintain_order).each do |df|
3449
+ _df.partition_by(groups, maintain_order, include_key).each do |df|
3378
3450
  df = _from_rbdf(df)
3379
3451
  out[df[groups][0, 0]] = df
3380
3452
  end
3381
3453
  else
3382
- _df.partition_by(groups, maintain_order).each do |df|
3454
+ _df.partition_by(groups, maintain_order, include_key).each do |df|
3383
3455
  df = _from_rbdf(df)
3384
3456
  out[df[groups].row(0)] = df
3385
3457
  end
3386
3458
  end
3387
3459
  out
3388
3460
  else
3389
- _df.partition_by(groups, maintain_order).map { |df| _from_rbdf(df) }
3461
+ _df.partition_by(groups, maintain_order, include_key).map { |df| _from_rbdf(df) }
3390
3462
  end
3391
3463
  end
3392
3464
 
@@ -3654,7 +3726,7 @@ module Polars
3654
3726
  # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
3655
3727
  # # └─────┴──────┴───────┴──────┴──────┴───────┘
3656
3728
  def with_columns(exprs)
3657
- if !exprs.nil? && !exprs.is_a?(Array)
3729
+ if !exprs.nil? && !exprs.is_a?(::Array)
3658
3730
  exprs = [exprs]
3659
3731
  end
3660
3732
  lazy
@@ -4035,11 +4107,11 @@ module Polars
4035
4107
  # # │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │
4036
4108
  # # │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │
4037
4109
  # # └───────┴───────┴───────┴───────┴───────┴───────┘
4038
- def to_dummies(columns: nil, separator: "_")
4110
+ def to_dummies(columns: nil, separator: "_", drop_first: false)
4039
4111
  if columns.is_a?(String)
4040
4112
  columns = [columns]
4041
4113
  end
4042
- _from_rbdf(_df.to_dummies(columns, separator))
4114
+ _from_rbdf(_df.to_dummies(columns, separator, drop_first))
4043
4115
  end
4044
4116
 
4045
4117
  # Drop duplicate rows from this DataFrame.
@@ -4127,7 +4199,7 @@ module Polars
4127
4199
  subset = [subset]
4128
4200
  end
4129
4201
 
4130
- if subset.is_a?(Array) && subset.length == 1
4202
+ if subset.is_a?(::Array) && subset.length == 1
4131
4203
  expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
4132
4204
  else
4133
4205
  struct_fields = subset.nil? ? Polars.all : subset
@@ -4428,7 +4500,7 @@ module Polars
4428
4500
  end
4429
4501
  end
4430
4502
 
4431
- # Returns an iterator over the DataFrame of rows of python-native values.
4503
+ # Returns an iterator over the DataFrame of rows of Ruby-native values.
4432
4504
  #
4433
4505
  # @param named [Boolean]
4434
4506
  # Return hashes instead of arrays. The hashes are a mapping of
@@ -4489,6 +4561,24 @@ module Polars
4489
4561
  end
4490
4562
  end
4491
4563
 
4564
+ # Returns an iterator over the DataFrame of rows of Ruby-native values.
4565
+ #
4566
+ # @param named [Boolean]
4567
+ # Return hashes instead of arrays. The hashes are a mapping of
4568
+ # column name to row value. This is more expensive than returning an
4569
+ # array, but allows for accessing values by column name.
4570
+ # @param buffer_size [Integer]
4571
+ # Determines the number of rows that are buffered internally while iterating
4572
+ # over the data; you should only modify this in very specific cases where the
4573
+ # default value is determined not to be a good fit to your access pattern, as
4574
+ # the speedup from using the buffer is significant (~2-4x). Setting this
4575
+ # value to zero disables row buffering.
4576
+ #
4577
+ # @return [Object]
4578
+ def each_row(named: true, buffer_size: 500, &block)
4579
+ iter_rows(named: named, buffer_size: buffer_size, &block)
4580
+ end
4581
+
4492
4582
  # Shrink DataFrame memory usage.
4493
4583
  #
4494
4584
  # Shrinks to fit the exact capacity needed to hold the data.
@@ -4678,6 +4768,38 @@ module Polars
4678
4768
  _from_rbdf(_df.unnest(names))
4679
4769
  end
4680
4770
 
4771
+ # TODO
4772
+ # def corr
4773
+ # end
4774
+
4775
+ # TODO
4776
+ # def merge_sorted
4777
+ # end
4778
+
4779
+ # Indicate that one or multiple columns are sorted.
4780
+ #
4781
+ # @param column [Object]
4782
+ # Columns that are sorted
4783
+ # @param more_columns [Object]
4784
+ # Additional columns that are sorted, specified as positional arguments.
4785
+ # @param descending [Boolean]
4786
+ # Whether the columns are sorted in descending order.
4787
+ #
4788
+ # @return [DataFrame]
4789
+ def set_sorted(
4790
+ column,
4791
+ *more_columns,
4792
+ descending: false
4793
+ )
4794
+ lazy
4795
+ .set_sorted(column, *more_columns, descending: descending)
4796
+ .collect(no_optimization: true)
4797
+ end
4798
+
4799
+ # TODO
4800
+ # def update
4801
+ # end
4802
+
4681
4803
  private
4682
4804
 
4683
4805
  def initialize_copy(other)
@@ -4742,20 +4864,63 @@ module Polars
4742
4864
  end
4743
4865
 
4744
4866
  # @private
4745
- def self.hash_to_rbdf(data, columns: nil)
4746
- if !columns.nil?
4747
- columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)
4867
+ def self.expand_hash_scalars(data, schema_overrides: nil, order: nil, nan_to_null: false)
4868
+ updated_data = {}
4869
+ unless data.empty?
4870
+ dtypes = schema_overrides || {}
4871
+ array_len = data.values.map { |val| Utils.arrlen(val) || 0 }.max
4872
+ if array_len > 0
4873
+ data.each do |name, val|
4874
+ dtype = dtypes[name]
4875
+ if val.is_a?(Hash) && dtype != Struct
4876
+ updated_data[name] = DataFrame.new(val).to_struct(name)
4877
+ elsif !Utils.arrlen(val).nil?
4878
+ updated_data[name] = Series.new(String.new(name), val, dtype: dtype)
4879
+ elsif val.nil? || [Integer, Float, TrueClass, FalseClass, String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
4880
+ dtype = Polars::Float64 if val.nil? && dtype.nil?
4881
+ updated_data[name] = Series.new(String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
4882
+ else
4883
+ raise Todo
4884
+ end
4885
+ end
4886
+ elsif data.values.all? { |val| Utils.arrlen(val) == 0 }
4887
+ data.each do |name, val|
4888
+ updated_data[name] = Series.new(name, val, dtype: dtypes[name])
4889
+ end
4890
+ elsif data.values.all? { |val| Utils.arrlen(val).nil? }
4891
+ data.each do |name, val|
4892
+ updated_data[name] = Series.new(name, [val], dtype: dtypes[name])
4893
+ end
4894
+ end
4895
+ end
4896
+ updated_data
4897
+ end
4748
4898
 
4749
- if data.empty? && dtypes
4750
- data_series = columns.map { |name| Series.new(name, [], dtype: dtypes[name])._s }
4751
- else
4752
- data_series = data.map { |name, values| Series.new(name, values, dtype: dtypes[name])._s }
4899
+ # @private
4900
+ def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, nan_to_null: nil)
4901
+ if schema.is_a?(Hash) && !data.empty?
4902
+ if !data.all? { |col, _| schema[col] }
4903
+ raise ArgumentError, "The given column-schema names do not match the data dictionary"
4753
4904
  end
4754
- data_series = _handle_columns_arg(data_series, columns: columns)
4755
- return RbDataFrame.new(data_series)
4905
+
4906
+ data = schema.to_h { |col| [col, data[col]] }
4756
4907
  end
4757
4908
 
4758
- RbDataFrame.read_hash(data)
4909
+ column_names, schema_overrides = _unpack_schema(
4910
+ schema, lookup_names: data.keys, schema_overrides: schema_overrides
4911
+ )
4912
+ if column_names.empty?
4913
+ column_names = data.keys
4914
+ end
4915
+
4916
+ if data.empty? && !schema_overrides.empty?
4917
+ data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], nan_to_null: nan_to_null)._s }
4918
+ else
4919
+ data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, nan_to_null: nan_to_null).values.map(&:_s)
4920
+ end
4921
+
4922
+ data_series = _handle_columns_arg(data_series, columns: column_names, from_hash: true)
4923
+ RbDataFrame.new(data_series)
4759
4924
  end
4760
4925
 
4761
4926
  # @private
@@ -4764,14 +4929,12 @@ module Polars
4764
4929
  end
4765
4930
 
4766
4931
  # @private
4767
- def self._unpack_columns(columns, schema_overrides: nil, lookup_names: nil, n_expected: nil)
4768
- raise Todo if schema_overrides
4769
-
4770
- if columns.is_a?(Hash)
4771
- columns = columns.to_a
4932
+ def self._unpack_schema(schema, schema_overrides: nil, n_expected: nil, lookup_names: nil, include_overrides_in_columns: false)
4933
+ if schema.is_a?(Hash)
4934
+ schema = schema.to_a
4772
4935
  end
4773
4936
  column_names =
4774
- (columns || []).map.with_index do |col, i|
4937
+ (schema || []).map.with_index do |col, i|
4775
4938
  if col.is_a?(String)
4776
4939
  col || "column_#{i}"
4777
4940
  else
@@ -4784,21 +4947,38 @@ module Polars
4784
4947
  # TODO zip_longest
4785
4948
  lookup = column_names.zip(lookup_names || []).to_h
4786
4949
 
4787
- [
4788
- column_names,
4789
- (columns || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
4950
+ column_dtypes =
4951
+ (schema || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
4790
4952
  [lookup[col[0]] || col[0], col[1]]
4791
4953
  end
4792
- ]
4954
+
4955
+ if schema_overrides
4956
+ raise Todo
4957
+ end
4958
+
4959
+ column_dtypes.each do |col, dtype|
4960
+ if !Utils.is_polars_dtype(dtype, include_unknown: true) && !dtype.nil?
4961
+ column_dtypes[col] = Utils.rb_type_to_dtype(dtype)
4962
+ end
4963
+ end
4964
+
4965
+ [column_names, column_dtypes]
4793
4966
  end
4794
4967
 
4795
- def self._handle_columns_arg(data, columns: nil)
4796
- if columns.nil?
4968
+ def self._handle_columns_arg(data, columns: nil, from_hash: false)
4969
+ if columns.nil? || columns.empty?
4797
4970
  data
4798
4971
  else
4799
4972
  if data.empty?
4800
4973
  columns.map { |c| Series.new(c, nil)._s }
4801
4974
  elsif data.length == columns.length
4975
+ if from_hash
4976
+ series_map = data.to_h { |s| [s.name, s] }
4977
+ if columns.all? { |col| series_map.key?(col) }
4978
+ return columns.map { |col| series_map[col] }
4979
+ end
4980
+ end
4981
+
4802
4982
  columns.each_with_index do |c, i|
4803
4983
  # not in-place?
4804
4984
  data[i].rename(c)
@@ -4813,7 +4993,7 @@ module Polars
4813
4993
  def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil)
4814
4994
  rbdf_columns = rbdf.columns
4815
4995
  rbdf_dtypes = rbdf.dtypes
4816
- columns, dtypes = _unpack_columns(
4996
+ columns, dtypes = _unpack_schema(
4817
4997
  (columns || rbdf_columns), schema_overrides: schema_overrides
4818
4998
  )
4819
4999
  column_subset = []
@@ -4829,7 +5009,7 @@ module Polars
4829
5009
  columns.each do |col, i|
4830
5010
  if dtypes[col] == Categorical # != rbdf_dtypes[i]
4831
5011
  column_casts << Polars.col(col).cast(Categorical)._rbexpr
4832
- elsif structs.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
5012
+ elsif structs&.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
4833
5013
  column_casts << Polars.col(col).cast(structs[col])._rbexpr
4834
5014
  elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
4835
5015
  column_casts << Polars.col(col).cast(dtypes[col])._rbexpr
@@ -4851,27 +5031,30 @@ module Polars
4851
5031
  end
4852
5032
 
4853
5033
  # @private
4854
- def self.sequence_to_rbdf(data, columns: nil, orient: nil, infer_schema_length: 50)
5034
+ def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 50)
5035
+ raise Todo if schema_overrides
5036
+ columns = schema
5037
+
4855
5038
  if data.length == 0
4856
- return hash_to_rbdf({}, columns: columns)
5039
+ return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
4857
5040
  end
4858
5041
 
4859
5042
  if data[0].is_a?(Series)
4860
5043
  # series_names = data.map(&:name)
4861
- # columns, dtypes = _unpack_columns(columns || series_names, n_expected: data.length)
5044
+ # columns, dtypes = _unpack_schema(columns || series_names, n_expected: data.length)
4862
5045
  data_series = []
4863
5046
  data.each do |s|
4864
5047
  data_series << s._s
4865
5048
  end
4866
5049
  elsif data[0].is_a?(Hash)
4867
- column_names, dtypes = _unpack_columns(columns)
5050
+ column_names, dtypes = _unpack_schema(columns)
4868
5051
  schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
4869
5052
  rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema_overrides)
4870
5053
  if column_names
4871
5054
  rbdf = _post_apply_columns(rbdf, column_names)
4872
5055
  end
4873
5056
  return rbdf
4874
- elsif data[0].is_a?(Array)
5057
+ elsif data[0].is_a?(::Array)
4875
5058
  if orient.nil? && !columns.nil?
4876
5059
  orient = columns.length == data.length ? "col" : "row"
4877
5060
  end
@@ -4890,11 +5073,21 @@ module Polars
4890
5073
  end
4891
5074
 
4892
5075
  # @private
4893
- def self.series_to_rbdf(data, columns: nil)
4894
- if columns
4895
- raise Todo
5076
+ def self.series_to_rbdf(data, schema: nil, schema_overrides: nil)
5077
+ data_series = [data._s]
5078
+ series_name = data_series.map(&:name)
5079
+ column_names, schema_overrides = _unpack_schema(
5080
+ schema || series_name, schema_overrides: schema_overrides, n_expected: 1
5081
+ )
5082
+ if schema_overrides.any?
5083
+ new_dtype = schema_overrides.values[0]
5084
+ if new_dtype != data.dtype
5085
+ data_series[0] = data_series[0].cast(new_dtype, true)
5086
+ end
4896
5087
  end
4897
- RbDataFrame.new([data._s])
5088
+
5089
+ data_series = _handle_columns_arg(data_series, columns: column_names)
5090
+ RbDataFrame.new(data_series)
4898
5091
  end
4899
5092
 
4900
5093
  def wrap_ldf(ldf)
@@ -4966,7 +5159,7 @@ module Polars
4966
5159
 
4967
5160
  def _prepare_other_arg(other)
4968
5161
  if !other.is_a?(Series)
4969
- if other.is_a?(Array)
5162
+ if other.is_a?(::Array)
4970
5163
  raise ArgumentError, "Operation not supported."
4971
5164
  end
4972
5165