polars-df 0.4.0-x86_64-linux → 0.6.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,7 +18,10 @@ module Polars
18
18
  # Whether to interpret two-dimensional data as columns or as rows. If `nil`,
19
19
  # the orientation is inferred by matching the columns and data dimensions. If
20
20
  # this does not yield conclusive results, column orientation is used.
21
- def initialize(data = nil, columns: nil, orient: nil)
21
+ def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
22
+ schema ||= columns
23
+ raise Todo if schema_overrides
24
+
22
25
  # TODO deprecate in favor of read_sql
23
26
  if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
24
27
  result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
@@ -29,14 +32,14 @@ module Polars
29
32
  end
30
33
 
31
34
  if data.nil?
32
- self._df = self.class.hash_to_rbdf({}, columns: columns)
35
+ self._df = self.class.hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
33
36
  elsif data.is_a?(Hash)
34
37
  data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
35
- self._df = self.class.hash_to_rbdf(data, columns: columns)
36
- elsif data.is_a?(Array)
37
- self._df = self.class.sequence_to_rbdf(data, columns: columns, orient: orient)
38
+ self._df = self.class.hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, nan_to_null: nan_to_null)
39
+ elsif data.is_a?(::Array)
40
+ self._df = self.class.sequence_to_rbdf(data, schema: schema, schema_overrides: schema_overrides, orient: orient, infer_schema_length: infer_schema_length)
38
41
  elsif data.is_a?(Series)
39
- self._df = self.class.series_to_rbdf(data, columns: columns)
42
+ self._df = self.class.series_to_rbdf(data, schema: schema, schema_overrides: schema_overrides)
40
43
  else
41
44
  raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
42
45
  end
@@ -56,8 +59,8 @@ module Polars
56
59
  end
57
60
 
58
61
  # @private
59
- def self._from_hash(data, columns: nil)
60
- _from_rbdf(hash_to_rbdf(data, columns: columns))
62
+ def self._from_hash(data, schema: nil, schema_overrides: nil)
63
+ _from_rbdf(hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides))
61
64
  end
62
65
 
63
66
  # def self._from_records
@@ -113,7 +116,7 @@ module Polars
113
116
  dtypes.each do|k, v|
114
117
  dtype_list << [k, Utils.rb_type_to_dtype(v)]
115
118
  end
116
- elsif dtypes.is_a?(Array)
119
+ elsif dtypes.is_a?(::Array)
117
120
  dtype_slice = dtypes
118
121
  else
119
122
  raise ArgumentError, "dtype arg should be list or dict"
@@ -336,6 +339,7 @@ module Polars
336
339
  end
337
340
  alias_method :count, :height
338
341
  alias_method :length, :height
342
+ alias_method :size, :height
339
343
 
340
344
  # Get the width of the DataFrame.
341
345
  #
@@ -546,6 +550,13 @@ module Polars
546
550
  end
547
551
  alias_method :inspect, :to_s
548
552
 
553
+ # Returns an array of the DataFrame's rows (the result of `rows(named: true)`).
554
+ #
555
+ # @return [Array]
556
+ def to_a
557
+ rows(named: true)
558
+ end
559
+
549
560
  # Check if DataFrame includes column.
550
561
  #
551
562
  # @return [Boolean]
@@ -579,7 +590,7 @@ module Polars
579
590
 
580
591
  # df[2, ..] (select row as df)
581
592
  if row_selection.is_a?(Integer)
582
- if col_selection.is_a?(Array)
593
+ if col_selection.is_a?(::Array)
583
594
  df = self[0.., col_selection]
584
595
  return df.slice(row_selection, 1)
585
596
  end
@@ -600,7 +611,7 @@ module Polars
600
611
  return series[row_selection]
601
612
  end
602
613
 
603
- if col_selection.is_a?(Array)
614
+ if col_selection.is_a?(::Array)
604
615
  # df[.., [1, 2]]
605
616
  if Utils.is_int_sequence(col_selection)
606
617
  series_list = col_selection.map { |i| to_series(i) }
@@ -630,7 +641,7 @@ module Polars
630
641
  return Slice.new(self).apply(item)
631
642
  end
632
643
 
633
- if item.is_a?(Array) && item.all? { |v| Utils.strlike?(v) }
644
+ if item.is_a?(::Array) && item.all? { |v| Utils.strlike?(v) }
634
645
  # select multiple columns
635
646
  # df[["foo", "bar"]]
636
647
  return _from_rbdf(_df.select(item.map(&:to_s)))
@@ -655,7 +666,7 @@ module Polars
655
666
  end
656
667
 
657
668
  # Ruby-specific
658
- if item.is_a?(Expr)
669
+ if item.is_a?(Expr) || item.is_a?(Series)
659
670
  return filter(item)
660
671
  end
661
672
 
@@ -665,15 +676,42 @@ module Polars
665
676
  # Set item.
666
677
  #
667
678
  # @return [Object]
668
- #
669
- # def []=(key, value)
670
- # if key.is_a?(String)
671
- # raise TypeError, "'DataFrame' object does not support 'Series' assignment by index. Use 'DataFrame.with_columns'"
672
- # end
679
+ def []=(*key, value)
680
+ if key.length == 1
681
+ key = key.first
682
+ elsif key.length != 2
683
+ raise ArgumentError, "wrong number of arguments (given #{key.length + 1}, expected 2..3)"
684
+ end
673
685
 
674
- # raise Todo
675
- # end
686
+ if Utils.strlike?(key)
687
+ if value.is_a?(::Array) || (defined?(Numo::NArray) && value.is_a?(Numo::NArray))
688
+ value = Series.new(value)
689
+ elsif !value.is_a?(Series)
690
+ value = Polars.lit(value)
691
+ end
692
+ self._df = with_column(value.alias(key.to_s))._df
693
+ elsif key.is_a?(::Array)
694
+ row_selection, col_selection = key
695
+
696
+ if Utils.strlike?(col_selection)
697
+ s = self[col_selection]
698
+ elsif col_selection.is_a?(Integer)
699
+ raise Todo
700
+ else
701
+ raise ArgumentError, "column selection not understood: #{col_selection}"
702
+ end
676
703
 
704
+ s[row_selection] = value
705
+
706
+ if col_selection.is_a?(Integer)
707
+ replace_at_idx(col_selection, s)
708
+ elsif Utils.strlike?(col_selection)
709
+ replace(col_selection, s)
710
+ end
711
+ else
712
+ raise Todo
713
+ end
714
+ end
677
715
 
678
716
  # Return the dataframe as a scalar.
679
717
  #
@@ -956,14 +994,21 @@ module Polars
956
994
  #
957
995
  # @return [nil]
958
996
  def write_ipc(file, compression: "uncompressed")
959
- if compression.nil?
960
- compression = "uncompressed"
997
+ return_bytes = file.nil?
998
+ if return_bytes
999
+ file = StringIO.new
1000
+ file.set_encoding(Encoding::BINARY)
961
1001
  end
962
1002
  if Utils.pathlike?(file)
963
1003
  file = Utils.normalise_filepath(file)
964
1004
  end
965
1005
 
1006
+ if compression.nil?
1007
+ compression = "uncompressed"
1008
+ end
1009
+
966
1010
  _df.write_ipc(file, compression)
1011
+ return_bytes ? file.string : nil
967
1012
  end
968
1013
 
969
1014
  # Write to Apache Parquet file.
@@ -1453,13 +1498,23 @@ module Polars
1453
1498
  # # │ 1 ┆ 6.0 ┆ a │
1454
1499
  # # └─────┴─────┴─────┘
1455
1500
  def sort(by, reverse: false, nulls_last: false)
1456
- if by.is_a?(Array) || by.is_a?(Expr)
1457
- lazy
1458
- .sort(by, reverse: reverse, nulls_last: nulls_last)
1459
- .collect(no_optimization: true, string_cache: false)
1460
- else
1461
- _from_rbdf(_df.sort(by, reverse, nulls_last))
1462
- end
1501
+ lazy
1502
+ .sort(by, reverse: reverse, nulls_last: nulls_last)
1503
+ .collect(no_optimization: true)
1504
+ end
1505
+
1506
+ # Sort the DataFrame by column in-place.
1507
+ #
1508
+ # @param by [String]
1509
+ # By which column to sort.
1510
+ # @param reverse [Boolean]
1511
+ # Reverse/descending sort.
1512
+ # @param nulls_last [Boolean]
1513
+ # Place null values last. Can only be used if sorted by a single column.
1514
+ #
1515
+ # @return [DataFrame]
1516
+ def sort!(by, reverse: false, nulls_last: false)
1517
+ self._df = sort(by, reverse: reverse, nulls_last: nulls_last)._df
1463
1518
  end
1464
1519
 
1465
1520
  # Check if DataFrame is equal to other.
@@ -1519,7 +1574,7 @@ module Polars
1519
1574
  # # │ 30 ┆ 6 │
1520
1575
  # # └─────┴─────┘
1521
1576
  def replace(column, new_col)
1522
- _df.replace(column, new_col._s)
1577
+ _df.replace(column.to_s, new_col._s)
1523
1578
  self
1524
1579
  end
1525
1580
 
@@ -1847,6 +1902,12 @@ module Polars
1847
1902
  # Define whether the temporal window interval is closed or not.
1848
1903
  # @param by [Object]
1849
1904
  # Also group by this column/these columns.
1905
+ # @param check_sorted [Boolean]
1906
+ # When the `by` argument is given, polars can not check sortedness
1907
+ # by the metadata and has to do a full scan on the index column to
1908
+ # verify data is sorted. This is expensive. If you are sure the
1909
+ # data within the by groups is sorted, you can set this to `false`.
1910
+ # Doing so incorrectly will lead to incorrect output.
1850
1911
  #
1851
1912
  # @return [RollingGroupBy]
1852
1913
  #
@@ -1860,7 +1921,7 @@ module Polars
1860
1921
  # "2020-01-08 23:16:43"
1861
1922
  # ]
1862
1923
  # df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
1863
- # Polars.col("dt").str.strptime(:datetime)
1924
+ # Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
1864
1925
  # )
1865
1926
  # df.groupby_rolling(index_column: "dt", period: "2d").agg(
1866
1927
  # [
@@ -1888,9 +1949,10 @@ module Polars
1888
1949
  period:,
1889
1950
  offset: nil,
1890
1951
  closed: "right",
1891
- by: nil
1952
+ by: nil,
1953
+ check_sorted: true
1892
1954
  )
1893
- RollingGroupBy.new(self, index_column, period, offset, closed, by)
1955
+ RollingGroupBy.new(self, index_column, period, offset, closed, by, check_sorted)
1894
1956
  end
1895
1957
 
1896
1958
  # Group based on a time value (or index value of type `:i32`, `:i64`).
@@ -2026,21 +2088,21 @@ module Polars
2026
2088
  # df.groupby_dynamic("time", every: "1h", closed: "left").agg(
2027
2089
  # [
2028
2090
  # Polars.col("time").count.alias("time_count"),
2029
- # Polars.col("time").list.alias("time_agg_list")
2091
+ # Polars.col("time").alias("time_agg_list")
2030
2092
  # ]
2031
2093
  # )
2032
2094
  # # =>
2033
2095
  # # shape: (4, 3)
2034
- # # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
2035
- # # │ time ┆ time_count ┆ time_agg_list │
2036
- # # │ --- ┆ --- ┆ --- │
2037
- # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
2038
- # # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
2039
- # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16... │
2040
- # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16... │
2041
- # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16... │
2042
- # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
2043
- # # └─────────────────────┴────────────┴─────────────────────────────────────┘
2096
+ # # ┌─────────────────────┬────────────┬───────────────────────────────────┐
2097
+ # # │ time ┆ time_count ┆ time_agg_list │
2098
+ # # │ --- ┆ --- ┆ --- │
2099
+ # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
2100
+ # # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
2101
+ # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │
2102
+ # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │
2103
+ # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │
2104
+ # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
2105
+ # # └─────────────────────┴────────────┴───────────────────────────────────┘
2044
2106
  #
2045
2107
  # @example When closed="both" the time values at the window boundaries belong to 2 groups.
2046
2108
  # df.groupby_dynamic("time", every: "1h", closed: "both").agg(
@@ -2107,7 +2169,7 @@ module Polars
2107
2169
  # period: "3i",
2108
2170
  # include_boundaries: true,
2109
2171
  # closed: "right"
2110
- # ).agg(Polars.col("A").list.alias("A_agg_list"))
2172
+ # ).agg(Polars.col("A").alias("A_agg_list"))
2111
2173
  # # =>
2112
2174
  # # shape: (3, 4)
2113
2175
  # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
@@ -2190,7 +2252,7 @@ module Polars
2190
2252
  # "groups" => ["A", "B", "A", "B"],
2191
2253
  # "values" => [0, 1, 2, 3]
2192
2254
  # }
2193
- # )
2255
+ # ).set_sorted("time")
2194
2256
  # df.upsample(
2195
2257
  # time_column: "time", every: "1mo", by: "groups", maintain_order: true
2196
2258
  # ).select(Polars.all.forward_fill)
@@ -2308,7 +2370,7 @@ module Polars
2308
2370
  # ], # note record date: Jan 1st (sorted!)
2309
2371
  # "gdp" => [4164, 4411, 4566, 4696]
2310
2372
  # }
2311
- # )
2373
+ # ).set_sorted("date")
2312
2374
  # population = Polars::DataFrame.new(
2313
2375
  # {
2314
2376
  # "date" => [
@@ -2319,7 +2381,7 @@ module Polars
2319
2381
  # ], # note record date: May 12th (sorted!)
2320
2382
  # "population" => [82.19, 82.66, 83.12, 83.52]
2321
2383
  # }
2322
- # )
2384
+ # ).set_sorted("date")
2323
2385
  # population.join_asof(
2324
2386
  # gdp, left_on: "date", right_on: "date", strategy: "backward"
2325
2387
  # )
@@ -2622,7 +2684,7 @@ module Polars
2622
2684
  # # │ 3 ┆ 8 ┆ c ┆ 30 │
2623
2685
  # # └─────┴─────┴─────┴───────┘
2624
2686
  def hstack(columns, in_place: false)
2625
- if !columns.is_a?(Array)
2687
+ if !columns.is_a?(::Array)
2626
2688
  columns = columns.get_columns
2627
2689
  end
2628
2690
  if in_place
@@ -2752,7 +2814,7 @@ module Polars
2752
2814
  # # │ 3 ┆ 8.0 │
2753
2815
  # # └─────┴─────┘
2754
2816
  def drop(columns)
2755
- if columns.is_a?(Array)
2817
+ if columns.is_a?(::Array)
2756
2818
  df = clone
2757
2819
  columns.each do |n|
2758
2820
  df._df.drop_in_place(n)
@@ -2791,6 +2853,16 @@ module Polars
2791
2853
  Utils.wrap_s(_df.drop_in_place(name))
2792
2854
  end
2793
2855
 
2856
+ # Drop a column in place if it exists.
2857
+ #
2858
+ # @param name [Object]
2859
+ # Column to drop.
2860
+ #
2861
+ # @return [Series, nil]
2862
+ def delete(name)
2863
+ drop_in_place(name) if include?(name)
2864
+ end
2865
+
2794
2866
  # Create an empty copy of the current DataFrame.
2795
2867
  #
2796
2868
  # Returns a DataFrame with identical schema but no data.
@@ -3202,7 +3274,7 @@ module Polars
3202
3274
  # # │ B ┆ 1 │
3203
3275
  # # │ C ┆ 2 │
3204
3276
  # # │ D ┆ 3 │
3205
- # # │
3277
+ # # │ E ┆ 4 │
3206
3278
  # # │ F ┆ 5 │
3207
3279
  # # │ G ┆ 6 │
3208
3280
  # # │ H ┆ 7 │
@@ -3255,7 +3327,7 @@ module Polars
3255
3327
  n_fill = n_cols * n_rows - height
3256
3328
 
3257
3329
  if n_fill > 0
3258
- if !fill_values.is_a?(Array)
3330
+ if !fill_values.is_a?(::Array)
3259
3331
  fill_values = [fill_values] * df.width
3260
3332
  end
3261
3333
 
@@ -3364,29 +3436,29 @@ module Polars
3364
3436
  # # ╞═════╪═════╪═════╡
3365
3437
  # # │ C ┆ 2 ┆ l │
3366
3438
  # # └─────┴─────┴─────┘}
3367
- def partition_by(groups, maintain_order: true, as_dict: false)
3439
+ def partition_by(groups, maintain_order: true, include_key: true, as_dict: false)
3368
3440
  if groups.is_a?(String)
3369
3441
  groups = [groups]
3370
- elsif !groups.is_a?(Array)
3442
+ elsif !groups.is_a?(::Array)
3371
3443
  groups = Array(groups)
3372
3444
  end
3373
3445
 
3374
3446
  if as_dict
3375
3447
  out = {}
3376
3448
  if groups.length == 1
3377
- _df.partition_by(groups, maintain_order).each do |df|
3449
+ _df.partition_by(groups, maintain_order, include_key).each do |df|
3378
3450
  df = _from_rbdf(df)
3379
3451
  out[df[groups][0, 0]] = df
3380
3452
  end
3381
3453
  else
3382
- _df.partition_by(groups, maintain_order).each do |df|
3454
+ _df.partition_by(groups, maintain_order, include_key).each do |df|
3383
3455
  df = _from_rbdf(df)
3384
3456
  out[df[groups].row(0)] = df
3385
3457
  end
3386
3458
  end
3387
3459
  out
3388
3460
  else
3389
- _df.partition_by(groups, maintain_order).map { |df| _from_rbdf(df) }
3461
+ _df.partition_by(groups, maintain_order, include_key).map { |df| _from_rbdf(df) }
3390
3462
  end
3391
3463
  end
3392
3464
 
@@ -3654,7 +3726,7 @@ module Polars
3654
3726
  # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
3655
3727
  # # └─────┴──────┴───────┴──────┴──────┴───────┘
3656
3728
  def with_columns(exprs)
3657
- if !exprs.nil? && !exprs.is_a?(Array)
3729
+ if !exprs.nil? && !exprs.is_a?(::Array)
3658
3730
  exprs = [exprs]
3659
3731
  end
3660
3732
  lazy
@@ -4035,11 +4107,11 @@ module Polars
4035
4107
  # # │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │
4036
4108
  # # │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │
4037
4109
  # # └───────┴───────┴───────┴───────┴───────┴───────┘
4038
- def to_dummies(columns: nil, separator: "_")
4110
+ def to_dummies(columns: nil, separator: "_", drop_first: false)
4039
4111
  if columns.is_a?(String)
4040
4112
  columns = [columns]
4041
4113
  end
4042
- _from_rbdf(_df.to_dummies(columns, separator))
4114
+ _from_rbdf(_df.to_dummies(columns, separator, drop_first))
4043
4115
  end
4044
4116
 
4045
4117
  # Drop duplicate rows from this DataFrame.
@@ -4127,7 +4199,7 @@ module Polars
4127
4199
  subset = [subset]
4128
4200
  end
4129
4201
 
4130
- if subset.is_a?(Array) && subset.length == 1
4202
+ if subset.is_a?(::Array) && subset.length == 1
4131
4203
  expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
4132
4204
  else
4133
4205
  struct_fields = subset.nil? ? Polars.all : subset
@@ -4428,7 +4500,7 @@ module Polars
4428
4500
  end
4429
4501
  end
4430
4502
 
4431
- # Returns an iterator over the DataFrame of rows of python-native values.
4503
+ # Returns an iterator over the DataFrame of rows of Ruby-native values.
4432
4504
  #
4433
4505
  # @param named [Boolean]
4434
4506
  # Return hashes instead of arrays. The hashes are a mapping of
@@ -4489,6 +4561,24 @@ module Polars
4489
4561
  end
4490
4562
  end
4491
4563
 
4564
+ # Returns an iterator over the DataFrame of rows of Ruby-native values.
4565
+ #
4566
+ # @param named [Boolean]
4567
+ # Return hashes instead of arrays. The hashes are a mapping of
4568
+ # column name to row value. This is more expensive than returning an
4569
+ # array, but allows for accessing values by column name.
4570
+ # @param buffer_size [Integer]
4571
+ # Determines the number of rows that are buffered internally while iterating
4572
+ # over the data; you should only modify this in very specific cases where the
4573
+ # default value is determined not to be a good fit to your access pattern, as
4574
+ # the speedup from using the buffer is significant (~2-4x). Setting this
4575
+ # value to zero disables row buffering.
4576
+ #
4577
+ # @return [Object]
4578
+ def each_row(named: true, buffer_size: 500, &block)
4579
+ iter_rows(named: named, buffer_size: buffer_size, &block)
4580
+ end
4581
+
4492
4582
  # Shrink DataFrame memory usage.
4493
4583
  #
4494
4584
  # Shrinks to fit the exact capacity needed to hold the data.
@@ -4678,6 +4768,38 @@ module Polars
4678
4768
  _from_rbdf(_df.unnest(names))
4679
4769
  end
4680
4770
 
4771
+ # TODO
4772
+ # def corr
4773
+ # end
4774
+
4775
+ # TODO
4776
+ # def merge_sorted
4777
+ # end
4778
+
4779
+ # Indicate that one or multiple columns are sorted.
4780
+ #
4781
+ # @param column [Object]
4782
+ # Columns that are sorted
4783
+ # @param more_columns [Object]
4784
+ # Additional columns that are sorted, specified as positional arguments.
4785
+ # @param descending [Boolean]
4786
+ # Whether the columns are sorted in descending order.
4787
+ #
4788
+ # @return [DataFrame]
4789
+ def set_sorted(
4790
+ column,
4791
+ *more_columns,
4792
+ descending: false
4793
+ )
4794
+ lazy
4795
+ .set_sorted(column, *more_columns, descending: descending)
4796
+ .collect(no_optimization: true)
4797
+ end
4798
+
4799
+ # TODO
4800
+ # def update
4801
+ # end
4802
+
4681
4803
  private
4682
4804
 
4683
4805
  def initialize_copy(other)
@@ -4742,20 +4864,63 @@ module Polars
4742
4864
  end
4743
4865
 
4744
4866
  # @private
4745
- def self.hash_to_rbdf(data, columns: nil)
4746
- if !columns.nil?
4747
- columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)
4867
+ def self.expand_hash_scalars(data, schema_overrides: nil, order: nil, nan_to_null: false)
4868
+ updated_data = {}
4869
+ unless data.empty?
4870
+ dtypes = schema_overrides || {}
4871
+ array_len = data.values.map { |val| Utils.arrlen(val) || 0 }.max
4872
+ if array_len > 0
4873
+ data.each do |name, val|
4874
+ dtype = dtypes[name]
4875
+ if val.is_a?(Hash) && dtype != Struct
4876
+ updated_data[name] = DataFrame.new(val).to_struct(name)
4877
+ elsif !Utils.arrlen(val).nil?
4878
+ updated_data[name] = Series.new(String.new(name), val, dtype: dtype)
4879
+ elsif val.nil? || [Integer, Float, TrueClass, FalseClass, String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
4880
+ dtype = Polars::Float64 if val.nil? && dtype.nil?
4881
+ updated_data[name] = Series.new(String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
4882
+ else
4883
+ raise Todo
4884
+ end
4885
+ end
4886
+ elsif data.values.all? { |val| Utils.arrlen(val) == 0 }
4887
+ data.each do |name, val|
4888
+ updated_data[name] = Series.new(name, val, dtype: dtypes[name])
4889
+ end
4890
+ elsif data.values.all? { |val| Utils.arrlen(val).nil? }
4891
+ data.each do |name, val|
4892
+ updated_data[name] = Series.new(name, [val], dtype: dtypes[name])
4893
+ end
4894
+ end
4895
+ end
4896
+ updated_data
4897
+ end
4748
4898
 
4749
- if data.empty? && dtypes
4750
- data_series = columns.map { |name| Series.new(name, [], dtype: dtypes[name])._s }
4751
- else
4752
- data_series = data.map { |name, values| Series.new(name, values, dtype: dtypes[name])._s }
4899
+ # @private
4900
+ def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, nan_to_null: nil)
4901
+ if schema.is_a?(Hash) && !data.empty?
4902
+ if !data.all? { |col, _| schema[col] }
4903
+ raise ArgumentError, "The given column-schema names do not match the data dictionary"
4753
4904
  end
4754
- data_series = _handle_columns_arg(data_series, columns: columns)
4755
- return RbDataFrame.new(data_series)
4905
+
4906
+ data = schema.to_h { |col| [col, data[col]] }
4756
4907
  end
4757
4908
 
4758
- RbDataFrame.read_hash(data)
4909
+ column_names, schema_overrides = _unpack_schema(
4910
+ schema, lookup_names: data.keys, schema_overrides: schema_overrides
4911
+ )
4912
+ if column_names.empty?
4913
+ column_names = data.keys
4914
+ end
4915
+
4916
+ if data.empty? && !schema_overrides.empty?
4917
+ data_series = column_names.map { |name| Series.new(name, [], dtype: schema_overrides[name], nan_to_null: nan_to_null)._s }
4918
+ else
4919
+ data_series = expand_hash_scalars(data, schema_overrides: schema_overrides, nan_to_null: nan_to_null).values.map(&:_s)
4920
+ end
4921
+
4922
+ data_series = _handle_columns_arg(data_series, columns: column_names, from_hash: true)
4923
+ RbDataFrame.new(data_series)
4759
4924
  end
4760
4925
 
4761
4926
  # @private
@@ -4764,14 +4929,12 @@ module Polars
4764
4929
  end
4765
4930
 
4766
4931
  # @private
4767
- def self._unpack_columns(columns, schema_overrides: nil, lookup_names: nil, n_expected: nil)
4768
- raise Todo if schema_overrides
4769
-
4770
- if columns.is_a?(Hash)
4771
- columns = columns.to_a
4932
+ def self._unpack_schema(schema, schema_overrides: nil, n_expected: nil, lookup_names: nil, include_overrides_in_columns: false)
4933
+ if schema.is_a?(Hash)
4934
+ schema = schema.to_a
4772
4935
  end
4773
4936
  column_names =
4774
- (columns || []).map.with_index do |col, i|
4937
+ (schema || []).map.with_index do |col, i|
4775
4938
  if col.is_a?(String)
4776
4939
  col || "column_#{i}"
4777
4940
  else
@@ -4784,21 +4947,38 @@ module Polars
4784
4947
  # TODO zip_longest
4785
4948
  lookup = column_names.zip(lookup_names || []).to_h
4786
4949
 
4787
- [
4788
- column_names,
4789
- (columns || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
4950
+ column_dtypes =
4951
+ (schema || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
4790
4952
  [lookup[col[0]] || col[0], col[1]]
4791
4953
  end
4792
- ]
4954
+
4955
+ if schema_overrides
4956
+ raise Todo
4957
+ end
4958
+
4959
+ column_dtypes.each do |col, dtype|
4960
+ if !Utils.is_polars_dtype(dtype, include_unknown: true) && !dtype.nil?
4961
+ column_dtypes[col] = Utils.rb_type_to_dtype(dtype)
4962
+ end
4963
+ end
4964
+
4965
+ [column_names, column_dtypes]
4793
4966
  end
4794
4967
 
4795
- def self._handle_columns_arg(data, columns: nil)
4796
- if columns.nil?
4968
+ def self._handle_columns_arg(data, columns: nil, from_hash: false)
4969
+ if columns.nil? || columns.empty?
4797
4970
  data
4798
4971
  else
4799
4972
  if data.empty?
4800
4973
  columns.map { |c| Series.new(c, nil)._s }
4801
4974
  elsif data.length == columns.length
4975
+ if from_hash
4976
+ series_map = data.to_h { |s| [s.name, s] }
4977
+ if columns.all? { |col| series_map.key?(col) }
4978
+ return columns.map { |col| series_map[col] }
4979
+ end
4980
+ end
4981
+
4802
4982
  columns.each_with_index do |c, i|
4803
4983
  # not in-place?
4804
4984
  data[i].rename(c)
@@ -4813,7 +4993,7 @@ module Polars
4813
4993
  def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil)
4814
4994
  rbdf_columns = rbdf.columns
4815
4995
  rbdf_dtypes = rbdf.dtypes
4816
- columns, dtypes = _unpack_columns(
4996
+ columns, dtypes = _unpack_schema(
4817
4997
  (columns || rbdf_columns), schema_overrides: schema_overrides
4818
4998
  )
4819
4999
  column_subset = []
@@ -4829,7 +5009,7 @@ module Polars
4829
5009
  columns.each do |col, i|
4830
5010
  if dtypes[col] == Categorical # != rbdf_dtypes[i]
4831
5011
  column_casts << Polars.col(col).cast(Categorical)._rbexpr
4832
- elsif structs.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
5012
+ elsif structs&.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
4833
5013
  column_casts << Polars.col(col).cast(structs[col])._rbexpr
4834
5014
  elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
4835
5015
  column_casts << Polars.col(col).cast(dtypes[col])._rbexpr
@@ -4851,27 +5031,30 @@ module Polars
4851
5031
  end
4852
5032
 
4853
5033
  # @private
4854
- def self.sequence_to_rbdf(data, columns: nil, orient: nil, infer_schema_length: 50)
5034
+ def self.sequence_to_rbdf(data, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 50)
5035
+ raise Todo if schema_overrides
5036
+ columns = schema
5037
+
4855
5038
  if data.length == 0
4856
- return hash_to_rbdf({}, columns: columns)
5039
+ return hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
4857
5040
  end
4858
5041
 
4859
5042
  if data[0].is_a?(Series)
4860
5043
  # series_names = data.map(&:name)
4861
- # columns, dtypes = _unpack_columns(columns || series_names, n_expected: data.length)
5044
+ # columns, dtypes = _unpack_schema(columns || series_names, n_expected: data.length)
4862
5045
  data_series = []
4863
5046
  data.each do |s|
4864
5047
  data_series << s._s
4865
5048
  end
4866
5049
  elsif data[0].is_a?(Hash)
4867
- column_names, dtypes = _unpack_columns(columns)
5050
+ column_names, dtypes = _unpack_schema(columns)
4868
5051
  schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
4869
5052
  rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema_overrides)
4870
5053
  if column_names
4871
5054
  rbdf = _post_apply_columns(rbdf, column_names)
4872
5055
  end
4873
5056
  return rbdf
4874
- elsif data[0].is_a?(Array)
5057
+ elsif data[0].is_a?(::Array)
4875
5058
  if orient.nil? && !columns.nil?
4876
5059
  orient = columns.length == data.length ? "col" : "row"
4877
5060
  end
@@ -4890,11 +5073,21 @@ module Polars
4890
5073
  end
4891
5074
 
4892
5075
  # @private
4893
- def self.series_to_rbdf(data, columns: nil)
4894
- if columns
4895
- raise Todo
5076
+ def self.series_to_rbdf(data, schema: nil, schema_overrides: nil)
5077
+ data_series = [data._s]
5078
+ series_name = data_series.map(&:name)
5079
+ column_names, schema_overrides = _unpack_schema(
5080
+ schema || series_name, schema_overrides: schema_overrides, n_expected: 1
5081
+ )
5082
+ if schema_overrides.any?
5083
+ new_dtype = schema_overrides.values[0]
5084
+ if new_dtype != data.dtype
5085
+ data_series[0] = data_series[0].cast(new_dtype, true)
5086
+ end
4896
5087
  end
4897
- RbDataFrame.new([data._s])
5088
+
5089
+ data_series = _handle_columns_arg(data_series, columns: column_names)
5090
+ RbDataFrame.new(data_series)
4898
5091
  end
4899
5092
 
4900
5093
  def wrap_ldf(ldf)
@@ -4966,7 +5159,7 @@ module Polars
4966
5159
 
4967
5160
  def _prepare_other_arg(other)
4968
5161
  if !other.is_a?(Series)
4969
- if other.is_a?(Array)
5162
+ if other.is_a?(::Array)
4970
5163
  raise ArgumentError, "Operation not supported."
4971
5164
  end
4972
5165