polars-df 0.6.0 → 0.8.0

Files changed (74)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +24 -0
  3. data/Cargo.lock +597 -599
  4. data/Cargo.toml +1 -0
  5. data/README.md +8 -7
  6. data/ext/polars/Cargo.toml +20 -10
  7. data/ext/polars/src/batched_csv.rs +27 -28
  8. data/ext/polars/src/conversion.rs +135 -106
  9. data/ext/polars/src/dataframe.rs +140 -131
  10. data/ext/polars/src/error.rs +0 -5
  11. data/ext/polars/src/expr/binary.rs +18 -6
  12. data/ext/polars/src/expr/categorical.rs +8 -1
  13. data/ext/polars/src/expr/datetime.rs +10 -12
  14. data/ext/polars/src/expr/general.rs +129 -286
  15. data/ext/polars/src/expr/list.rs +17 -9
  16. data/ext/polars/src/{expr.rs → expr/mod.rs} +4 -2
  17. data/ext/polars/src/expr/name.rs +44 -0
  18. data/ext/polars/src/expr/rolling.rs +201 -0
  19. data/ext/polars/src/expr/string.rs +94 -67
  20. data/ext/polars/src/file.rs +3 -3
  21. data/ext/polars/src/functions/aggregation.rs +35 -0
  22. data/ext/polars/src/functions/eager.rs +7 -31
  23. data/ext/polars/src/functions/io.rs +10 -10
  24. data/ext/polars/src/functions/lazy.rs +66 -41
  25. data/ext/polars/src/functions/meta.rs +30 -0
  26. data/ext/polars/src/functions/misc.rs +8 -0
  27. data/ext/polars/src/functions/mod.rs +5 -0
  28. data/ext/polars/src/functions/random.rs +6 -0
  29. data/ext/polars/src/functions/range.rs +41 -0
  30. data/ext/polars/src/functions/string_cache.rs +11 -0
  31. data/ext/polars/src/functions/whenthen.rs +7 -7
  32. data/ext/polars/src/lazyframe.rs +74 -60
  33. data/ext/polars/src/lib.rs +175 -91
  34. data/ext/polars/src/{apply → map}/dataframe.rs +29 -34
  35. data/ext/polars/src/{apply → map}/mod.rs +5 -5
  36. data/ext/polars/src/{apply → map}/series.rs +18 -22
  37. data/ext/polars/src/object.rs +0 -30
  38. data/ext/polars/src/on_startup.rs +32 -0
  39. data/ext/polars/src/rb_modules.rs +22 -7
  40. data/ext/polars/src/series/aggregation.rs +3 -0
  41. data/ext/polars/src/series/construction.rs +5 -5
  42. data/ext/polars/src/series/export.rs +4 -4
  43. data/ext/polars/src/{series.rs → series/mod.rs} +28 -45
  44. data/ext/polars/src/series/{set_at_idx.rs → scatter.rs} +38 -22
  45. data/ext/polars/src/sql.rs +46 -0
  46. data/ext/polars/src/utils.rs +1 -1
  47. data/lib/polars/config.rb +530 -0
  48. data/lib/polars/data_frame.rb +182 -145
  49. data/lib/polars/data_types.rb +4 -1
  50. data/lib/polars/date_time_expr.rb +23 -28
  51. data/lib/polars/date_time_name_space.rb +17 -37
  52. data/lib/polars/dynamic_group_by.rb +2 -2
  53. data/lib/polars/expr.rb +398 -110
  54. data/lib/polars/functions.rb +29 -37
  55. data/lib/polars/group_by.rb +38 -55
  56. data/lib/polars/io.rb +40 -5
  57. data/lib/polars/lazy_frame.rb +116 -89
  58. data/lib/polars/lazy_functions.rb +40 -68
  59. data/lib/polars/lazy_group_by.rb +7 -8
  60. data/lib/polars/list_expr.rb +12 -8
  61. data/lib/polars/list_name_space.rb +2 -2
  62. data/lib/polars/name_expr.rb +198 -0
  63. data/lib/polars/rolling_group_by.rb +2 -2
  64. data/lib/polars/series.rb +315 -43
  65. data/lib/polars/sql_context.rb +194 -0
  66. data/lib/polars/string_expr.rb +114 -60
  67. data/lib/polars/string_name_space.rb +19 -4
  68. data/lib/polars/struct_expr.rb +1 -1
  69. data/lib/polars/struct_name_space.rb +1 -1
  70. data/lib/polars/utils.rb +25 -13
  71. data/lib/polars/version.rb +1 -1
  72. data/lib/polars.rb +3 -0
  73. metadata +23 -11
  74. /data/ext/polars/src/{apply → map}/lazy.rs +0 -0
@@ -20,15 +20,9 @@ module Polars
  # this does not yield conclusive results, column orientation is used.
  def initialize(data = nil, schema: nil, columns: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
  schema ||= columns
- raise Todo if schema_overrides

- # TODO deprecate in favor of read_sql
  if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
- result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
- data = {}
- result.columns.each_with_index do |k, i|
- data[k] = result.rows.map { |r| r[i] }
- end
+ raise ArgumentError, "Use read_database instead"
  end

  if data.nil?
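Passing an ActiveRecord relation or result to DataFrame.new now raises. A minimal sketch of the replacement path; Polars.read_database is the method named by the new error message, and the exact argument forms shown here are assumptions:

  # hypothetical usage: load query results through the dedicated reader
  df = Polars.read_database(User.all)                 # ActiveRecord relation
  df = Polars.read_database("SELECT * FROM users")    # raw SQL string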
@@ -125,10 +119,10 @@ module Polars

  processed_null_values = Utils._process_null_values(null_values)

- if columns.is_a?(String)
+ if columns.is_a?(::String)
  columns = [columns]
  end
- if file.is_a?(String) && file.include?("*")
+ if file.is_a?(::String) && file.include?("*")
  dtypes_dict = nil
  if !dtype_list.nil?
  dtypes_dict = dtype_list.to_h
@@ -212,11 +206,11 @@ module Polars
  if Utils.pathlike?(source)
  source = Utils.normalise_filepath(source)
  end
- if columns.is_a?(String)
+ if columns.is_a?(::String)
  columns = [columns]
  end

- if source.is_a?(String) && source.include?("*") && Utils.local_file?(source)
+ if source.is_a?(::String) && source.include?("*") && Utils.local_file?(source)
  scan =
  Polars.scan_parquet(
  source,
@@ -275,11 +269,11 @@ module Polars
  if Utils.pathlike?(file)
  file = Utils.normalise_filepath(file)
  end
- if columns.is_a?(String)
+ if columns.is_a?(::String)
  columns = [columns]
  end

- if file.is_a?(String) && file.include?("*")
+ if file.is_a?(::String) && file.include?("*")
  raise Todo
  end

@@ -417,7 +411,7 @@ module Polars
  # }
  # )
  # df.dtypes
- # # => [Polars::Int64, Polars::Float64, Polars::Utf8]
+ # # => [Polars::Int64, Polars::Float64, Polars::String]
  def dtypes
  _df.dtypes
  end
@@ -435,7 +429,7 @@ module Polars
  # }
  # )
  # df.schema
- # # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::Utf8}
+ # # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String}
  def schema
  columns.zip(dtypes).to_h
  end
@@ -595,13 +589,13 @@ module Polars
  return df.slice(row_selection, 1)
  end
  # df[2, "a"]
- if col_selection.is_a?(String) || col_selection.is_a?(Symbol)
+ if col_selection.is_a?(::String) || col_selection.is_a?(Symbol)
  return self[col_selection][row_selection]
  end
  end

  # column selection can be "a" and ["a", "b"]
- if col_selection.is_a?(String) || col_selection.is_a?(Symbol)
+ if col_selection.is_a?(::String) || col_selection.is_a?(Symbol)
  col_selection = [col_selection]
  end

@@ -627,7 +621,7 @@ module Polars

  # select single column
  # df["foo"]
- if item.is_a?(String) || item.is_a?(Symbol)
+ if item.is_a?(::String) || item.is_a?(Symbol)
  return Utils.wrap_s(_df.column(item.to_s))
  end

@@ -653,7 +647,7 @@ module Polars

  if item.is_a?(Series)
  dtype = item.dtype
- if dtype == Utf8
+ if dtype == String
  return _from_rbdf(_df.select(item))
  elsif dtype == UInt32
  return _from_rbdf(_df.take_with_series(item._s))
@@ -704,7 +698,7 @@ module Polars
  s[row_selection] = value

  if col_selection.is_a?(Integer)
- replace_at_idx(col_selection, s)
+ replace_column(col_selection, s)
  elsif Utils.strlike?(col_selection)
  replace(col_selection, s)
  end
@@ -905,6 +899,7 @@ module Polars
  def write_csv(
  file = nil,
  has_header: true,
+ include_header: nil,
  sep: ",",
  quote: '"',
  batch_size: 1024,
@@ -914,6 +909,8 @@ module Polars
  float_precision: nil,
  null_value: nil
  )
+ include_header = has_header if include_header.nil?
+
  if sep.length > 1
  raise ArgumentError, "only single byte separator is allowed"
  elsif quote.length > 1
@@ -927,7 +924,7 @@ module Polars
  buffer.set_encoding(Encoding::BINARY)
  _df.write_csv(
  buffer,
- has_header,
+ include_header,
  sep.ord,
  quote.ord,
  batch_size,
@@ -946,7 +943,7 @@ module Polars

  _df.write_csv(
  file,
- has_header,
+ include_header,
  sep.ord,
  quote.ord,
  batch_size,
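A quick usage sketch of the new write_csv keyword (illustrative, not part of the diff); has_header still works, but include_header takes precedence when given:

  df = Polars::DataFrame.new({"a" => [1, 2], "b" => [3, 4]})
  df.write_csv("out.csv", include_header: true)   # same effect as the old has_header: true
  df.write_csv("out.csv", include_header: false)  # data rows only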
@@ -1151,22 +1148,8 @@ module Polars
  # # │ b ┆ 1 ┆ 2 ┆ 3 │
  # # └─────┴─────┴─────┴─────┘
  def transpose(include_header: false, header_name: "column", column_names: nil)
- df = _from_rbdf(_df.transpose(include_header, header_name))
- if !column_names.nil?
- names = []
- n = df.width
- if include_header
- names << header_name
- n -= 1
- end
-
- column_names = column_names.each
- n.times do
- names << column_names.next
- end
- df.columns = names
- end
- df
+ keep_names_as = include_header ? header_name : nil
+ _from_rbdf(_df.transpose(keep_names_as, column_names))
  end

  # Reverse the DataFrame.
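The Ruby-side renaming loop is gone; column_names is now handed straight to the native transpose. A brief sketch of the keyword arguments shown above (values are illustrative, and the intended public behavior appears unchanged):

  df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [4, 5, 6]})
  df.transpose(include_header: true, header_name: "field", column_names: ["x", "y", "z"])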
@@ -1239,7 +1222,7 @@ module Polars
  # @example
  # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
  # s = Polars::Series.new("baz", [97, 98, 99])
- # df.insert_at_idx(1, s)
+ # df.insert_column(1, s)
  # # =>
  # # shape: (3, 3)
  # # ┌─────┬─────┬─────┐
@@ -1261,7 +1244,7 @@ module Polars
  # }
  # )
  # s = Polars::Series.new("d", [-2.5, 15, 20.5, 0])
- # df.insert_at_idx(3, s)
+ # df.insert_column(3, s)
  # # =>
  # # shape: (4, 4)
  # # ┌─────┬──────┬───────┬──────┐
@@ -1274,13 +1257,14 @@ module Polars
  # # │ 3 ┆ 10.0 ┆ false ┆ 20.5 │
  # # │ 4 ┆ 13.0 ┆ true ┆ 0.0 │
  # # └─────┴──────┴───────┴──────┘
- def insert_at_idx(index, series)
+ def insert_column(index, series)
  if index < 0
  index = columns.length + index
  end
- _df.insert_at_idx(index, series._s)
+ _df.insert_column(index, series._s)
  self
  end
+ alias_method :insert_at_idx, :insert_column

  # Filter the rows in the DataFrame based on a predicate expression.
  #
@@ -1384,7 +1368,7 @@ module Polars
  ]
  )._df
  )
- summary.insert_at_idx(
+ summary.insert_column(
  0,
  Polars::Series.new(
  "describe",
@@ -1405,11 +1389,12 @@ module Polars
  # df = Polars::DataFrame.new(
  # {"foo" => [1, 2, 3], "bar" => [6, 7, 8], "ham" => ["a", "b", "c"]}
  # )
- # df.find_idx_by_name("ham")
+ # df.get_column_index("ham")
  # # => 2
- def find_idx_by_name(name)
- _df.find_idx_by_name(name)
+ def get_column_index(name)
+ _df.get_column_index(name)
  end
+ alias_method :find_idx_by_name, :get_column_index

  # Replace a column at an index location.
  #
@@ -1429,7 +1414,7 @@ module Polars
  # }
  # )
  # s = Polars::Series.new("apple", [10, 20, 30])
- # df.replace_at_idx(0, s)
+ # df.replace_column(0, s)
  # # =>
  # # shape: (3, 3)
  # # ┌───────┬─────┬─────┐
@@ -1441,13 +1426,14 @@ module Polars
  # # │ 20 ┆ 7 ┆ b │
  # # │ 30 ┆ 8 ┆ c │
  # # └───────┴─────┴─────┘
- def replace_at_idx(index, series)
+ def replace_column(index, series)
  if index < 0
  index = columns.length + index
  end
- _df.replace_at_idx(index, series._s)
+ _df.replace_column(index, series._s)
  self
  end
+ alias_method :replace_at_idx, :replace_column

  # Sort the DataFrame by column.
  #
@@ -1541,13 +1527,14 @@ module Polars
  # "ham" => ["c", "b", "a"]
  # }
  # )
- # df1.frame_equal(df1)
+ # df1.equals(df1)
  # # => true
- # df1.frame_equal(df2)
+ # df1.equals(df2)
  # # => false
- def frame_equal(other, null_equal: true)
- _df.frame_equal(other._df, null_equal)
+ def equals(other, null_equal: true)
+ _df.equals(other._df, null_equal)
  end
+ alias_method :frame_equal, :equals

  # Replace a column by a new Series.
  #
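frame_equal is kept as an alias of the new equals, so both spellings behave the same; a short sketch using df1 and df2 from the doc example above:

  df1.equals(df1)                       # => true
  df1.frame_equal(df2)                  # alias, => false
  df1.equals(df2, null_equal: false)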
@@ -1733,7 +1720,7 @@ module Polars
  # # │ 3 ┆ 8 ┆ c │
  # # └─────┴─────┴─────┘
  def drop_nulls(subset: nil)
- if subset.is_a?(String)
+ if subset.is_a?(::String)
  subset = [subset]
  end
  _from_rbdf(_df.drop_nulls(subset))
@@ -1811,13 +1798,13 @@ module Polars
  _from_rbdf(_df.with_row_count(name, offset))
  end

- # Start a groupby operation.
+ # Start a group by operation.
  #
  # @param by [Object]
  # Column(s) to group by.
  # @param maintain_order [Boolean]
  # Make sure that the order of the groups remain consistent. This is more
- # expensive than a default groupby. Note that this only works in expression
+ # expensive than a default group by. Note that this only works in expression
  # aggregations.
  #
  # @return [GroupBy]
@@ -1830,7 +1817,7 @@ module Polars
  # "c" => [6, 5, 4, 3, 2, 1]
  # }
  # )
- # df.groupby("a").agg(Polars.col("b").sum).sort("a")
+ # df.group_by("a").agg(Polars.col("b").sum).sort("a")
  # # =>
  # # shape: (3, 2)
  # # ┌─────┬─────┐
@@ -1842,25 +1829,26 @@ module Polars
  # # │ b ┆ 11 │
  # # │ c ┆ 6 │
  # # └─────┴─────┘
- def groupby(by, maintain_order: false)
+ def group_by(by, maintain_order: false)
  if !Utils.bool?(maintain_order)
- raise TypeError, "invalid input for groupby arg `maintain_order`: #{maintain_order}."
+ raise TypeError, "invalid input for group_by arg `maintain_order`: #{maintain_order}."
  end
  GroupBy.new(
- _df,
+ self,
  by,
- self.class,
  maintain_order: maintain_order
  )
  end
+ alias_method :groupby, :group_by
+ alias_method :group, :group_by

  # Create rolling groups based on a time column.
  #
  # Also works for index values of type `:i32` or `:i64`.
  #
- # Different from a `dynamic_groupby` the windows are now determined by the
+ # Different from a `dynamic_group_by` the windows are now determined by the
  # individual values and are not of constant intervals. For constant intervals use
- # *groupby_dynamic*
+ # *group_by_dynamic*
  #
  # The `period` and `offset` arguments are created either from a timedelta, or
  # by using the following string language:
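Both the old and the new spellings resolve to the same method after this change; a short usage sketch based on the frame from the doc example above:

  df.group_by("a", maintain_order: true).agg(Polars.col("b").sum)
  df.groupby("a").agg(Polars.col("b").sum)   # alias kept for compatibility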
@@ -1880,7 +1868,7 @@ module Polars
  # Or combine them:
  # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
  #
- # In case of a groupby_rolling on an integer column, the windows are defined by:
+ # In case of a group_by_rolling on an integer column, the windows are defined by:
  #
  # - **"1i" # length 1**
  # - **"10i" # length 10**
@@ -1891,7 +1879,7 @@ module Polars
  # This column must be sorted in ascending order. If not the output will not
  # make sense.
  #
- # In case of a rolling groupby on indices, dtype needs to be one of
+ # In case of a rolling group by on indices, dtype needs to be one of
  # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
  # performance matters use an `:i64` column.
  # @param period [Object]
@@ -1923,7 +1911,7 @@ module Polars
  # df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
  # Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
  # )
- # df.groupby_rolling(index_column: "dt", period: "2d").agg(
+ # df.group_by_rolling(index_column: "dt", period: "2d").agg(
  # [
  # Polars.sum("a").alias("sum_a"),
  # Polars.min("a").alias("min_a"),
@@ -1944,7 +1932,7 @@ module Polars
  # # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
  # # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
  # # └─────────────────────┴───────┴───────┴───────┘
- def groupby_rolling(
+ def group_by_rolling(
  index_column:,
  period:,
  offset: nil,
@@ -1954,11 +1942,12 @@ module Polars
  )
  RollingGroupBy.new(self, index_column, period, offset, closed, by, check_sorted)
  end
+ alias_method :groupby_rolling, :group_by_rolling

  # Group based on a time value (or index value of type `:i32`, `:i64`).
  #
  # Time windows are calculated and rows are assigned to windows. Different from a
- # normal groupby is that a row can be member of multiple groups. The time/index
+ # normal group by is that a row can be member of multiple groups. The time/index
  # window could be seen as a rolling window, with a window size determined by
  # dates/times/values instead of slots in the DataFrame.
  #
@@ -1986,7 +1975,7 @@ module Polars
  # Or combine them:
  # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
  #
- # In case of a groupby_dynamic on an integer column, the windows are defined by:
+ # In case of a group_by_dynamic on an integer column, the windows are defined by:
  #
  # - "1i" # length 1
  # - "10i" # length 10
@@ -1997,7 +1986,7 @@ module Polars
  # This column must be sorted in ascending order. If not the output will not
  # make sense.
  #
- # In case of a dynamic groupby on indices, dtype needs to be one of
+ # In case of a dynamic group by on indices, dtype needs to be one of
  # `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
  # performance matters use an `:i64` column.
  # @param every
@@ -2048,7 +2037,7 @@ module Polars
  # # └─────────────────────┴─────┘
  #
  # @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
- # df.groupby_dynamic("time", every: "1h", closed: "right").agg(
+ # df.group_by_dynamic("time", every: "1h", closed: "right").agg(
  # [
  # Polars.col("time").min.alias("time_min"),
  # Polars.col("time").max.alias("time_max")
@@ -2068,7 +2057,7 @@ module Polars
  # # └─────────────────────┴─────────────────────┴─────────────────────┘
  #
  # @example The window boundaries can also be added to the aggregation result.
- # df.groupby_dynamic(
+ # df.group_by_dynamic(
  # "time", every: "1h", include_boundaries: true, closed: "right"
  # ).agg([Polars.col("time").count.alias("time_count")])
  # # =>
@@ -2085,7 +2074,7 @@ module Polars
  # # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
  #
  # @example When closed="left", should not include right end of interval.
- # df.groupby_dynamic("time", every: "1h", closed: "left").agg(
+ # df.group_by_dynamic("time", every: "1h", closed: "left").agg(
  # [
  # Polars.col("time").count.alias("time_count"),
  # Polars.col("time").alias("time_agg_list")
@@ -2105,7 +2094,7 @@ module Polars
  # # └─────────────────────┴────────────┴───────────────────────────────────┘
  #
  # @example When closed="both" the time values at the window boundaries belong to 2 groups.
- # df.groupby_dynamic("time", every: "1h", closed: "both").agg(
+ # df.group_by_dynamic("time", every: "1h", closed: "both").agg(
  # [Polars.col("time").count.alias("time_count")]
  # )
  # # =>
@@ -2122,7 +2111,7 @@ module Polars
  # # │ 2021-12-16 03:00:00 ┆ 1 │
  # # └─────────────────────┴────────────┘
  #
- # @example Dynamic groupbys can also be combined with grouping on normal keys.
+ # @example Dynamic group bys can also be combined with grouping on normal keys.
  # df = Polars::DataFrame.new(
  # {
  # "time" => Polars.date_range(
@@ -2133,7 +2122,7 @@ module Polars
  # "groups" => ["a", "a", "a", "b", "b", "a", "a"]
  # }
  # )
- # df.groupby_dynamic(
+ # df.group_by_dynamic(
  # "time",
  # every: "1h",
  # closed: "both",
@@ -2156,14 +2145,14 @@ module Polars
  # # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
  # # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
  #
- # @example Dynamic groupby on an index column.
+ # @example Dynamic group by on an index column.
  # df = Polars::DataFrame.new(
  # {
  # "idx" => Polars.arange(0, 6, eager: true),
  # "A" => ["A", "A", "B", "B", "B", "C"]
  # }
  # )
- # df.groupby_dynamic(
+ # df.group_by_dynamic(
  # "idx",
  # every: "2i",
  # period: "3i",
@@ -2181,7 +2170,7 @@ module Polars
  # # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
  # # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
  # # └─────────────────┴─────────────────┴─────┴─────────────────┘
- def groupby_dynamic(
+ def group_by_dynamic(
  index_column,
  every:,
  period: nil,
@@ -2205,6 +2194,7 @@ module Polars
  start_by
  )
  end
+ alias_method :groupby_dynamic, :group_by_dynamic

  # Upsample a DataFrame at a regular frequency.
  #
@@ -2281,7 +2271,7 @@ module Polars
  if by.nil?
  by = []
  end
- if by.is_a?(String)
+ if by.is_a?(::String)
  by = [by]
  end
  if offset.nil?
@@ -2475,17 +2465,17 @@ module Polars
  # @example
  # df.join(other_df, on: "ham", how: "outer")
  # # =>
- # # shape: (4, 4)
- # # ┌──────┬──────┬─────┬───────┐
- # # │ foo ┆ bar ┆ ham ┆ apple │
- # # │ --- ┆ --- ┆ --- ┆ --- │
- # # │ i64 ┆ f64 ┆ str ┆ str │
- # # ╞══════╪══════╪═════╪═══════╡
- # # │ 1 ┆ 6.0 ┆ a ┆ x │
- # # │ 2 ┆ 7.0 ┆ b ┆ y │
- # # │ null ┆ null ┆ d ┆ z │
- # # │ 3 ┆ 8.0 ┆ c ┆ null │
- # # └──────┴──────┴─────┴───────┘
+ # # shape: (4, 5)
+ # # ┌──────┬──────┬──────┬───────┬───────────┐
+ # # │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
+ # # │ i64 ┆ f64 ┆ str ┆ str ┆ str │
+ # # ╞══════╪══════╪══════╪═══════╪═══════════╡
+ # # │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │
+ # # │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │
+ # # │ null ┆ null ┆ null ┆ z ┆ d │
+ # # │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │
+ # # └──────┴──────┴──────┴───────┴───────────┘
  #
  # @example
  # df.join(other_df, on: "ham", how: "left")
@@ -3125,17 +3115,17 @@ module Polars
  sort_columns: false,
  separator: "_"
  )
- if values.is_a?(String)
+ if values.is_a?(::String)
  values = [values]
  end
- if index.is_a?(String)
+ if index.is_a?(::String)
  index = [index]
  end
- if columns.is_a?(String)
+ if columns.is_a?(::String)
  columns = [columns]
  end

- if aggregate_fn.is_a?(String)
+ if aggregate_fn.is_a?(::String)
  case aggregate_fn
  when "first"
  aggregate_expr = Polars.element.first._rbexpr
@@ -3220,10 +3210,10 @@ module Polars
  # # │ z ┆ c ┆ 6 │
  # # └─────┴──────────┴───────┘
  def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
- if value_vars.is_a?(String)
+ if value_vars.is_a?(::String)
  value_vars = [value_vars]
  end
- if id_vars.is_a?(String)
+ if id_vars.is_a?(::String)
  id_vars = [id_vars]
  end
  if value_vars.nil?
@@ -3437,7 +3427,7 @@ module Polars
  # # │ C ┆ 2 ┆ l │
  # # └─────┴─────┴─────┘}
  def partition_by(groups, maintain_order: true, include_key: true, as_dict: false)
- if groups.is_a?(String)
+ if groups.is_a?(::String)
  groups = [groups]
  elsif !groups.is_a?(::Array)
  groups = Array(groups)
@@ -3464,8 +3454,10 @@ module Polars

  # Shift values by the given period.
  #
- # @param periods [Integer]
+ # @param n [Integer]
  # Number of places to shift (may be negative).
+ # @param fill_value [Object]
+ # Fill the resulting null values with this value.
  #
  # @return [DataFrame]
  #
@@ -3503,8 +3495,8 @@ module Polars
  # # │ 3 ┆ 8 ┆ c │
  # # │ null ┆ null ┆ null │
  # # └──────┴──────┴──────┘
- def shift(periods)
- _from_rbdf(_df.shift(periods))
+ def shift(n, fill_value: nil)
+ lazy.shift(n, fill_value: fill_value).collect(_eager: true)
  end

  # Shift the values by a given period and fill the resulting null values.
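shift now goes through the lazy engine and accepts fill_value directly, which is also what shift_and_fill delegates to below. A minimal sketch:

  df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [6, 7, 8]})
  df.shift(1)                  # nulls pushed in at the top
  df.shift(-1, fill_value: 0)  # shift up and fill the gap with 0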
@@ -3537,9 +3529,7 @@ module Polars
  # # │ 2 ┆ 7 ┆ b │
  # # └─────┴─────┴─────┘
  def shift_and_fill(periods, fill_value)
- lazy
- .shift_and_fill(periods, fill_value)
- .collect(no_optimization: true, string_cache: false)
+ shift(periods, fill_value: fill_value)
  end

  # Get a mask of all duplicated rows in this DataFrame.
@@ -3788,9 +3778,9 @@ module Polars
  # # └─────┴─────┴─────┘
  def max(axis: 0)
  if axis == 0
- _from_rbdf(_df.max)
+ lazy.max.collect(_eager: true)
  elsif axis == 1
- Utils.wrap_s(_df.hmax)
+ Utils.wrap_s(_df.max_horizontal)
  else
  raise ArgumentError, "Axis should be 0 or 1."
  end
@@ -3820,9 +3810,9 @@ module Polars
  # # └─────┴─────┴─────┘
  def min(axis: 0)
  if axis == 0
- _from_rbdf(_df.min)
+ lazy.min.collect(_eager: true)
  elsif axis == 1
- Utils.wrap_s(_df.hmin)
+ Utils.wrap_s(_df.min_horizontal)
  else
  raise ArgumentError, "Axis should be 0 or 1."
  end
@@ -3869,9 +3859,9 @@ module Polars
  def sum(axis: 0, null_strategy: "ignore")
  case axis
  when 0
- _from_rbdf(_df.sum)
+ lazy.sum.collect(_eager: true)
  when 1
- Utils.wrap_s(_df.hsum(null_strategy))
+ Utils.wrap_s(_df.sum_horizontal(null_strategy))
  else
  raise ArgumentError, "Axis should be 0 or 1."
  end
@@ -3907,9 +3897,9 @@ module Polars
  def mean(axis: 0, null_strategy: "ignore")
  case axis
  when 0
- _from_rbdf(_df.mean)
+ lazy.mean.collect(_eager: true)
  when 1
- Utils.wrap_s(_df.hmean(null_strategy))
+ Utils.wrap_s(_df.mean_horizontal(null_strategy))
  else
  raise ArgumentError, "Axis should be 0 or 1."
  end
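With axis: 0 these aggregations are now routed through the lazy engine, while axis: 1 maps to the new *_horizontal bindings; the public calls are unchanged. A brief sketch:

  df = Polars::DataFrame.new({"a" => [1, 2], "b" => [3, 4]})
  df.sum                                     # column-wise totals (one-row DataFrame)
  df.sum(axis: 1)                            # row-wise totals (Series)
  df.mean(axis: 1, null_strategy: "ignore")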
@@ -3953,7 +3943,7 @@ module Polars
  # # │ 0.816497 ┆ 0.816497 ┆ null │
  # # └──────────┴──────────┴──────┘
  def std(ddof: 1)
- _from_rbdf(_df.std(ddof))
+ lazy.std(ddof: ddof).collect(_eager: true)
  end

  # Aggregate the columns of this DataFrame to their variance value.
@@ -3994,7 +3984,7 @@ module Polars
  # # │ 0.666667 ┆ 0.666667 ┆ null │
  # # └──────────┴──────────┴──────┘
  def var(ddof: 1)
- _from_rbdf(_df.var(ddof))
+ lazy.var(ddof: ddof).collect(_eager: true)
  end

  # Aggregate the columns of this DataFrame to their median value.
@@ -4020,7 +4010,7 @@ module Polars
  # # │ 2.0 ┆ 7.0 ┆ null │
  # # └─────┴─────┴──────┘
  def median
- _from_rbdf(_df.median)
+ lazy.median.collect(_eager: true)
  end

  # Aggregate the columns of this DataFrame to their product values.
@@ -4077,7 +4067,7 @@ module Polars
  # # │ 2.0 ┆ 7.0 ┆ null │
  # # └─────┴─────┴──────┘
  def quantile(quantile, interpolation: "nearest")
- _from_rbdf(_df.quantile(quantile, interpolation))
+ lazy.quantile(quantile, interpolation: interpolation).collect(_eager: true)
  end

  # Get one hot encoded dummy variables.
@@ -4108,7 +4098,7 @@ module Polars
  # # │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │
  # # └───────┴───────┴───────┴───────┴───────┴───────┘
  def to_dummies(columns: nil, separator: "_", drop_first: false)
- if columns.is_a?(String)
+ if columns.is_a?(::String)
  columns = [columns]
  end
  _from_rbdf(_df.to_dummies(columns, separator, drop_first))
@@ -4294,15 +4284,20 @@ module Polars
  end

  if n.nil? && !frac.nil?
+ frac = Series.new("frac", [frac]) unless frac.is_a?(Series)
+
  _from_rbdf(
- _df.sample_frac(frac, with_replacement, shuffle, seed)
+ _df.sample_frac(frac._s, with_replacement, shuffle, seed)
  )
  end

  if n.nil?
  n = 1
  end
- _from_rbdf(_df.sample_n(n, with_replacement, shuffle, seed))
+
+ n = Series.new("", [n]) unless n.is_a?(Series)
+
+ _from_rbdf(_df.sample_n(n._s, with_replacement, shuffle, seed))
  end

  # Apply a horizontal reduction on a DataFrame.
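The native sample_n/sample_frac bindings now expect a Series, so scalar n/frac values are wrapped before the call. The public call is unchanged; a sketch where the keyword names n:, frac: and seed: are taken from the surrounding method rather than this hunk:

  df.sample(n: 2, seed: 0)
  df.sample(frac: 0.5, with_replacement: true, seed: 0)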
@@ -4601,7 +4596,7 @@ module Polars
  #
  # @example
  # s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]})
- # s.take_every(2)
+ # s.gather_every(2)
  # # =>
  # # shape: (2, 2)
  # # ┌─────┬─────┐
@@ -4612,9 +4607,10 @@ module Polars
  # # │ 1 ┆ 5 │
  # # │ 3 ┆ 7 │
  # # └─────┴─────┘
- def take_every(n)
- select(Utils.col("*").take_every(n))
+ def gather_every(n, offset = 0)
+ select(Utils.col("*").gather_every(n, offset))
  end
+ alias_method :take_every, :gather_every

  # Hash and combine the rows in this DataFrame.
  #
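gather_every also gains an optional offset, and take_every remains as an alias; a short sketch using the frame from the doc example above:

  s.gather_every(2)      # rows 0 and 2
  s.gather_every(2, 1)   # start at row 1: rows 1 and 3
  s.take_every(2)        # alias kept for compatibility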
@@ -4671,16 +4667,16 @@ module Polars
  # df.interpolate
  # # =>
  # # shape: (4, 3)
- # # ┌─────┬──────┬─────┐
- # # │ foo ┆ bar ┆ baz │
- # # │ --- ┆ --- ┆ --- │
- # # │ i64 ┆ i64 ┆ i64 │
- # # ╞═════╪══════╪═════╡
- # # │ 1 ┆ 6 ┆ 1 │
- # # │ 5 ┆ 7 ┆ 3 │
- # # │ 9 ┆ 9 ┆ 6 │
- # # │ 10 ┆ null ┆ 9 │
- # # └─────┴──────┴─────┘
+ # # ┌──────┬──────┬──────────┐
+ # # │ foo ┆ bar ┆ baz │
+ # # │ --- ┆ --- ┆ --- │
+ # # │ f64 ┆ f64 ┆ f64 │
+ # # ╞══════╪══════╪══════════╡
+ # # │ 1.0 ┆ 6.0 ┆ 1.0 │
+ # # │ 5.0 ┆ 7.0 ┆ 3.666667 │
+ # # │ 9.0 ┆ 9.0 ┆ 6.333333 │
+ # # │ 10.0 ┆ null ┆ 9.0 │
+ # # └──────┴──────┴──────────┘
  def interpolate
  select(Utils.col("*").interpolate)
  end
@@ -4762,7 +4758,7 @@ module Polars
  # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
  # # └────────┴─────┴─────┴──────┴───────────┴───────┘
  def unnest(names)
- if names.is_a?(String)
+ if names.is_a?(::String)
  names = [names]
  end
  _from_rbdf(_df.unnest(names))
@@ -4875,10 +4871,10 @@ module Polars
  if val.is_a?(Hash) && dtype != Struct
  updated_data[name] = DataFrame.new(val).to_struct(name)
  elsif !Utils.arrlen(val).nil?
- updated_data[name] = Series.new(String.new(name), val, dtype: dtype)
- elsif val.nil? || [Integer, Float, TrueClass, FalseClass, String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
+ updated_data[name] = Series.new(::String.new(name), val, dtype: dtype)
+ elsif val.nil? || [Integer, Float, TrueClass, FalseClass, ::String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
  dtype = Polars::Float64 if val.nil? && dtype.nil?
- updated_data[name] = Series.new(String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
+ updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
  else
  raise Todo
  end
@@ -4935,7 +4931,7 @@ module Polars
  end
  column_names =
  (schema || []).map.with_index do |col, i|
- if col.is_a?(String)
+ if col.is_a?(::String)
  col || "column_#{i}"
  else
  col[0]
@@ -4948,12 +4944,12 @@ module Polars
  lookup = column_names.zip(lookup_names || []).to_h

  column_dtypes =
- (schema || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
+ (schema || []).select { |col| !col.is_a?(::String) && col[1] }.to_h do |col|
  [lookup[col[0]] || col[0], col[1]]
  end

- if schema_overrides
- raise Todo
+ if schema_overrides && schema_overrides.any?
+ column_dtypes.merge!(schema_overrides)
  end

  column_dtypes.each do |col, dtype|
@@ -5056,13 +5052,54 @@ module Polars
  return rbdf
  elsif data[0].is_a?(::Array)
  if orient.nil? && !columns.nil?
- orient = columns.length == data.length ? "col" : "row"
+ first_element = data[0]
+ row_types = first_element.filter_map { |value| value.class }.uniq
+ if row_types.include?(Integer) && row_types.include?(Float)
+ row_types.delete(Integer)
+ end
+ orient = row_types.length == 1 ? "col" : "row"
  end

  if orient == "row"
- raise Todo
+ column_names, schema_overrides = _unpack_schema(
+ schema, schema_overrides: schema_overrides, n_expected: first_element.length
+ )
+ local_schema_override = (
+ schema_overrides.any? ? (raise Todo) : {}
+ )
+ if column_names.any? && first_element.length > 0 && first_element.length != column_names.length
+ raise ArgumentError, "the row data does not match the number of columns"
+ end
+
+ unpack_nested = false
+ local_schema_override.each do |col, tp|
+ raise Todo
+ end
+
+ if unpack_nested
+ raise Todo
+ else
+ rbdf = RbDataFrame.read_rows(
+ data,
+ infer_schema_length,
+ local_schema_override.any? ? local_schema_override : nil
+ )
+ end
+ if column_names.any? || schema_overrides.any?
+ rbdf = _post_apply_columns(
+ rbdf, column_names, schema_overrides: schema_overrides
+ )
+ end
+ return rbdf
  elsif orient == "col" || orient.nil?
- raise Todo
+ column_names, schema_overrides = _unpack_schema(
+ schema, schema_overrides: schema_overrides, n_expected: data.length
+ )
+ data_series =
+ data.map.with_index do |element, i|
+ Series.new(column_names[i], element, dtype: schema_overrides[column_names[i]])._s
+ end
+ return RbDataFrame.new(data_series)
  else
  raise ArgumentError, "orient must be one of {{'col', 'row', nil}}, got #{orient} instead."
  end
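Row- and column-oriented array input is now handled instead of raising Todo, and when orient is not given it is inferred from the value types of the first row. A short sketch, assuming schema accepts a plain array of column names here; the literal values are illustrative:

  # one inner array per row
  Polars::DataFrame.new([[1, "a"], [2, "b"]], schema: ["x", "y"], orient: "row")
  # one inner array per column
  Polars::DataFrame.new([[1, 2], ["a", "b"]], schema: ["x", "y"], orient: "col")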
@@ -5108,10 +5145,10 @@ module Polars

  def _compare_to_other_df(other, op)
  if columns != other.columns
- raise ArgmentError, "DataFrame columns do not match"
+ raise ArgumentError, "DataFrame columns do not match"
  end
  if shape != other.shape
- raise ArgmentError, "DataFrame dimensions do not match"
+ raise ArgumentError, "DataFrame dimensions do not match"
  end

  suffix = "__POLARS_CMP_OTHER"