polars-df 0.7.0-x86_64-linux → 0.9.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +41 -0
  3. data/Cargo.lock +353 -237
  4. data/Cargo.toml +0 -3
  5. data/LICENSE-THIRD-PARTY.txt +1978 -1459
  6. data/LICENSE.txt +1 -1
  7. data/README.md +2 -2
  8. data/lib/polars/3.1/polars.so +0 -0
  9. data/lib/polars/3.2/polars.so +0 -0
  10. data/lib/polars/{3.0 → 3.3}/polars.so +0 -0
  11. data/lib/polars/array_expr.rb +449 -0
  12. data/lib/polars/array_name_space.rb +346 -0
  13. data/lib/polars/cat_expr.rb +24 -0
  14. data/lib/polars/cat_name_space.rb +75 -0
  15. data/lib/polars/config.rb +2 -2
  16. data/lib/polars/data_frame.rb +248 -108
  17. data/lib/polars/data_types.rb +195 -29
  18. data/lib/polars/date_time_expr.rb +41 -24
  19. data/lib/polars/date_time_name_space.rb +12 -12
  20. data/lib/polars/exceptions.rb +12 -1
  21. data/lib/polars/expr.rb +1080 -195
  22. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  23. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  24. data/lib/polars/functions/as_datatype.rb +248 -0
  25. data/lib/polars/functions/col.rb +47 -0
  26. data/lib/polars/functions/eager.rb +182 -0
  27. data/lib/polars/functions/lazy.rb +1280 -0
  28. data/lib/polars/functions/len.rb +49 -0
  29. data/lib/polars/functions/lit.rb +35 -0
  30. data/lib/polars/functions/random.rb +16 -0
  31. data/lib/polars/functions/range/date_range.rb +103 -0
  32. data/lib/polars/functions/range/int_range.rb +51 -0
  33. data/lib/polars/functions/repeat.rb +144 -0
  34. data/lib/polars/functions/whenthen.rb +27 -0
  35. data/lib/polars/functions.rb +29 -416
  36. data/lib/polars/group_by.rb +3 -3
  37. data/lib/polars/io.rb +21 -28
  38. data/lib/polars/lazy_frame.rb +390 -76
  39. data/lib/polars/list_expr.rb +152 -6
  40. data/lib/polars/list_name_space.rb +102 -0
  41. data/lib/polars/meta_expr.rb +175 -7
  42. data/lib/polars/series.rb +557 -59
  43. data/lib/polars/sql_context.rb +1 -1
  44. data/lib/polars/string_cache.rb +75 -0
  45. data/lib/polars/string_expr.rb +412 -96
  46. data/lib/polars/string_name_space.rb +4 -4
  47. data/lib/polars/struct_expr.rb +1 -1
  48. data/lib/polars/struct_name_space.rb +1 -1
  49. data/lib/polars/testing.rb +507 -0
  50. data/lib/polars/utils.rb +64 -20
  51. data/lib/polars/version.rb +1 -1
  52. data/lib/polars.rb +15 -2
  53. metadata +36 -7
  54. data/lib/polars/lazy_functions.rb +0 -1197
@@ -47,8 +47,8 @@ module Polars
47
47
  end
48
48
 
49
49
  # @private
50
- def self._from_hashes(data, infer_schema_length: 100, schema: nil)
51
- rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema)
50
+ def self._from_hashes(data, infer_schema_length: 100, schema: nil, schema_overrides: nil)
51
+ rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
52
52
  _from_rbdf(rbdf)
53
53
  end
54
54
 
@@ -119,10 +119,10 @@ module Polars
119
119
 
120
120
  processed_null_values = Utils._process_null_values(null_values)
121
121
 
122
- if columns.is_a?(String)
122
+ if columns.is_a?(::String)
123
123
  columns = [columns]
124
124
  end
125
- if file.is_a?(String) && file.include?("*")
125
+ if file.is_a?(::String) && file.include?("*")
126
126
  dtypes_dict = nil
127
127
  if !dtype_list.nil?
128
128
  dtypes_dict = dtype_list.to_h
@@ -206,11 +206,11 @@ module Polars
206
206
  if Utils.pathlike?(source)
207
207
  source = Utils.normalise_filepath(source)
208
208
  end
209
- if columns.is_a?(String)
209
+ if columns.is_a?(::String)
210
210
  columns = [columns]
211
211
  end
212
212
 
213
- if source.is_a?(String) && source.include?("*") && Utils.local_file?(source)
213
+ if source.is_a?(::String) && source.include?("*") && Utils.local_file?(source)
214
214
  scan =
215
215
  Polars.scan_parquet(
216
216
  source,
@@ -269,11 +269,11 @@ module Polars
269
269
  if Utils.pathlike?(file)
270
270
  file = Utils.normalise_filepath(file)
271
271
  end
272
- if columns.is_a?(String)
272
+ if columns.is_a?(::String)
273
273
  columns = [columns]
274
274
  end
275
275
 
276
- if file.is_a?(String) && file.include?("*")
276
+ if file.is_a?(::String) && file.include?("*")
277
277
  raise Todo
278
278
  end
279
279
 
@@ -411,7 +411,7 @@ module Polars
411
411
  # }
412
412
  # )
413
413
  # df.dtypes
414
- # # => [Polars::Int64, Polars::Float64, Polars::Utf8]
414
+ # # => [Polars::Int64, Polars::Float64, Polars::String]
415
415
  def dtypes
416
416
  _df.dtypes
417
417
  end
@@ -429,7 +429,7 @@ module Polars
429
429
  # }
430
430
  # )
431
431
  # df.schema
432
- # # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::Utf8}
432
+ # # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String}
433
433
  def schema
434
434
  columns.zip(dtypes).to_h
435
435
  end
@@ -589,13 +589,13 @@ module Polars
589
589
  return df.slice(row_selection, 1)
590
590
  end
591
591
  # df[2, "a"]
592
- if col_selection.is_a?(String) || col_selection.is_a?(Symbol)
592
+ if col_selection.is_a?(::String) || col_selection.is_a?(Symbol)
593
593
  return self[col_selection][row_selection]
594
594
  end
595
595
  end
596
596
 
597
597
  # column selection can be "a" and ["a", "b"]
598
- if col_selection.is_a?(String) || col_selection.is_a?(Symbol)
598
+ if col_selection.is_a?(::String) || col_selection.is_a?(Symbol)
599
599
  col_selection = [col_selection]
600
600
  end
601
601
 
@@ -621,8 +621,8 @@ module Polars
621
621
 
622
622
  # select single column
623
623
  # df["foo"]
624
- if item.is_a?(String) || item.is_a?(Symbol)
625
- return Utils.wrap_s(_df.column(item.to_s))
624
+ if item.is_a?(::String) || item.is_a?(Symbol)
625
+ return Utils.wrap_s(_df.get_column(item.to_s))
626
626
  end
627
627
 
628
628
  # df[idx]
@@ -647,7 +647,7 @@ module Polars
647
647
 
648
648
  if item.is_a?(Series)
649
649
  dtype = item.dtype
650
- if dtype == Utf8
650
+ if dtype == String
651
651
  return _from_rbdf(_df.select(item))
652
652
  elsif dtype == UInt32
653
653
  return _from_rbdf(_df.take_with_series(item._s))
@@ -698,7 +698,7 @@ module Polars
698
698
  s[row_selection] = value
699
699
 
700
700
  if col_selection.is_a?(Integer)
701
- replace_at_idx(col_selection, s)
701
+ replace_column(col_selection, s)
702
702
  elsif Utils.strlike?(col_selection)
703
703
  replace(col_selection, s)
704
704
  end
@@ -1084,7 +1084,7 @@ module Polars
1084
1084
  # df.estimated_size
1085
1085
  # # => 25888898
1086
1086
  # df.estimated_size("mb")
1087
- # # => 24.689577102661133
1087
+ # # => 26.702880859375
1088
1088
  def estimated_size(unit = "b")
1089
1089
  sz = _df.estimated_size
1090
1090
  Utils.scale_bytes(sz, to: unit)
@@ -1222,7 +1222,7 @@ module Polars
1222
1222
  # @example
1223
1223
  # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
1224
1224
  # s = Polars::Series.new("baz", [97, 98, 99])
1225
- # df.insert_at_idx(1, s)
1225
+ # df.insert_column(1, s)
1226
1226
  # # =>
1227
1227
  # # shape: (3, 3)
1228
1228
  # # ┌─────┬─────┬─────┐
@@ -1244,7 +1244,7 @@ module Polars
1244
1244
  # }
1245
1245
  # )
1246
1246
  # s = Polars::Series.new("d", [-2.5, 15, 20.5, 0])
1247
- # df.insert_at_idx(3, s)
1247
+ # df.insert_column(3, s)
1248
1248
  # # =>
1249
1249
  # # shape: (4, 4)
1250
1250
  # # ┌─────┬──────┬───────┬──────┐
@@ -1257,13 +1257,14 @@ module Polars
1257
1257
  # # │ 3 ┆ 10.0 ┆ false ┆ 20.5 │
1258
1258
  # # │ 4 ┆ 13.0 ┆ true ┆ 0.0 │
1259
1259
  # # └─────┴──────┴───────┴──────┘
1260
- def insert_at_idx(index, series)
1260
+ def insert_column(index, series)
1261
1261
  if index < 0
1262
1262
  index = columns.length + index
1263
1263
  end
1264
- _df.insert_at_idx(index, series._s)
1264
+ _df.insert_column(index, series._s)
1265
1265
  self
1266
1266
  end
1267
+ alias_method :insert_at_idx, :insert_column
1267
1268
 
1268
1269
  # Filter the rows in the DataFrame based on a predicate expression.
1269
1270
  #
@@ -1367,7 +1368,7 @@ module Polars
1367
1368
  ]
1368
1369
  )._df
1369
1370
  )
1370
- summary.insert_at_idx(
1371
+ summary.insert_column(
1371
1372
  0,
1372
1373
  Polars::Series.new(
1373
1374
  "describe",
@@ -1388,11 +1389,12 @@ module Polars
1388
1389
  # df = Polars::DataFrame.new(
1389
1390
  # {"foo" => [1, 2, 3], "bar" => [6, 7, 8], "ham" => ["a", "b", "c"]}
1390
1391
  # )
1391
- # df.find_idx_by_name("ham")
1392
+ # df.get_column_index("ham")
1392
1393
  # # => 2
1393
- def find_idx_by_name(name)
1394
- _df.find_idx_by_name(name)
1394
+ def get_column_index(name)
1395
+ _df.get_column_index(name)
1395
1396
  end
1397
+ alias_method :find_idx_by_name, :get_column_index
1396
1398
 
1397
1399
  # Replace a column at an index location.
1398
1400
  #
@@ -1412,7 +1414,7 @@ module Polars
1412
1414
  # }
1413
1415
  # )
1414
1416
  # s = Polars::Series.new("apple", [10, 20, 30])
1415
- # df.replace_at_idx(0, s)
1417
+ # df.replace_column(0, s)
1416
1418
  # # =>
1417
1419
  # # shape: (3, 3)
1418
1420
  # # ┌───────┬─────┬─────┐
@@ -1424,13 +1426,14 @@ module Polars
1424
1426
  # # │ 20 ┆ 7 ┆ b │
1425
1427
  # # │ 30 ┆ 8 ┆ c │
1426
1428
  # # └───────┴─────┴─────┘
1427
- def replace_at_idx(index, series)
1429
+ def replace_column(index, series)
1428
1430
  if index < 0
1429
1431
  index = columns.length + index
1430
1432
  end
1431
- _df.replace_at_idx(index, series._s)
1433
+ _df.replace_column(index, series._s)
1432
1434
  self
1433
1435
  end
1436
+ alias_method :replace_at_idx, :replace_column
1434
1437
 
1435
1438
  # Sort the DataFrame by column.
1436
1439
  #
@@ -1524,13 +1527,14 @@ module Polars
1524
1527
  # "ham" => ["c", "b", "a"]
1525
1528
  # }
1526
1529
  # )
1527
- # df1.frame_equal(df1)
1530
+ # df1.equals(df1)
1528
1531
  # # => true
1529
- # df1.frame_equal(df2)
1532
+ # df1.equals(df2)
1530
1533
  # # => false
1531
- def frame_equal(other, null_equal: true)
1532
- _df.frame_equal(other._df, null_equal)
1534
+ def equals(other, null_equal: true)
1535
+ _df.equals(other._df, null_equal)
1533
1536
  end
1537
+ alias_method :frame_equal, :equals
1534
1538
 
1535
1539
  # Replace a column by a new Series.
1536
1540
  #
@@ -1716,7 +1720,7 @@ module Polars
1716
1720
  # # │ 3 ┆ 8 ┆ c │
1717
1721
  # # └─────┴─────┴─────┘
1718
1722
  def drop_nulls(subset: nil)
1719
- if subset.is_a?(String)
1723
+ if subset.is_a?(::String)
1720
1724
  subset = [subset]
1721
1725
  end
1722
1726
  _from_rbdf(_df.drop_nulls(subset))
@@ -1778,7 +1782,7 @@ module Polars
1778
1782
  # "b" => [2, 4, 6]
1779
1783
  # }
1780
1784
  # )
1781
- # df.with_row_count
1785
+ # df.with_row_index
1782
1786
  # # =>
1783
1787
  # # shape: (3, 3)
1784
1788
  # # ┌────────┬─────┬─────┐
@@ -1790,9 +1794,10 @@ module Polars
1790
1794
  # # │ 1 ┆ 3 ┆ 4 │
1791
1795
  # # │ 2 ┆ 5 ┆ 6 │
1792
1796
  # # └────────┴─────┴─────┘
1793
- def with_row_count(name: "row_nr", offset: 0)
1794
- _from_rbdf(_df.with_row_count(name, offset))
1797
+ def with_row_index(name: "row_nr", offset: 0)
1798
+ _from_rbdf(_df.with_row_index(name, offset))
1795
1799
  end
1800
+ alias_method :with_row_count, :with_row_index
1796
1801
 
1797
1802
  # Start a group by operation.
1798
1803
  #
@@ -2267,7 +2272,7 @@ module Polars
2267
2272
  if by.nil?
2268
2273
  by = []
2269
2274
  end
2270
- if by.is_a?(String)
2275
+ if by.is_a?(::String)
2271
2276
  by = [by]
2272
2277
  end
2273
2278
  if offset.nil?
@@ -2429,6 +2434,8 @@ module Polars
2429
2434
  # Join strategy.
2430
2435
  # @param suffix [String]
2431
2436
  # Suffix to append to columns with a duplicate name.
2437
+ # @param join_nulls [Boolean]
2438
+ # Join on null values. By default null values will never produce matches.
2432
2439
  #
2433
2440
  # @return [DataFrame]
2434
2441
  #
@@ -2461,17 +2468,17 @@ module Polars
2461
2468
  # @example
2462
2469
  # df.join(other_df, on: "ham", how: "outer")
2463
2470
  # # =>
2464
- # # shape: (4, 4)
2465
- # # ┌──────┬──────┬─────┬───────┐
2466
- # # │ foo ┆ bar ┆ ham ┆ apple │
2467
- # # │ --- ┆ --- ┆ --- ┆ --- │
2468
- # # │ i64 ┆ f64 ┆ str ┆ str │
2469
- # # ╞══════╪══════╪═════╪═══════╡
2470
- # # │ 1 ┆ 6.0 ┆ a ┆ x │
2471
- # # │ 2 ┆ 7.0 ┆ b ┆ y │
2472
- # # │ null ┆ null ┆ d ┆ z │
2473
- # # │ 3 ┆ 8.0 ┆ c ┆ null │
2474
- # # └──────┴──────┴─────┴───────┘
2471
+ # # shape: (4, 5)
2472
+ # # ┌──────┬──────┬──────┬───────┬───────────┐
2473
+ # # │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │
2474
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2475
+ # # │ i64 ┆ f64 ┆ str ┆ str ┆ str │
2476
+ # # ╞══════╪══════╪══════╪═══════╪═══════════╡
2477
+ # # │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │
2478
+ # # │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │
2479
+ # # │ null ┆ null ┆ null ┆ z ┆ d │
2480
+ # # │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │
2481
+ # # └──────┴──────┴──────┴───────┴───────────┘
2475
2482
  #
2476
2483
  # @example
2477
2484
  # df.join(other_df, on: "ham", how: "left")
@@ -2511,7 +2518,7 @@ module Polars
2511
2518
  # # ╞═════╪═════╪═════╡
2512
2519
  # # │ 3 ┆ 8.0 ┆ c │
2513
2520
  # # └─────┴─────┴─────┘
2514
- def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
2521
+ def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right", join_nulls: false)
2515
2522
  lazy
2516
2523
  .join(
2517
2524
  other.lazy,
@@ -2520,6 +2527,7 @@ module Polars
2520
2527
  on: on,
2521
2528
  how: how,
2522
2529
  suffix: suffix,
2530
+ join_nulls: join_nulls
2523
2531
  )
2524
2532
  .collect(no_optimization: true)
2525
2533
  end
@@ -2863,7 +2871,7 @@ module Polars
2863
2871
  # "c" => [true, true, false, nil]
2864
2872
  # }
2865
2873
  # )
2866
- # df.cleared
2874
+ # df.clear
2867
2875
  # # =>
2868
2876
  # # shape: (0, 3)
2869
2877
  # # ┌─────┬─────┬──────┐
@@ -2872,9 +2880,31 @@ module Polars
2872
2880
  # # │ i64 ┆ f64 ┆ bool │
2873
2881
  # # ╞═════╪═════╪══════╡
2874
2882
  # # └─────┴─────┴──────┘
2875
- def cleared
2876
- height > 0 ? head(0) : clone
2883
+ #
2884
+ # @example
2885
+ # df.clear(2)
2886
+ # # =>
2887
+ # # shape: (2, 3)
2888
+ # # ┌──────┬──────┬──────┐
2889
+ # # │ a ┆ b ┆ c │
2890
+ # # │ --- ┆ --- ┆ --- │
2891
+ # # │ i64 ┆ f64 ┆ bool │
2892
+ # # ╞══════╪══════╪══════╡
2893
+ # # │ null ┆ null ┆ null │
2894
+ # # │ null ┆ null ┆ null │
2895
+ # # └──────┴──────┴──────┘
2896
+ def clear(n = 0)
2897
+ if n == 0
2898
+ _from_rbdf(_df.clear)
2899
+ elsif n > 0 || len > 0
2900
+ self.class.new(
2901
+ schema.to_h { |nm, tp| [nm, Series.new(nm, [], dtype: tp).extend_constant(nil, n)] }
2902
+ )
2903
+ else
2904
+ clone
2905
+ end
2877
2906
  end
2907
+ alias_method :cleared, :clear
2878
2908
 
2879
2909
  # clone handled by initialize_copy
2880
2910
 
@@ -3111,17 +3141,17 @@ module Polars
3111
3141
  sort_columns: false,
3112
3142
  separator: "_"
3113
3143
  )
3114
- if values.is_a?(String)
3144
+ if values.is_a?(::String)
3115
3145
  values = [values]
3116
3146
  end
3117
- if index.is_a?(String)
3147
+ if index.is_a?(::String)
3118
3148
  index = [index]
3119
3149
  end
3120
- if columns.is_a?(String)
3150
+ if columns.is_a?(::String)
3121
3151
  columns = [columns]
3122
3152
  end
3123
3153
 
3124
- if aggregate_fn.is_a?(String)
3154
+ if aggregate_fn.is_a?(::String)
3125
3155
  case aggregate_fn
3126
3156
  when "first"
3127
3157
  aggregate_expr = Polars.element.first._rbexpr
@@ -3137,8 +3167,11 @@ module Polars
3137
3167
  aggregate_expr = Polars.element.median._rbexpr
3138
3168
  when "last"
3139
3169
  aggregate_expr = Polars.element.last._rbexpr
3170
+ when "len"
3171
+ aggregate_expr = Polars.len._rbexpr
3140
3172
  when "count"
3141
- aggregate_expr = Polars.count._rbexpr
3173
+ warn "`aggregate_function: \"count\"` input for `pivot` is deprecated. Use `aggregate_function: \"len\"` instead."
3174
+ aggregate_expr = Polars.len._rbexpr
3142
3175
  else
3143
3176
  raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
3144
3177
  end
@@ -3150,9 +3183,9 @@ module Polars
3150
3183
 
3151
3184
  _from_rbdf(
3152
3185
  _df.pivot_expr(
3153
- values,
3154
3186
  index,
3155
3187
  columns,
3188
+ values,
3156
3189
  maintain_order,
3157
3190
  sort_columns,
3158
3191
  aggregate_expr,
@@ -3206,10 +3239,10 @@ module Polars
3206
3239
  # # │ z ┆ c ┆ 6 │
3207
3240
  # # └─────┴──────────┴───────┘
3208
3241
  def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
3209
- if value_vars.is_a?(String)
3242
+ if value_vars.is_a?(::String)
3210
3243
  value_vars = [value_vars]
3211
3244
  end
3212
- if id_vars.is_a?(String)
3245
+ if id_vars.is_a?(::String)
3213
3246
  id_vars = [id_vars]
3214
3247
  end
3215
3248
  if value_vars.nil?
@@ -3423,7 +3456,7 @@ module Polars
3423
3456
  # # │ C ┆ 2 ┆ l │
3424
3457
  # # └─────┴─────┴─────┘}
3425
3458
  def partition_by(groups, maintain_order: true, include_key: true, as_dict: false)
3426
- if groups.is_a?(String)
3459
+ if groups.is_a?(::String)
3427
3460
  groups = [groups]
3428
3461
  elsif !groups.is_a?(::Array)
3429
3462
  groups = Array(groups)
@@ -3587,8 +3620,13 @@ module Polars
3587
3620
 
3588
3621
  # Select columns from this DataFrame.
3589
3622
  #
3590
- # @param exprs [Object]
3591
- # Column or columns to select.
3623
+ # @param exprs [Array]
3624
+ # Column(s) to select, specified as positional arguments.
3625
+ # Accepts expression input. Strings are parsed as column names,
3626
+ # other non-expression inputs are parsed as literals.
3627
+ # @param named_exprs [Hash]
3628
+ # Additional columns to select, specified as keyword arguments.
3629
+ # The columns will be renamed to the keyword used.
3592
3630
  #
3593
3631
  # @return [DataFrame]
3594
3632
  #
@@ -3668,23 +3706,25 @@ module Polars
3668
3706
  # # │ 0 │
3669
3707
  # # │ 10 │
3670
3708
  # # └─────────┘
3671
- def select(exprs)
3672
- _from_rbdf(
3673
- lazy
3674
- .select(exprs)
3675
- .collect(no_optimization: true, string_cache: false)
3676
- ._df
3677
- )
3709
+ def select(*exprs, **named_exprs)
3710
+ lazy.select(*exprs, **named_exprs).collect(_eager: true)
3678
3711
  end
3679
3712
 
3680
- # Add or overwrite multiple columns in a DataFrame.
3713
+ # Add columns to this DataFrame.
3714
+ #
3715
+ # Added columns will replace existing columns with the same name.
3681
3716
  #
3682
3717
  # @param exprs [Array]
3683
- # Array of Expressions that evaluate to columns.
3718
+ # Column(s) to add, specified as positional arguments.
3719
+ # Accepts expression input. Strings are parsed as column names, other
3720
+ # non-expression inputs are parsed as literals.
3721
+ # @param named_exprs [Hash]
3722
+ # Additional columns to add, specified as keyword arguments.
3723
+ # The columns will be renamed to the keyword used.
3684
3724
  #
3685
3725
  # @return [DataFrame]
3686
3726
  #
3687
- # @example
3727
+ # @example Pass an expression to add it as a new column.
3688
3728
  # df = Polars::DataFrame.new(
3689
3729
  # {
3690
3730
  # "a" => [1, 2, 3, 4],
@@ -3692,11 +3732,41 @@ module Polars
3692
3732
  # "c" => [true, true, false, true]
3693
3733
  # }
3694
3734
  # )
3735
+ # df.with_columns((Polars.col("a") ** 2).alias("a^2"))
3736
+ # # =>
3737
+ # # shape: (4, 4)
3738
+ # # ┌─────┬──────┬───────┬──────┐
3739
+ # # │ a ┆ b ┆ c ┆ a^2 │
3740
+ # # │ --- ┆ --- ┆ --- ┆ --- │
3741
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 │
3742
+ # # ╞═════╪══════╪═══════╪══════╡
3743
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 │
3744
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 │
3745
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 │
3746
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 │
3747
+ # # └─────┴──────┴───────┴──────┘
3748
+ #
3749
+ # @example Added columns will replace existing columns with the same name.
3750
+ # df.with_columns(Polars.col("a").cast(Polars::Float64))
3751
+ # # =>
3752
+ # # shape: (4, 3)
3753
+ # # ┌─────┬──────┬───────┐
3754
+ # # │ a ┆ b ┆ c │
3755
+ # # │ --- ┆ --- ┆ --- │
3756
+ # # │ f64 ┆ f64 ┆ bool │
3757
+ # # ╞═════╪══════╪═══════╡
3758
+ # # │ 1.0 ┆ 0.5 ┆ true │
3759
+ # # │ 2.0 ┆ 4.0 ┆ true │
3760
+ # # │ 3.0 ┆ 10.0 ┆ false │
3761
+ # # │ 4.0 ┆ 13.0 ┆ true │
3762
+ # # └─────┴──────┴───────┘
3763
+ #
3764
+ # @example Multiple columns can be added by passing a list of expressions.
3695
3765
  # df.with_columns(
3696
3766
  # [
3697
3767
  # (Polars.col("a") ** 2).alias("a^2"),
3698
3768
  # (Polars.col("b") / 2).alias("b/2"),
3699
- # (Polars.col("c").is_not).alias("not c")
3769
+ # (Polars.col("c").not_).alias("not c"),
3700
3770
  # ]
3701
3771
  # )
3702
3772
  # # =>
@@ -3711,13 +3781,45 @@ module Polars
3711
3781
  # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
3712
3782
  # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
3713
3783
  # # └─────┴──────┴───────┴──────┴──────┴───────┘
3714
- def with_columns(exprs)
3715
- if !exprs.nil? && !exprs.is_a?(::Array)
3716
- exprs = [exprs]
3717
- end
3718
- lazy
3719
- .with_columns(exprs)
3720
- .collect(no_optimization: true, string_cache: false)
3784
+ #
3785
+ # @example Multiple columns also can be added using positional arguments instead of a list.
3786
+ # df.with_columns(
3787
+ # (Polars.col("a") ** 2).alias("a^2"),
3788
+ # (Polars.col("b") / 2).alias("b/2"),
3789
+ # (Polars.col("c").not_).alias("not c"),
3790
+ # )
3791
+ # # =>
3792
+ # # shape: (4, 6)
3793
+ # # ┌─────┬──────┬───────┬──────┬──────┬───────┐
3794
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
3795
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3796
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
3797
+ # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
3798
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
3799
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
3800
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
3801
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
3802
+ # # └─────┴──────┴───────┴──────┴──────┴───────┘
3803
+ #
3804
+ # @example Use keyword arguments to easily name your expression inputs.
3805
+ # df.with_columns(
3806
+ # ab: Polars.col("a") * Polars.col("b"),
3807
+ # not_c: Polars.col("c").not_
3808
+ # )
3809
+ # # =>
3810
+ # # shape: (4, 5)
3811
+ # # ┌─────┬──────┬───────┬──────┬───────┐
3812
+ # # │ a ┆ b ┆ c ┆ ab ┆ not_c │
3813
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3814
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │
3815
+ # # ╞═════╪══════╪═══════╪══════╪═══════╡
3816
+ # # │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │
3817
+ # # │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │
3818
+ # # │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │
3819
+ # # │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │
3820
+ # # └─────┴──────┴───────┴──────┴───────┘
3821
+ def with_columns(*exprs, **named_exprs)
3822
+ lazy.with_columns(*exprs, **named_exprs).collect(_eager: true)
3721
3823
  end
3722
3824
 
3723
3825
  # Get number of chunks used by the ChunkedArrays of this DataFrame.
@@ -3774,7 +3876,7 @@ module Polars
3774
3876
  # # └─────┴─────┴─────┘
3775
3877
  def max(axis: 0)
3776
3878
  if axis == 0
3777
- _from_rbdf(_df.max)
3879
+ lazy.max.collect(_eager: true)
3778
3880
  elsif axis == 1
3779
3881
  Utils.wrap_s(_df.max_horizontal)
3780
3882
  else
@@ -3806,7 +3908,7 @@ module Polars
3806
3908
  # # └─────┴─────┴─────┘
3807
3909
  def min(axis: 0)
3808
3910
  if axis == 0
3809
- _from_rbdf(_df.min)
3911
+ lazy.min.collect(_eager: true)
3810
3912
  elsif axis == 1
3811
3913
  Utils.wrap_s(_df.min_horizontal)
3812
3914
  else
@@ -3855,7 +3957,7 @@ module Polars
3855
3957
  def sum(axis: 0, null_strategy: "ignore")
3856
3958
  case axis
3857
3959
  when 0
3858
- _from_rbdf(_df.sum)
3960
+ lazy.sum.collect(_eager: true)
3859
3961
  when 1
3860
3962
  Utils.wrap_s(_df.sum_horizontal(null_strategy))
3861
3963
  else
@@ -3893,7 +3995,7 @@ module Polars
3893
3995
  def mean(axis: 0, null_strategy: "ignore")
3894
3996
  case axis
3895
3997
  when 0
3896
- _from_rbdf(_df.mean)
3998
+ lazy.mean.collect(_eager: true)
3897
3999
  when 1
3898
4000
  Utils.wrap_s(_df.mean_horizontal(null_strategy))
3899
4001
  else
@@ -3939,7 +4041,7 @@ module Polars
3939
4041
  # # │ 0.816497 ┆ 0.816497 ┆ null │
3940
4042
  # # └──────────┴──────────┴──────┘
3941
4043
  def std(ddof: 1)
3942
- _from_rbdf(_df.std(ddof))
4044
+ lazy.std(ddof: ddof).collect(_eager: true)
3943
4045
  end
3944
4046
 
3945
4047
  # Aggregate the columns of this DataFrame to their variance value.
@@ -3980,7 +4082,7 @@ module Polars
3980
4082
  # # │ 0.666667 ┆ 0.666667 ┆ null │
3981
4083
  # # └──────────┴──────────┴──────┘
3982
4084
  def var(ddof: 1)
3983
- _from_rbdf(_df.var(ddof))
4085
+ lazy.var(ddof: ddof).collect(_eager: true)
3984
4086
  end
3985
4087
 
3986
4088
  # Aggregate the columns of this DataFrame to their median value.
@@ -4006,7 +4108,7 @@ module Polars
4006
4108
  # # │ 2.0 ┆ 7.0 ┆ null │
4007
4109
  # # └─────┴─────┴──────┘
4008
4110
  def median
4009
- _from_rbdf(_df.median)
4111
+ lazy.median.collect(_eager: true)
4010
4112
  end
4011
4113
 
4012
4114
  # Aggregate the columns of this DataFrame to their product values.
@@ -4063,7 +4165,7 @@ module Polars
4063
4165
  # # │ 2.0 ┆ 7.0 ┆ null │
4064
4166
  # # └─────┴─────┴──────┘
4065
4167
  def quantile(quantile, interpolation: "nearest")
4066
- _from_rbdf(_df.quantile(quantile, interpolation))
4168
+ lazy.quantile(quantile, interpolation: interpolation).collect(_eager: true)
4067
4169
  end
4068
4170
 
4069
4171
  # Get one hot encoded dummy variables.
@@ -4094,7 +4196,7 @@ module Polars
4094
4196
  # # │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │
4095
4197
  # # └───────┴───────┴───────┴───────┴───────┴───────┘
4096
4198
  def to_dummies(columns: nil, separator: "_", drop_first: false)
4097
- if columns.is_a?(String)
4199
+ if columns.is_a?(::String)
4098
4200
  columns = [columns]
4099
4201
  end
4100
4202
  _from_rbdf(_df.to_dummies(columns, separator, drop_first))
@@ -4359,7 +4461,7 @@ module Polars
4359
4461
  # # null
4360
4462
  # # ]
4361
4463
  #
4362
- # @example A horizontal boolean or, similar to a row-wise .any():
4464
+ # @example A horizontal boolean or, similar to a row-wise .any:
4363
4465
  # df = Polars::DataFrame.new(
4364
4466
  # {
4365
4467
  # "a" => [false, false, true],
@@ -4482,7 +4584,7 @@ module Polars
4482
4584
  # # => [{"a"=>1, "b"=>2}, {"a"=>3, "b"=>4}, {"a"=>5, "b"=>6}]
4483
4585
  def rows(named: false)
4484
4586
  if named
4485
- columns = columns()
4587
+ columns = self.columns
4486
4588
  _df.row_tuples.map do |v|
4487
4589
  columns.zip(v).to_h
4488
4590
  end
@@ -4523,7 +4625,7 @@ module Polars
4523
4625
  return to_enum(:iter_rows, named: named, buffer_size: buffer_size) unless block_given?
4524
4626
 
4525
4627
  # load into the local namespace for a modest performance boost in the hot loops
4526
- columns = columns()
4628
+ columns = self.columns
4527
4629
 
4528
4630
  # note: buffering rows results in a 2-4x speedup over individual calls
4529
4631
  # to ".row(i)", so it should only be disabled in extremely specific cases.
@@ -4603,8 +4705,8 @@ module Polars
4603
4705
  # # │ 1 ┆ 5 │
4604
4706
  # # │ 3 ┆ 7 │
4605
4707
  # # └─────┴─────┘
4606
- def gather_every(n)
4607
- select(Utils.col("*").gather_every(n))
4708
+ def gather_every(n, offset = 0)
4709
+ select(Utils.col("*").gather_every(n, offset))
4608
4710
  end
4609
4711
  alias_method :take_every, :gather_every
4610
4712
 
@@ -4754,19 +4856,57 @@ module Polars
4754
4856
  # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
4755
4857
  # # └────────┴─────┴─────┴──────┴───────────┴───────┘
4756
4858
  def unnest(names)
4757
- if names.is_a?(String)
4859
+ if names.is_a?(::String)
4758
4860
  names = [names]
4759
4861
  end
4760
4862
  _from_rbdf(_df.unnest(names))
4761
4863
  end
4762
4864
 
4763
- # TODO
4865
+ # Requires NumPy
4764
4866
  # def corr
4765
4867
  # end
4766
4868
 
4767
- # TODO
4768
- # def merge_sorted
4769
- # end
4869
+ # Take two sorted DataFrames and merge them by the sorted key.
4870
+ #
4871
+ # The output of this operation will also be sorted.
4872
+ # It is the callers responsibility that the frames are sorted
4873
+ # by that key otherwise the output will not make sense.
4874
+ #
4875
+ # The schemas of both DataFrames must be equal.
4876
+ #
4877
+ # @param other [DataFrame]
4878
+ # Other DataFrame that must be merged
4879
+ # @param key [String]
4880
+ # Key that is sorted.
4881
+ #
4882
+ # @return [DataFrame]
4883
+ #
4884
+ # @example
4885
+ # df0 = Polars::DataFrame.new(
4886
+ # {"name" => ["steve", "elise", "bob"], "age" => [42, 44, 18]}
4887
+ # ).sort("age")
4888
+ # df1 = Polars::DataFrame.new(
4889
+ # {"name" => ["anna", "megan", "steve", "thomas"], "age" => [21, 33, 42, 20]}
4890
+ # ).sort("age")
4891
+ # df0.merge_sorted(df1, "age")
4892
+ # # =>
4893
+ # # shape: (7, 2)
4894
+ # # ┌────────┬─────┐
4895
+ # # │ name ┆ age │
4896
+ # # │ --- ┆ --- │
4897
+ # # │ str ┆ i64 │
4898
+ # # ╞════════╪═════╡
4899
+ # # │ bob ┆ 18 │
4900
+ # # │ thomas ┆ 20 │
4901
+ # # │ anna ┆ 21 │
4902
+ # # │ megan ┆ 33 │
4903
+ # # │ steve ┆ 42 │
4904
+ # # │ steve ┆ 42 │
4905
+ # # │ elise ┆ 44 │
4906
+ # # └────────┴─────┘
4907
+ def merge_sorted(other, key)
4908
+ lazy.merge_sorted(other.lazy, key).collect(_eager: true)
4909
+ end
4770
4910
 
4771
4911
  # Indicate that one or multiple columns are sorted.
4772
4912
  #
@@ -4808,7 +4948,7 @@ module Polars
4808
4948
  end
4809
4949
 
4810
4950
  def _pos_idxs(idxs, dim)
4811
- idx_type = Polars._get_idx_type
4951
+ idx_type = Plr.get_index_type
4812
4952
 
4813
4953
  if idxs.is_a?(Series)
4814
4954
  if idxs.dtype == idx_type
@@ -4867,10 +5007,10 @@ module Polars
4867
5007
  if val.is_a?(Hash) && dtype != Struct
4868
5008
  updated_data[name] = DataFrame.new(val).to_struct(name)
4869
5009
  elsif !Utils.arrlen(val).nil?
4870
- updated_data[name] = Series.new(String.new(name), val, dtype: dtype)
4871
- elsif val.nil? || [Integer, Float, TrueClass, FalseClass, String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
5010
+ updated_data[name] = Series.new(::String.new(name), val, dtype: dtype)
5011
+ elsif val.nil? || [Integer, Float, TrueClass, FalseClass, ::String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
4872
5012
  dtype = Polars::Float64 if val.nil? && dtype.nil?
4873
- updated_data[name] = Series.new(String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
5013
+ updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
4874
5014
  else
4875
5015
  raise Todo
4876
5016
  end
@@ -4927,7 +5067,7 @@ module Polars
4927
5067
  end
4928
5068
  column_names =
4929
5069
  (schema || []).map.with_index do |col, i|
4930
- if col.is_a?(String)
5070
+ if col.is_a?(::String)
4931
5071
  col || "column_#{i}"
4932
5072
  else
4933
5073
  col[0]
@@ -4940,7 +5080,7 @@ module Polars
4940
5080
  lookup = column_names.zip(lookup_names || []).to_h
4941
5081
 
4942
5082
  column_dtypes =
4943
- (schema || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
5083
+ (schema || []).select { |col| !col.is_a?(::String) && col[1] }.to_h do |col|
4944
5084
  [lookup[col[0]] || col[0], col[1]]
4945
5085
  end
4946
5086
 
@@ -5041,14 +5181,14 @@ module Polars
5041
5181
  elsif data[0].is_a?(Hash)
5042
5182
  column_names, dtypes = _unpack_schema(columns)
5043
5183
  schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
5044
- rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema_overrides)
5184
+ rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
5045
5185
  if column_names
5046
5186
  rbdf = _post_apply_columns(rbdf, column_names)
5047
5187
  end
5048
5188
  return rbdf
5049
5189
  elsif data[0].is_a?(::Array)
5190
+ first_element = data[0]
5050
5191
  if orient.nil? && !columns.nil?
5051
- first_element = data[0]
5052
5192
  row_types = first_element.filter_map { |value| value.class }.uniq
5053
5193
  if row_types.include?(Integer) && row_types.include?(Float)
5054
5194
  row_types.delete(Integer)