polars-df 0.7.0-x86_64-darwin → 0.9.0-x86_64-darwin

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +41 -0
  3. data/Cargo.lock +353 -237
  4. data/Cargo.toml +0 -3
  5. data/LICENSE-THIRD-PARTY.txt +4014 -3495
  6. data/LICENSE.txt +1 -1
  7. data/README.md +2 -2
  8. data/lib/polars/3.1/polars.bundle +0 -0
  9. data/lib/polars/3.2/polars.bundle +0 -0
  10. data/lib/polars/{3.0 → 3.3}/polars.bundle +0 -0
  11. data/lib/polars/array_expr.rb +449 -0
  12. data/lib/polars/array_name_space.rb +346 -0
  13. data/lib/polars/cat_expr.rb +24 -0
  14. data/lib/polars/cat_name_space.rb +75 -0
  15. data/lib/polars/config.rb +2 -2
  16. data/lib/polars/data_frame.rb +248 -108
  17. data/lib/polars/data_types.rb +195 -29
  18. data/lib/polars/date_time_expr.rb +41 -24
  19. data/lib/polars/date_time_name_space.rb +12 -12
  20. data/lib/polars/exceptions.rb +12 -1
  21. data/lib/polars/expr.rb +1080 -195
  22. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  23. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  24. data/lib/polars/functions/as_datatype.rb +248 -0
  25. data/lib/polars/functions/col.rb +47 -0
  26. data/lib/polars/functions/eager.rb +182 -0
  27. data/lib/polars/functions/lazy.rb +1280 -0
  28. data/lib/polars/functions/len.rb +49 -0
  29. data/lib/polars/functions/lit.rb +35 -0
  30. data/lib/polars/functions/random.rb +16 -0
  31. data/lib/polars/functions/range/date_range.rb +103 -0
  32. data/lib/polars/functions/range/int_range.rb +51 -0
  33. data/lib/polars/functions/repeat.rb +144 -0
  34. data/lib/polars/functions/whenthen.rb +27 -0
  35. data/lib/polars/functions.rb +29 -416
  36. data/lib/polars/group_by.rb +3 -3
  37. data/lib/polars/io.rb +21 -28
  38. data/lib/polars/lazy_frame.rb +390 -76
  39. data/lib/polars/list_expr.rb +152 -6
  40. data/lib/polars/list_name_space.rb +102 -0
  41. data/lib/polars/meta_expr.rb +175 -7
  42. data/lib/polars/series.rb +557 -59
  43. data/lib/polars/sql_context.rb +1 -1
  44. data/lib/polars/string_cache.rb +75 -0
  45. data/lib/polars/string_expr.rb +412 -96
  46. data/lib/polars/string_name_space.rb +4 -4
  47. data/lib/polars/struct_expr.rb +1 -1
  48. data/lib/polars/struct_name_space.rb +1 -1
  49. data/lib/polars/testing.rb +507 -0
  50. data/lib/polars/utils.rb +64 -20
  51. data/lib/polars/version.rb +1 -1
  52. data/lib/polars.rb +15 -2
  53. metadata +36 -7
  54. data/lib/polars/lazy_functions.rb +0 -1197
@@ -47,8 +47,8 @@ module Polars
47
47
  end
48
48
 
49
49
  # @private
50
- def self._from_hashes(data, infer_schema_length: 100, schema: nil)
51
- rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema)
50
+ def self._from_hashes(data, infer_schema_length: 100, schema: nil, schema_overrides: nil)
51
+ rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
52
52
  _from_rbdf(rbdf)
53
53
  end
54
54
 
@@ -119,10 +119,10 @@ module Polars
119
119
 
120
120
  processed_null_values = Utils._process_null_values(null_values)
121
121
 
122
- if columns.is_a?(String)
122
+ if columns.is_a?(::String)
123
123
  columns = [columns]
124
124
  end
125
- if file.is_a?(String) && file.include?("*")
125
+ if file.is_a?(::String) && file.include?("*")
126
126
  dtypes_dict = nil
127
127
  if !dtype_list.nil?
128
128
  dtypes_dict = dtype_list.to_h
@@ -206,11 +206,11 @@ module Polars
206
206
  if Utils.pathlike?(source)
207
207
  source = Utils.normalise_filepath(source)
208
208
  end
209
- if columns.is_a?(String)
209
+ if columns.is_a?(::String)
210
210
  columns = [columns]
211
211
  end
212
212
 
213
- if source.is_a?(String) && source.include?("*") && Utils.local_file?(source)
213
+ if source.is_a?(::String) && source.include?("*") && Utils.local_file?(source)
214
214
  scan =
215
215
  Polars.scan_parquet(
216
216
  source,
@@ -269,11 +269,11 @@ module Polars
269
269
  if Utils.pathlike?(file)
270
270
  file = Utils.normalise_filepath(file)
271
271
  end
272
- if columns.is_a?(String)
272
+ if columns.is_a?(::String)
273
273
  columns = [columns]
274
274
  end
275
275
 
276
- if file.is_a?(String) && file.include?("*")
276
+ if file.is_a?(::String) && file.include?("*")
277
277
  raise Todo
278
278
  end
279
279
 
@@ -411,7 +411,7 @@ module Polars
411
411
  # }
412
412
  # )
413
413
  # df.dtypes
414
- # # => [Polars::Int64, Polars::Float64, Polars::Utf8]
414
+ # # => [Polars::Int64, Polars::Float64, Polars::String]
415
415
  def dtypes
416
416
  _df.dtypes
417
417
  end
@@ -429,7 +429,7 @@ module Polars
429
429
  # }
430
430
  # )
431
431
  # df.schema
432
- # # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::Utf8}
432
+ # # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String}
433
433
  def schema
434
434
  columns.zip(dtypes).to_h
435
435
  end
@@ -589,13 +589,13 @@ module Polars
589
589
  return df.slice(row_selection, 1)
590
590
  end
591
591
  # df[2, "a"]
592
- if col_selection.is_a?(String) || col_selection.is_a?(Symbol)
592
+ if col_selection.is_a?(::String) || col_selection.is_a?(Symbol)
593
593
  return self[col_selection][row_selection]
594
594
  end
595
595
  end
596
596
 
597
597
  # column selection can be "a" and ["a", "b"]
598
- if col_selection.is_a?(String) || col_selection.is_a?(Symbol)
598
+ if col_selection.is_a?(::String) || col_selection.is_a?(Symbol)
599
599
  col_selection = [col_selection]
600
600
  end
601
601
 
@@ -621,8 +621,8 @@ module Polars
621
621
 
622
622
  # select single column
623
623
  # df["foo"]
624
- if item.is_a?(String) || item.is_a?(Symbol)
625
- return Utils.wrap_s(_df.column(item.to_s))
624
+ if item.is_a?(::String) || item.is_a?(Symbol)
625
+ return Utils.wrap_s(_df.get_column(item.to_s))
626
626
  end
627
627
 
628
628
  # df[idx]
@@ -647,7 +647,7 @@ module Polars
647
647
 
648
648
  if item.is_a?(Series)
649
649
  dtype = item.dtype
650
- if dtype == Utf8
650
+ if dtype == String
651
651
  return _from_rbdf(_df.select(item))
652
652
  elsif dtype == UInt32
653
653
  return _from_rbdf(_df.take_with_series(item._s))
@@ -698,7 +698,7 @@ module Polars
698
698
  s[row_selection] = value
699
699
 
700
700
  if col_selection.is_a?(Integer)
701
- replace_at_idx(col_selection, s)
701
+ replace_column(col_selection, s)
702
702
  elsif Utils.strlike?(col_selection)
703
703
  replace(col_selection, s)
704
704
  end
@@ -1084,7 +1084,7 @@ module Polars
1084
1084
  # df.estimated_size
1085
1085
  # # => 25888898
1086
1086
  # df.estimated_size("mb")
1087
- # # => 24.689577102661133
1087
+ # # => 26.702880859375
1088
1088
  def estimated_size(unit = "b")
1089
1089
  sz = _df.estimated_size
1090
1090
  Utils.scale_bytes(sz, to: unit)
@@ -1222,7 +1222,7 @@ module Polars
1222
1222
  # @example
1223
1223
  # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
1224
1224
  # s = Polars::Series.new("baz", [97, 98, 99])
1225
- # df.insert_at_idx(1, s)
1225
+ # df.insert_column(1, s)
1226
1226
  # # =>
1227
1227
  # # shape: (3, 3)
1228
1228
  # # ┌─────┬─────┬─────┐
@@ -1244,7 +1244,7 @@ module Polars
1244
1244
  # }
1245
1245
  # )
1246
1246
  # s = Polars::Series.new("d", [-2.5, 15, 20.5, 0])
1247
- # df.insert_at_idx(3, s)
1247
+ # df.insert_column(3, s)
1248
1248
  # # =>
1249
1249
  # # shape: (4, 4)
1250
1250
  # # ┌─────┬──────┬───────┬──────┐
@@ -1257,13 +1257,14 @@ module Polars
1257
1257
  # # │ 3 ┆ 10.0 ┆ false ┆ 20.5 │
1258
1258
  # # │ 4 ┆ 13.0 ┆ true ┆ 0.0 │
1259
1259
  # # └─────┴──────┴───────┴──────┘
1260
- def insert_at_idx(index, series)
1260
+ def insert_column(index, series)
1261
1261
  if index < 0
1262
1262
  index = columns.length + index
1263
1263
  end
1264
- _df.insert_at_idx(index, series._s)
1264
+ _df.insert_column(index, series._s)
1265
1265
  self
1266
1266
  end
1267
+ alias_method :insert_at_idx, :insert_column
1267
1268
 
1268
1269
  # Filter the rows in the DataFrame based on a predicate expression.
1269
1270
  #
@@ -1367,7 +1368,7 @@ module Polars
1367
1368
  ]
1368
1369
  )._df
1369
1370
  )
1370
- summary.insert_at_idx(
1371
+ summary.insert_column(
1371
1372
  0,
1372
1373
  Polars::Series.new(
1373
1374
  "describe",
@@ -1388,11 +1389,12 @@ module Polars
1388
1389
  # df = Polars::DataFrame.new(
1389
1390
  # {"foo" => [1, 2, 3], "bar" => [6, 7, 8], "ham" => ["a", "b", "c"]}
1390
1391
  # )
1391
- # df.find_idx_by_name("ham")
1392
+ # df.get_column_index("ham")
1392
1393
  # # => 2
1393
- def find_idx_by_name(name)
1394
- _df.find_idx_by_name(name)
1394
+ def get_column_index(name)
1395
+ _df.get_column_index(name)
1395
1396
  end
1397
+ alias_method :find_idx_by_name, :get_column_index
1396
1398
 
1397
1399
  # Replace a column at an index location.
1398
1400
  #
@@ -1412,7 +1414,7 @@ module Polars
1412
1414
  # }
1413
1415
  # )
1414
1416
  # s = Polars::Series.new("apple", [10, 20, 30])
1415
- # df.replace_at_idx(0, s)
1417
+ # df.replace_column(0, s)
1416
1418
  # # =>
1417
1419
  # # shape: (3, 3)
1418
1420
  # # ┌───────┬─────┬─────┐
@@ -1424,13 +1426,14 @@ module Polars
1424
1426
  # # │ 20 ┆ 7 ┆ b │
1425
1427
  # # │ 30 ┆ 8 ┆ c │
1426
1428
  # # └───────┴─────┴─────┘
1427
- def replace_at_idx(index, series)
1429
+ def replace_column(index, series)
1428
1430
  if index < 0
1429
1431
  index = columns.length + index
1430
1432
  end
1431
- _df.replace_at_idx(index, series._s)
1433
+ _df.replace_column(index, series._s)
1432
1434
  self
1433
1435
  end
1436
+ alias_method :replace_at_idx, :replace_column
1434
1437
 
1435
1438
  # Sort the DataFrame by column.
1436
1439
  #
@@ -1524,13 +1527,14 @@ module Polars
1524
1527
  # "ham" => ["c", "b", "a"]
1525
1528
  # }
1526
1529
  # )
1527
- # df1.frame_equal(df1)
1530
+ # df1.equals(df1)
1528
1531
  # # => true
1529
- # df1.frame_equal(df2)
1532
+ # df1.equals(df2)
1530
1533
  # # => false
1531
- def frame_equal(other, null_equal: true)
1532
- _df.frame_equal(other._df, null_equal)
1534
+ def equals(other, null_equal: true)
1535
+ _df.equals(other._df, null_equal)
1533
1536
  end
1537
+ alias_method :frame_equal, :equals
1534
1538
 
1535
1539
  # Replace a column by a new Series.
1536
1540
  #
@@ -1716,7 +1720,7 @@ module Polars
1716
1720
  # # │ 3 ┆ 8 ┆ c │
1717
1721
  # # └─────┴─────┴─────┘
1718
1722
  def drop_nulls(subset: nil)
1719
- if subset.is_a?(String)
1723
+ if subset.is_a?(::String)
1720
1724
  subset = [subset]
1721
1725
  end
1722
1726
  _from_rbdf(_df.drop_nulls(subset))
@@ -1778,7 +1782,7 @@ module Polars
1778
1782
  # "b" => [2, 4, 6]
1779
1783
  # }
1780
1784
  # )
1781
- # df.with_row_count
1785
+ # df.with_row_index
1782
1786
  # # =>
1783
1787
  # # shape: (3, 3)
1784
1788
  # # ┌────────┬─────┬─────┐
@@ -1790,9 +1794,10 @@ module Polars
1790
1794
  # # │ 1 ┆ 3 ┆ 4 │
1791
1795
  # # │ 2 ┆ 5 ┆ 6 │
1792
1796
  # # └────────┴─────┴─────┘
1793
- def with_row_count(name: "row_nr", offset: 0)
1794
- _from_rbdf(_df.with_row_count(name, offset))
1797
+ def with_row_index(name: "row_nr", offset: 0)
1798
+ _from_rbdf(_df.with_row_index(name, offset))
1795
1799
  end
1800
+ alias_method :with_row_count, :with_row_index
1796
1801
 
1797
1802
  # Start a group by operation.
1798
1803
  #
@@ -2267,7 +2272,7 @@ module Polars
2267
2272
  if by.nil?
2268
2273
  by = []
2269
2274
  end
2270
- if by.is_a?(String)
2275
+ if by.is_a?(::String)
2271
2276
  by = [by]
2272
2277
  end
2273
2278
  if offset.nil?
@@ -2429,6 +2434,8 @@ module Polars
2429
2434
  # Join strategy.
2430
2435
  # @param suffix [String]
2431
2436
  # Suffix to append to columns with a duplicate name.
2437
+ # @param join_nulls [Boolean]
2438
+ # Join on null values. By default null values will never produce matches.
2432
2439
  #
2433
2440
  # @return [DataFrame]
2434
2441
  #
@@ -2461,17 +2468,17 @@ module Polars
2461
2468
  # @example
2462
2469
  # df.join(other_df, on: "ham", how: "outer")
2463
2470
  # # =>
2464
- # # shape: (4, 4)
2465
- # # ┌──────┬──────┬─────┬───────┐
2466
- # # │ foo ┆ bar ┆ ham ┆ apple │
2467
- # # │ --- ┆ --- ┆ --- ┆ --- │
2468
- # # │ i64 ┆ f64 ┆ str ┆ str │
2469
- # # ╞══════╪══════╪═════╪═══════╡
2470
- # # │ 1 ┆ 6.0 ┆ a ┆ x │
2471
- # # │ 2 ┆ 7.0 ┆ b ┆ y │
2472
- # # │ null ┆ null ┆ d ┆ z │
2473
- # # │ 3 ┆ 8.0 ┆ c ┆ null │
2474
- # # └──────┴──────┴─────┴───────┘
2471
+ # # shape: (4, 5)
2472
+ # # ┌──────┬──────┬──────┬───────┬───────────┐
2473
+ # # │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │
2474
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
2475
+ # # │ i64 ┆ f64 ┆ str ┆ str ┆ str │
2476
+ # # ╞══════╪══════╪══════╪═══════╪═══════════╡
2477
+ # # │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │
2478
+ # # │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │
2479
+ # # │ null ┆ null ┆ null ┆ z ┆ d │
2480
+ # # │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │
2481
+ # # └──────┴──────┴──────┴───────┴───────────┘
2475
2482
  #
2476
2483
  # @example
2477
2484
  # df.join(other_df, on: "ham", how: "left")
@@ -2511,7 +2518,7 @@ module Polars
2511
2518
  # # ╞═════╪═════╪═════╡
2512
2519
  # # │ 3 ┆ 8.0 ┆ c │
2513
2520
  # # └─────┴─────┴─────┘
2514
- def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
2521
+ def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right", join_nulls: false)
2515
2522
  lazy
2516
2523
  .join(
2517
2524
  other.lazy,
@@ -2520,6 +2527,7 @@ module Polars
2520
2527
  on: on,
2521
2528
  how: how,
2522
2529
  suffix: suffix,
2530
+ join_nulls: join_nulls
2523
2531
  )
2524
2532
  .collect(no_optimization: true)
2525
2533
  end
@@ -2863,7 +2871,7 @@ module Polars
2863
2871
  # "c" => [true, true, false, nil]
2864
2872
  # }
2865
2873
  # )
2866
- # df.cleared
2874
+ # df.clear
2867
2875
  # # =>
2868
2876
  # # shape: (0, 3)
2869
2877
  # # ┌─────┬─────┬──────┐
@@ -2872,9 +2880,31 @@ module Polars
2872
2880
  # # │ i64 ┆ f64 ┆ bool │
2873
2881
  # # ╞═════╪═════╪══════╡
2874
2882
  # # └─────┴─────┴──────┘
2875
- def cleared
2876
- height > 0 ? head(0) : clone
2883
+ #
2884
+ # @example
2885
+ # df.clear(2)
2886
+ # # =>
2887
+ # # shape: (2, 3)
2888
+ # # ┌──────┬──────┬──────┐
2889
+ # # │ a ┆ b ┆ c │
2890
+ # # │ --- ┆ --- ┆ --- │
2891
+ # # │ i64 ┆ f64 ┆ bool │
2892
+ # # ╞══════╪══════╪══════╡
2893
+ # # │ null ┆ null ┆ null │
2894
+ # # │ null ┆ null ┆ null │
2895
+ # # └──────┴──────┴──────┘
2896
+ def clear(n = 0)
2897
+ if n == 0
2898
+ _from_rbdf(_df.clear)
2899
+ elsif n > 0 || len > 0
2900
+ self.class.new(
2901
+ schema.to_h { |nm, tp| [nm, Series.new(nm, [], dtype: tp).extend_constant(nil, n)] }
2902
+ )
2903
+ else
2904
+ clone
2905
+ end
2877
2906
  end
2907
+ alias_method :cleared, :clear
2878
2908
 
2879
2909
  # clone handled by initialize_copy
2880
2910
 
@@ -3111,17 +3141,17 @@ module Polars
3111
3141
  sort_columns: false,
3112
3142
  separator: "_"
3113
3143
  )
3114
- if values.is_a?(String)
3144
+ if values.is_a?(::String)
3115
3145
  values = [values]
3116
3146
  end
3117
- if index.is_a?(String)
3147
+ if index.is_a?(::String)
3118
3148
  index = [index]
3119
3149
  end
3120
- if columns.is_a?(String)
3150
+ if columns.is_a?(::String)
3121
3151
  columns = [columns]
3122
3152
  end
3123
3153
 
3124
- if aggregate_fn.is_a?(String)
3154
+ if aggregate_fn.is_a?(::String)
3125
3155
  case aggregate_fn
3126
3156
  when "first"
3127
3157
  aggregate_expr = Polars.element.first._rbexpr
@@ -3137,8 +3167,11 @@ module Polars
3137
3167
  aggregate_expr = Polars.element.median._rbexpr
3138
3168
  when "last"
3139
3169
  aggregate_expr = Polars.element.last._rbexpr
3170
+ when "len"
3171
+ aggregate_expr = Polars.len._rbexpr
3140
3172
  when "count"
3141
- aggregate_expr = Polars.count._rbexpr
3173
+ warn "`aggregate_function: \"count\"` input for `pivot` is deprecated. Use `aggregate_function: \"len\"` instead."
3174
+ aggregate_expr = Polars.len._rbexpr
3142
3175
  else
3143
3176
  raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
3144
3177
  end
@@ -3150,9 +3183,9 @@ module Polars
3150
3183
 
3151
3184
  _from_rbdf(
3152
3185
  _df.pivot_expr(
3153
- values,
3154
3186
  index,
3155
3187
  columns,
3188
+ values,
3156
3189
  maintain_order,
3157
3190
  sort_columns,
3158
3191
  aggregate_expr,
@@ -3206,10 +3239,10 @@ module Polars
3206
3239
  # # │ z ┆ c ┆ 6 │
3207
3240
  # # └─────┴──────────┴───────┘
3208
3241
  def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
3209
- if value_vars.is_a?(String)
3242
+ if value_vars.is_a?(::String)
3210
3243
  value_vars = [value_vars]
3211
3244
  end
3212
- if id_vars.is_a?(String)
3245
+ if id_vars.is_a?(::String)
3213
3246
  id_vars = [id_vars]
3214
3247
  end
3215
3248
  if value_vars.nil?
@@ -3423,7 +3456,7 @@ module Polars
3423
3456
  # # │ C ┆ 2 ┆ l │
3424
3457
  # # └─────┴─────┴─────┘}
3425
3458
  def partition_by(groups, maintain_order: true, include_key: true, as_dict: false)
3426
- if groups.is_a?(String)
3459
+ if groups.is_a?(::String)
3427
3460
  groups = [groups]
3428
3461
  elsif !groups.is_a?(::Array)
3429
3462
  groups = Array(groups)
@@ -3587,8 +3620,13 @@ module Polars
3587
3620
 
3588
3621
  # Select columns from this DataFrame.
3589
3622
  #
3590
- # @param exprs [Object]
3591
- # Column or columns to select.
3623
+ # @param exprs [Array]
3624
+ # Column(s) to select, specified as positional arguments.
3625
+ # Accepts expression input. Strings are parsed as column names,
3626
+ # other non-expression inputs are parsed as literals.
3627
+ # @param named_exprs [Hash]
3628
+ # Additional columns to select, specified as keyword arguments.
3629
+ # The columns will be renamed to the keyword used.
3592
3630
  #
3593
3631
  # @return [DataFrame]
3594
3632
  #
@@ -3668,23 +3706,25 @@ module Polars
3668
3706
  # # │ 0 │
3669
3707
  # # │ 10 │
3670
3708
  # # └─────────┘
3671
- def select(exprs)
3672
- _from_rbdf(
3673
- lazy
3674
- .select(exprs)
3675
- .collect(no_optimization: true, string_cache: false)
3676
- ._df
3677
- )
3709
+ def select(*exprs, **named_exprs)
3710
+ lazy.select(*exprs, **named_exprs).collect(_eager: true)
3678
3711
  end
3679
3712
 
3680
- # Add or overwrite multiple columns in a DataFrame.
3713
+ # Add columns to this DataFrame.
3714
+ #
3715
+ # Added columns will replace existing columns with the same name.
3681
3716
  #
3682
3717
  # @param exprs [Array]
3683
- # Array of Expressions that evaluate to columns.
3718
+ # Column(s) to add, specified as positional arguments.
3719
+ # Accepts expression input. Strings are parsed as column names, other
3720
+ # non-expression inputs are parsed as literals.
3721
+ # @param named_exprs [Hash]
3722
+ # Additional columns to add, specified as keyword arguments.
3723
+ # The columns will be renamed to the keyword used.
3684
3724
  #
3685
3725
  # @return [DataFrame]
3686
3726
  #
3687
- # @example
3727
+ # @example Pass an expression to add it as a new column.
3688
3728
  # df = Polars::DataFrame.new(
3689
3729
  # {
3690
3730
  # "a" => [1, 2, 3, 4],
@@ -3692,11 +3732,41 @@ module Polars
3692
3732
  # "c" => [true, true, false, true]
3693
3733
  # }
3694
3734
  # )
3735
+ # df.with_columns((Polars.col("a") ** 2).alias("a^2"))
3736
+ # # =>
3737
+ # # shape: (4, 4)
3738
+ # # ┌─────┬──────┬───────┬──────┐
3739
+ # # │ a ┆ b ┆ c ┆ a^2 │
3740
+ # # │ --- ┆ --- ┆ --- ┆ --- │
3741
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 │
3742
+ # # ╞═════╪══════╪═══════╪══════╡
3743
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 │
3744
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 │
3745
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 │
3746
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 │
3747
+ # # └─────┴──────┴───────┴──────┘
3748
+ #
3749
+ # @example Added columns will replace existing columns with the same name.
3750
+ # df.with_columns(Polars.col("a").cast(Polars::Float64))
3751
+ # # =>
3752
+ # # shape: (4, 3)
3753
+ # # ┌─────┬──────┬───────┐
3754
+ # # │ a ┆ b ┆ c │
3755
+ # # │ --- ┆ --- ┆ --- │
3756
+ # # │ f64 ┆ f64 ┆ bool │
3757
+ # # ╞═════╪══════╪═══════╡
3758
+ # # │ 1.0 ┆ 0.5 ┆ true │
3759
+ # # │ 2.0 ┆ 4.0 ┆ true │
3760
+ # # │ 3.0 ┆ 10.0 ┆ false │
3761
+ # # │ 4.0 ┆ 13.0 ┆ true │
3762
+ # # └─────┴──────┴───────┘
3763
+ #
3764
+ # @example Multiple columns can be added by passing a list of expressions.
3695
3765
  # df.with_columns(
3696
3766
  # [
3697
3767
  # (Polars.col("a") ** 2).alias("a^2"),
3698
3768
  # (Polars.col("b") / 2).alias("b/2"),
3699
- # (Polars.col("c").is_not).alias("not c")
3769
+ # (Polars.col("c").not_).alias("not c"),
3700
3770
  # ]
3701
3771
  # )
3702
3772
  # # =>
@@ -3711,13 +3781,45 @@ module Polars
3711
3781
  # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
3712
3782
  # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
3713
3783
  # # └─────┴──────┴───────┴──────┴──────┴───────┘
3714
- def with_columns(exprs)
3715
- if !exprs.nil? && !exprs.is_a?(::Array)
3716
- exprs = [exprs]
3717
- end
3718
- lazy
3719
- .with_columns(exprs)
3720
- .collect(no_optimization: true, string_cache: false)
3784
+ #
3785
+ # @example Multiple columns can also be added using positional arguments instead of a list.
3786
+ # df.with_columns(
3787
+ # (Polars.col("a") ** 2).alias("a^2"),
3788
+ # (Polars.col("b") / 2).alias("b/2"),
3789
+ # (Polars.col("c").not_).alias("not c"),
3790
+ # )
3791
+ # # =>
3792
+ # # shape: (4, 6)
3793
+ # # ┌─────┬──────┬───────┬──────┬──────┬───────┐
3794
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
3795
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3796
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
3797
+ # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
3798
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
3799
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
3800
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
3801
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
3802
+ # # └─────┴──────┴───────┴──────┴──────┴───────┘
3803
+ #
3804
+ # @example Use keyword arguments to easily name your expression inputs.
3805
+ # df.with_columns(
3806
+ # ab: Polars.col("a") * Polars.col("b"),
3807
+ # not_c: Polars.col("c").not_
3808
+ # )
3809
+ # # =>
3810
+ # # shape: (4, 5)
3811
+ # # ┌─────┬──────┬───────┬──────┬───────┐
3812
+ # # │ a ┆ b ┆ c ┆ ab ┆ not_c │
3813
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3814
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │
3815
+ # # ╞═════╪══════╪═══════╪══════╪═══════╡
3816
+ # # │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │
3817
+ # # │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │
3818
+ # # │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │
3819
+ # # │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │
3820
+ # # └─────┴──────┴───────┴──────┴───────┘
3821
+ def with_columns(*exprs, **named_exprs)
3822
+ lazy.with_columns(*exprs, **named_exprs).collect(_eager: true)
3721
3823
  end
3722
3824
 
3723
3825
  # Get number of chunks used by the ChunkedArrays of this DataFrame.
@@ -3774,7 +3876,7 @@ module Polars
3774
3876
  # # └─────┴─────┴─────┘
3775
3877
  def max(axis: 0)
3776
3878
  if axis == 0
3777
- _from_rbdf(_df.max)
3879
+ lazy.max.collect(_eager: true)
3778
3880
  elsif axis == 1
3779
3881
  Utils.wrap_s(_df.max_horizontal)
3780
3882
  else
@@ -3806,7 +3908,7 @@ module Polars
3806
3908
  # # └─────┴─────┴─────┘
3807
3909
  def min(axis: 0)
3808
3910
  if axis == 0
3809
- _from_rbdf(_df.min)
3911
+ lazy.min.collect(_eager: true)
3810
3912
  elsif axis == 1
3811
3913
  Utils.wrap_s(_df.min_horizontal)
3812
3914
  else
@@ -3855,7 +3957,7 @@ module Polars
3855
3957
  def sum(axis: 0, null_strategy: "ignore")
3856
3958
  case axis
3857
3959
  when 0
3858
- _from_rbdf(_df.sum)
3960
+ lazy.sum.collect(_eager: true)
3859
3961
  when 1
3860
3962
  Utils.wrap_s(_df.sum_horizontal(null_strategy))
3861
3963
  else
@@ -3893,7 +3995,7 @@ module Polars
3893
3995
  def mean(axis: 0, null_strategy: "ignore")
3894
3996
  case axis
3895
3997
  when 0
3896
- _from_rbdf(_df.mean)
3998
+ lazy.mean.collect(_eager: true)
3897
3999
  when 1
3898
4000
  Utils.wrap_s(_df.mean_horizontal(null_strategy))
3899
4001
  else
@@ -3939,7 +4041,7 @@ module Polars
3939
4041
  # # │ 0.816497 ┆ 0.816497 ┆ null │
3940
4042
  # # └──────────┴──────────┴──────┘
3941
4043
  def std(ddof: 1)
3942
- _from_rbdf(_df.std(ddof))
4044
+ lazy.std(ddof: ddof).collect(_eager: true)
3943
4045
  end
3944
4046
 
3945
4047
  # Aggregate the columns of this DataFrame to their variance value.
@@ -3980,7 +4082,7 @@ module Polars
3980
4082
  # # │ 0.666667 ┆ 0.666667 ┆ null │
3981
4083
  # # └──────────┴──────────┴──────┘
3982
4084
  def var(ddof: 1)
3983
- _from_rbdf(_df.var(ddof))
4085
+ lazy.var(ddof: ddof).collect(_eager: true)
3984
4086
  end
3985
4087
 
3986
4088
  # Aggregate the columns of this DataFrame to their median value.
@@ -4006,7 +4108,7 @@ module Polars
4006
4108
  # # │ 2.0 ┆ 7.0 ┆ null │
4007
4109
  # # └─────┴─────┴──────┘
4008
4110
  def median
4009
- _from_rbdf(_df.median)
4111
+ lazy.median.collect(_eager: true)
4010
4112
  end
4011
4113
 
4012
4114
  # Aggregate the columns of this DataFrame to their product values.
@@ -4063,7 +4165,7 @@ module Polars
4063
4165
  # # │ 2.0 ┆ 7.0 ┆ null │
4064
4166
  # # └─────┴─────┴──────┘
4065
4167
  def quantile(quantile, interpolation: "nearest")
4066
- _from_rbdf(_df.quantile(quantile, interpolation))
4168
+ lazy.quantile(quantile, interpolation: interpolation).collect(_eager: true)
4067
4169
  end
4068
4170
 
4069
4171
  # Get one hot encoded dummy variables.
@@ -4094,7 +4196,7 @@ module Polars
4094
4196
  # # │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │
4095
4197
  # # └───────┴───────┴───────┴───────┴───────┴───────┘
4096
4198
  def to_dummies(columns: nil, separator: "_", drop_first: false)
4097
- if columns.is_a?(String)
4199
+ if columns.is_a?(::String)
4098
4200
  columns = [columns]
4099
4201
  end
4100
4202
  _from_rbdf(_df.to_dummies(columns, separator, drop_first))
@@ -4359,7 +4461,7 @@ module Polars
4359
4461
  # # null
4360
4462
  # # ]
4361
4463
  #
4362
- # @example A horizontal boolean or, similar to a row-wise .any():
4464
+ # @example A horizontal boolean or, similar to a row-wise .any:
4363
4465
  # df = Polars::DataFrame.new(
4364
4466
  # {
4365
4467
  # "a" => [false, false, true],
@@ -4482,7 +4584,7 @@ module Polars
4482
4584
  # # => [{"a"=>1, "b"=>2}, {"a"=>3, "b"=>4}, {"a"=>5, "b"=>6}]
4483
4585
  def rows(named: false)
4484
4586
  if named
4485
- columns = columns()
4587
+ columns = self.columns
4486
4588
  _df.row_tuples.map do |v|
4487
4589
  columns.zip(v).to_h
4488
4590
  end
@@ -4523,7 +4625,7 @@ module Polars
4523
4625
  return to_enum(:iter_rows, named: named, buffer_size: buffer_size) unless block_given?
4524
4626
 
4525
4627
  # load into the local namespace for a modest performance boost in the hot loops
4526
- columns = columns()
4628
+ columns = self.columns
4527
4629
 
4528
4630
  # note: buffering rows results in a 2-4x speedup over individual calls
4529
4631
  # to ".row(i)", so it should only be disabled in extremely specific cases.
@@ -4603,8 +4705,8 @@ module Polars
4603
4705
  # # │ 1 ┆ 5 │
4604
4706
  # # │ 3 ┆ 7 │
4605
4707
  # # └─────┴─────┘
4606
- def gather_every(n)
4607
- select(Utils.col("*").gather_every(n))
4708
+ def gather_every(n, offset = 0)
4709
+ select(Utils.col("*").gather_every(n, offset))
4608
4710
  end
4609
4711
  alias_method :take_every, :gather_every
4610
4712
 
@@ -4754,19 +4856,57 @@ module Polars
4754
4856
  # # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
4755
4857
  # # └────────┴─────┴─────┴──────┴───────────┴───────┘
4756
4858
  def unnest(names)
4757
- if names.is_a?(String)
4859
+ if names.is_a?(::String)
4758
4860
  names = [names]
4759
4861
  end
4760
4862
  _from_rbdf(_df.unnest(names))
4761
4863
  end
4762
4864
 
4763
- # TODO
4865
+ # Requires NumPy
4764
4866
  # def corr
4765
4867
  # end
4766
4868
 
4767
- # TODO
4768
- # def merge_sorted
4769
- # end
4869
+ # Take two sorted DataFrames and merge them by the sorted key.
4870
+ #
4871
+ # The output of this operation will also be sorted.
4872
+ # It is the caller's responsibility that the frames are sorted
4873
+ # by that key otherwise the output will not make sense.
4874
+ #
4875
+ # The schemas of both DataFrames must be equal.
4876
+ #
4877
+ # @param other [DataFrame]
4878
+ # Other DataFrame that must be merged
4879
+ # @param key [String]
4880
+ # Key that is sorted.
4881
+ #
4882
+ # @return [DataFrame]
4883
+ #
4884
+ # @example
4885
+ # df0 = Polars::DataFrame.new(
4886
+ # {"name" => ["steve", "elise", "bob"], "age" => [42, 44, 18]}
4887
+ # ).sort("age")
4888
+ # df1 = Polars::DataFrame.new(
4889
+ # {"name" => ["anna", "megan", "steve", "thomas"], "age" => [21, 33, 42, 20]}
4890
+ # ).sort("age")
4891
+ # df0.merge_sorted(df1, "age")
4892
+ # # =>
4893
+ # # shape: (7, 2)
4894
+ # # ┌────────┬─────┐
4895
+ # # │ name ┆ age │
4896
+ # # │ --- ┆ --- │
4897
+ # # │ str ┆ i64 │
4898
+ # # ╞════════╪═════╡
4899
+ # # │ bob ┆ 18 │
4900
+ # # │ thomas ┆ 20 │
4901
+ # # │ anna ┆ 21 │
4902
+ # # │ megan ┆ 33 │
4903
+ # # │ steve ┆ 42 │
4904
+ # # │ steve ┆ 42 │
4905
+ # # │ elise ┆ 44 │
4906
+ # # └────────┴─────┘
4907
+ def merge_sorted(other, key)
4908
+ lazy.merge_sorted(other.lazy, key).collect(_eager: true)
4909
+ end
4770
4910
 
4771
4911
  # Indicate that one or multiple columns are sorted.
4772
4912
  #
@@ -4808,7 +4948,7 @@ module Polars
4808
4948
  end
4809
4949
 
4810
4950
  def _pos_idxs(idxs, dim)
4811
- idx_type = Polars._get_idx_type
4951
+ idx_type = Plr.get_index_type
4812
4952
 
4813
4953
  if idxs.is_a?(Series)
4814
4954
  if idxs.dtype == idx_type
@@ -4867,10 +5007,10 @@ module Polars
4867
5007
  if val.is_a?(Hash) && dtype != Struct
4868
5008
  updated_data[name] = DataFrame.new(val).to_struct(name)
4869
5009
  elsif !Utils.arrlen(val).nil?
4870
- updated_data[name] = Series.new(String.new(name), val, dtype: dtype)
4871
- elsif val.nil? || [Integer, Float, TrueClass, FalseClass, String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
5010
+ updated_data[name] = Series.new(::String.new(name), val, dtype: dtype)
5011
+ elsif val.nil? || [Integer, Float, TrueClass, FalseClass, ::String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
4872
5012
  dtype = Polars::Float64 if val.nil? && dtype.nil?
4873
- updated_data[name] = Series.new(String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
5013
+ updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype).extend_constant(val, array_len - 1)
4874
5014
  else
4875
5015
  raise Todo
4876
5016
  end
@@ -4927,7 +5067,7 @@ module Polars
4927
5067
  end
4928
5068
  column_names =
4929
5069
  (schema || []).map.with_index do |col, i|
4930
- if col.is_a?(String)
5070
+ if col.is_a?(::String)
4931
5071
  col || "column_#{i}"
4932
5072
  else
4933
5073
  col[0]
@@ -4940,7 +5080,7 @@ module Polars
4940
5080
  lookup = column_names.zip(lookup_names || []).to_h
4941
5081
 
4942
5082
  column_dtypes =
4943
- (schema || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
5083
+ (schema || []).select { |col| !col.is_a?(::String) && col[1] }.to_h do |col|
4944
5084
  [lookup[col[0]] || col[0], col[1]]
4945
5085
  end
4946
5086
 
@@ -5041,14 +5181,14 @@ module Polars
5041
5181
  elsif data[0].is_a?(Hash)
5042
5182
  column_names, dtypes = _unpack_schema(columns)
5043
5183
  schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
5044
- rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema_overrides)
5184
+ rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
5045
5185
  if column_names
5046
5186
  rbdf = _post_apply_columns(rbdf, column_names)
5047
5187
  end
5048
5188
  return rbdf
5049
5189
  elsif data[0].is_a?(::Array)
5190
+ first_element = data[0]
5050
5191
  if orient.nil? && !columns.nil?
5051
- first_element = data[0]
5052
5192
  row_types = first_element.filter_map { |value| value.class }.uniq
5053
5193
  if row_types.include?(Integer) && row_types.include?(Float)
5054
5194
  row_types.delete(Integer)