polars-df 0.10.0-aarch64-linux → 0.12.0-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58):
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +27 -0
  3. data/Cargo.lock +392 -351
  4. data/LICENSE-THIRD-PARTY.txt +1125 -865
  5. data/README.md +6 -6
  6. data/lib/polars/3.1/polars.so +0 -0
  7. data/lib/polars/3.2/polars.so +0 -0
  8. data/lib/polars/3.3/polars.so +0 -0
  9. data/lib/polars/array_expr.rb +4 -4
  10. data/lib/polars/batched_csv_reader.rb +11 -5
  11. data/lib/polars/cat_expr.rb +0 -36
  12. data/lib/polars/cat_name_space.rb +0 -37
  13. data/lib/polars/convert.rb +6 -1
  14. data/lib/polars/data_frame.rb +176 -403
  15. data/lib/polars/data_types.rb +1 -1
  16. data/lib/polars/date_time_expr.rb +525 -572
  17. data/lib/polars/date_time_name_space.rb +263 -460
  18. data/lib/polars/dynamic_group_by.rb +5 -5
  19. data/lib/polars/exceptions.rb +7 -0
  20. data/lib/polars/expr.rb +1394 -243
  21. data/lib/polars/expr_dispatch.rb +1 -1
  22. data/lib/polars/functions/aggregation/horizontal.rb +8 -8
  23. data/lib/polars/functions/as_datatype.rb +63 -40
  24. data/lib/polars/functions/lazy.rb +63 -14
  25. data/lib/polars/functions/lit.rb +1 -1
  26. data/lib/polars/functions/range/date_range.rb +90 -57
  27. data/lib/polars/functions/range/datetime_range.rb +149 -0
  28. data/lib/polars/functions/range/int_range.rb +2 -2
  29. data/lib/polars/functions/range/time_range.rb +141 -0
  30. data/lib/polars/functions/repeat.rb +1 -1
  31. data/lib/polars/functions/whenthen.rb +1 -1
  32. data/lib/polars/group_by.rb +88 -23
  33. data/lib/polars/io/avro.rb +24 -0
  34. data/lib/polars/{io.rb → io/csv.rb} +299 -493
  35. data/lib/polars/io/database.rb +73 -0
  36. data/lib/polars/io/ipc.rb +247 -0
  37. data/lib/polars/io/json.rb +29 -0
  38. data/lib/polars/io/ndjson.rb +80 -0
  39. data/lib/polars/io/parquet.rb +227 -0
  40. data/lib/polars/lazy_frame.rb +143 -272
  41. data/lib/polars/lazy_group_by.rb +100 -3
  42. data/lib/polars/list_expr.rb +11 -11
  43. data/lib/polars/list_name_space.rb +5 -1
  44. data/lib/polars/rolling_group_by.rb +7 -9
  45. data/lib/polars/series.rb +103 -187
  46. data/lib/polars/string_expr.rb +78 -102
  47. data/lib/polars/string_name_space.rb +5 -4
  48. data/lib/polars/testing.rb +2 -2
  49. data/lib/polars/utils/constants.rb +9 -0
  50. data/lib/polars/utils/convert.rb +97 -0
  51. data/lib/polars/utils/parse.rb +89 -0
  52. data/lib/polars/utils/various.rb +76 -0
  53. data/lib/polars/utils/wrap.rb +19 -0
  54. data/lib/polars/utils.rb +8 -300
  55. data/lib/polars/version.rb +1 -1
  56. data/lib/polars/whenthen.rb +6 -6
  57. data/lib/polars.rb +20 -1
  58. metadata +17 -4
@@ -27,149 +27,6 @@ module Polars
27
27
  ldf
28
28
  end
29
29
 
30
- # @private
31
- def self._scan_csv(
32
- file,
33
- has_header: true,
34
- sep: ",",
35
- comment_char: nil,
36
- quote_char: '"',
37
- skip_rows: 0,
38
- dtypes: nil,
39
- null_values: nil,
40
- ignore_errors: false,
41
- cache: true,
42
- with_column_names: nil,
43
- infer_schema_length: 100,
44
- n_rows: nil,
45
- encoding: "utf8",
46
- low_memory: false,
47
- rechunk: true,
48
- skip_rows_after_header: 0,
49
- row_count_name: nil,
50
- row_count_offset: 0,
51
- parse_dates: false,
52
- eol_char: "\n",
53
- truncate_ragged_lines: true
54
- )
55
- dtype_list = nil
56
- if !dtypes.nil?
57
- dtype_list = []
58
- dtypes.each do |k, v|
59
- dtype_list << [k, Utils.rb_type_to_dtype(v)]
60
- end
61
- end
62
- processed_null_values = Utils._process_null_values(null_values)
63
-
64
- _from_rbldf(
65
- RbLazyFrame.new_from_csv(
66
- file,
67
- sep,
68
- has_header,
69
- ignore_errors,
70
- skip_rows,
71
- n_rows,
72
- cache,
73
- dtype_list,
74
- low_memory,
75
- comment_char,
76
- quote_char,
77
- processed_null_values,
78
- infer_schema_length,
79
- with_column_names,
80
- rechunk,
81
- skip_rows_after_header,
82
- encoding,
83
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
84
- parse_dates,
85
- eol_char,
86
- truncate_ragged_lines
87
- )
88
- )
89
- end
90
-
91
- # @private
92
- def self._scan_parquet(
93
- file,
94
- n_rows: nil,
95
- cache: true,
96
- parallel: "auto",
97
- rechunk: true,
98
- row_count_name: nil,
99
- row_count_offset: 0,
100
- storage_options: nil,
101
- low_memory: false,
102
- use_statistics: true,
103
- hive_partitioning: true
104
- )
105
- _from_rbldf(
106
- RbLazyFrame.new_from_parquet(
107
- file,
108
- [],
109
- n_rows,
110
- cache,
111
- parallel,
112
- rechunk,
113
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
114
- low_memory,
115
- use_statistics,
116
- hive_partitioning,
117
- nil
118
- )
119
- )
120
- end
121
-
122
- # @private
123
- def self._scan_ipc(
124
- file,
125
- n_rows: nil,
126
- cache: true,
127
- rechunk: true,
128
- row_count_name: nil,
129
- row_count_offset: 0,
130
- storage_options: nil,
131
- memory_map: true
132
- )
133
- if Utils.pathlike?(file)
134
- file = Utils.normalise_filepath(file)
135
- end
136
-
137
- _from_rbldf(
138
- RbLazyFrame.new_from_ipc(
139
- file,
140
- n_rows,
141
- cache,
142
- rechunk,
143
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
144
- memory_map
145
- )
146
- )
147
- end
148
-
149
- # @private
150
- def self._scan_ndjson(
151
- file,
152
- infer_schema_length: nil,
153
- batch_size: nil,
154
- n_rows: nil,
155
- low_memory: false,
156
- rechunk: true,
157
- row_count_name: nil,
158
- row_count_offset: 0
159
- )
160
- _from_rbldf(
161
- RbLazyFrame.new_from_ndjson(
162
- file,
163
- infer_schema_length,
164
- batch_size,
165
- n_rows,
166
- low_memory,
167
- rechunk,
168
- Utils._prepare_row_count_args(row_count_name, row_count_offset)
169
- )
170
- )
171
- end
172
-
173
30
  # def self.from_json
174
31
  # end
175
32
 
@@ -181,7 +38,7 @@ module Polars
181
38
  # @return [LazyFrame]
182
39
  def self.read_json(file)
183
40
  if Utils.pathlike?(file)
184
- file = Utils.normalise_filepath(file)
41
+ file = Utils.normalize_filepath(file)
185
42
  end
186
43
 
187
44
  Utils.wrap_ldf(RbLazyFrame.read_json(file))
@@ -206,7 +63,7 @@ module Polars
206
63
  # df.columns
207
64
  # # => ["foo", "bar"]
208
65
  def columns
209
- _ldf.columns
66
+ _ldf.collect_schema.keys
210
67
  end
211
68
 
212
69
  # Get dtypes of columns in LazyFrame.
@@ -224,7 +81,7 @@ module Polars
224
81
  # lf.dtypes
225
82
  # # => [Polars::Int64, Polars::Float64, Polars::String]
226
83
  def dtypes
227
- _ldf.dtypes
84
+ _ldf.collect_schema.values
228
85
  end
229
86
 
230
87
  # Get the schema.
@@ -242,7 +99,7 @@ module Polars
242
99
  # lf.schema
243
100
  # # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String}
244
101
  def schema
245
- _ldf.schema
102
+ _ldf.collect_schema
246
103
  end
247
104
 
248
105
  # Get the width of the LazyFrame.
@@ -254,7 +111,7 @@ module Polars
254
111
  # lf.width
255
112
  # # => 2
256
113
  def width
257
- _ldf.width
114
+ _ldf.collect_schema.length
258
115
  end
259
116
 
260
117
  # Check if LazyFrame includes key.
@@ -288,7 +145,7 @@ module Polars
288
145
  # @return [nil]
289
146
  def write_json(file)
290
147
  if Utils.pathlike?(file)
291
- file = Utils.normalise_filepath(file)
148
+ file = Utils.normalize_filepath(file)
292
149
  end
293
150
  _ldf.write_json(file)
294
151
  nil
@@ -404,16 +261,23 @@ module Polars
404
261
  # # │ 2 ┆ 7.0 ┆ b │
405
262
  # # │ 1 ┆ 6.0 ┆ a │
406
263
  # # └─────┴─────┴─────┘
407
- def sort(by, reverse: false, nulls_last: false, maintain_order: false, multithreaded: true)
408
- if by.is_a?(::String)
409
- return _from_rbldf(_ldf.sort(by, reverse, nulls_last, maintain_order, multithreaded))
410
- end
411
- if Utils.bool?(reverse)
412
- reverse = [reverse]
264
+ def sort(by, *more_by, reverse: false, nulls_last: false, maintain_order: false, multithreaded: true)
265
+ if by.is_a?(::String) && more_by.empty?
266
+ return _from_rbldf(
267
+ _ldf.sort(
268
+ by, reverse, nulls_last, maintain_order, multithreaded
269
+ )
270
+ )
413
271
  end
414
272
 
415
- by = Utils.selection_to_rbexpr_list(by)
416
- _from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last, maintain_order, multithreaded))
273
+ by = Utils.parse_into_list_of_expressions(by, *more_by)
274
+ reverse = Utils.extend_bool(reverse, by.length, "reverse", "by")
275
+ nulls_last = Utils.extend_bool(nulls_last, by.length, "nulls_last", "by")
276
+ _from_rbldf(
277
+ _ldf.sort_by_exprs(
278
+ by, reverse, nulls_last, maintain_order, multithreaded
279
+ )
280
+ )
417
281
  end
418
282
 
419
283
  # def profile
@@ -558,7 +422,7 @@ module Polars
558
422
  path,
559
423
  compression: "zstd",
560
424
  compression_level: nil,
561
- statistics: false,
425
+ statistics: true,
562
426
  row_group_size: nil,
563
427
  data_pagesize_limit: nil,
564
428
  maintain_order: true,
@@ -578,6 +442,24 @@ module Polars
578
442
  no_optimization: no_optimization
579
443
  )
580
444
 
445
+ if statistics == true
446
+ statistics = {
447
+ min: true,
448
+ max: true,
449
+ distinct_count: false,
450
+ null_count: true
451
+ }
452
+ elsif statistics == false
453
+ statistics = {}
454
+ elsif statistics == "full"
455
+ statistics = {
456
+ min: true,
457
+ max: true,
458
+ distinct_count: true,
459
+ null_count: true
460
+ }
461
+ end
462
+
581
463
  lf.sink_parquet(
582
464
  path,
583
465
  compression,
@@ -732,6 +614,7 @@ module Polars
732
614
  datetime_format: nil,
733
615
  date_format: nil,
734
616
  time_format: nil,
617
+ float_scientific: nil,
735
618
  float_precision: nil,
736
619
  null_value: nil,
737
620
  quote_style: nil,
@@ -766,6 +649,7 @@ module Polars
766
649
  datetime_format,
767
650
  date_format,
768
651
  time_format,
652
+ float_scientific,
769
653
  float_precision,
770
654
  null_value,
771
655
  quote_style,
@@ -1050,7 +934,7 @@ module Polars
1050
934
  def filter(predicate)
1051
935
  _from_rbldf(
1052
936
  _ldf.filter(
1053
- Utils.expr_to_lit_or_expr(predicate, str_to_lit: false)._rbexpr
937
+ Utils.parse_into_expression(predicate, str_as_lit: false)
1054
938
  )
1055
939
  )
1056
940
  end
@@ -1137,7 +1021,7 @@ module Polars
1137
1021
  # # ┌─────────┐
1138
1022
  # # │ literal │
1139
1023
  # # │ --- │
1140
- # # │ i64
1024
+ # # │ i32
1141
1025
  # # ╞═════════╡
1142
1026
  # # │ 0 │
1143
1027
  # # │ 0 │
@@ -1146,7 +1030,7 @@ module Polars
1146
1030
  def select(*exprs, **named_exprs)
1147
1031
  structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
1148
1032
 
1149
- rbexprs = Utils.parse_as_list_of_expressions(
1033
+ rbexprs = Utils.parse_into_list_of_expressions(
1150
1034
  *exprs, **named_exprs, __structify: structify
1151
1035
  )
1152
1036
  _from_rbldf(_ldf.select(rbexprs))
@@ -1154,12 +1038,14 @@ module Polars
1154
1038
 
1155
1039
  # Start a group by operation.
1156
1040
  #
1157
- # @param by [Object]
1041
+ # @param by [Array]
1158
1042
  # Column(s) to group by.
1159
1043
  # @param maintain_order [Boolean]
1160
1044
  # Make sure that the order of the groups remain consistent. This is more
1161
1045
  # expensive than a default group by.
1162
- #
1046
+ # @param named_by [Hash]
1047
+ # Additional columns to group by, specified as keyword arguments.
1048
+ # The columns will be renamed to the keyword used.
1163
1049
  # @return [LazyGroupBy]
1164
1050
  #
1165
1051
  # @example
@@ -1182,9 +1068,9 @@ module Polars
1182
1068
  # # │ b ┆ 11 │
1183
1069
  # # │ c ┆ 6 │
1184
1070
  # # └─────┴─────┘
1185
- def group_by(by, maintain_order: false)
1186
- rbexprs_by = Utils.selection_to_rbexpr_list(by)
1187
- lgb = _ldf.group_by(rbexprs_by, maintain_order)
1071
+ def group_by(*by, maintain_order: false, **named_by)
1072
+ exprs = Utils.parse_into_list_of_expressions(*by, **named_by)
1073
+ lgb = _ldf.group_by(exprs, maintain_order)
1188
1074
  LazyGroupBy.new(lgb)
1189
1075
  end
1190
1076
  alias_method :groupby, :group_by
@@ -1238,12 +1124,6 @@ module Polars
1238
1124
  # Define whether the temporal window interval is closed or not.
1239
1125
  # @param by [Object]
1240
1126
  # Also group by this column/these columns.
1241
- # @param check_sorted [Boolean]
1242
- # When the `by` argument is given, polars can not check sortedness
1243
- # by the metadata and has to do a full scan on the index column to
1244
- # verify data is sorted. This is expensive. If you are sure the
1245
- # data within the by groups is sorted, you can set this to `false`.
1246
- # Doing so incorrectly will lead to incorrect output
1247
1127
  #
1248
1128
  # @return [LazyFrame]
1249
1129
  #
@@ -1285,21 +1165,20 @@ module Polars
1285
1165
  period:,
1286
1166
  offset: nil,
1287
1167
  closed: "right",
1288
- by: nil,
1289
- check_sorted: true
1168
+ by: nil
1290
1169
  )
1291
- index_column = Utils.parse_as_expression(index_column)
1170
+ index_column = Utils.parse_into_expression(index_column)
1292
1171
  if offset.nil?
1293
- offset = "-#{period}"
1172
+ offset = Utils.negate_duration_string(Utils.parse_as_duration_string(period))
1294
1173
  end
1295
1174
 
1296
- rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
1297
- period = Utils._timedelta_to_pl_duration(period)
1298
- offset = Utils._timedelta_to_pl_duration(offset)
1299
-
1300
- lgb = _ldf.rolling(
1301
- index_column, period, offset, closed, rbexprs_by, check_sorted
1175
+ rbexprs_by = (
1176
+ !by.nil? ? Utils.parse_into_list_of_expressions(by) : []
1302
1177
  )
1178
+ period = Utils.parse_as_duration_string(period)
1179
+ offset = Utils.parse_as_duration_string(offset)
1180
+
1181
+ lgb = _ldf.rolling(index_column, period, offset, closed, rbexprs_by)
1303
1182
  LazyGroupBy.new(lgb)
1304
1183
  end
1305
1184
  alias_method :group_by_rolling, :rolling
@@ -1367,22 +1246,18 @@ module Polars
1367
1246
  # Define whether the temporal window interval is closed or not.
1368
1247
  # @param by [Object]
1369
1248
  # Also group by this column/these columns
1370
- # @param check_sorted [Boolean]
1371
- # When the `by` argument is given, polars can not check sortedness
1372
- # by the metadata and has to do a full scan on the index column to
1373
- # verify data is sorted. This is expensive. If you are sure the
1374
- # data within the by groups is sorted, you can set this to `false`.
1375
- # Doing so incorrectly will lead to incorrect output.
1376
1249
  #
1377
1250
  # @return [DataFrame]
1378
1251
  #
1379
1252
  # @example
1380
1253
  # df = Polars::DataFrame.new(
1381
1254
  # {
1382
- # "time" => Polars.date_range(
1255
+ # "time" => Polars.datetime_range(
1383
1256
  # DateTime.new(2021, 12, 16),
1384
1257
  # DateTime.new(2021, 12, 16, 3),
1385
- # "30m"
1258
+ # "30m",
1259
+ # time_unit: "us",
1260
+ # eager: true
1386
1261
  # ),
1387
1262
  # "n" => 0..6
1388
1263
  # }
@@ -1449,16 +1324,16 @@ module Polars
1449
1324
  # )
1450
1325
  # # =>
1451
1326
  # # shape: (4, 3)
1452
- # # ┌─────────────────────┬────────────┬───────────────────────────────────┐
1453
- # # │ time ┆ time_count ┆ time_agg_list
1454
- # # │ --- ┆ --- ┆ ---
1455
- # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
1456
- # # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
1457
- # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16…
1458
- # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16…
1459
- # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16…
1460
- # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
1461
- # # └─────────────────────┴────────────┴───────────────────────────────────┘
1327
+ # # ┌─────────────────────┬────────────┬─────────────────────────────────┐
1328
+ # # │ time ┆ time_count ┆ time_agg_list
1329
+ # # │ --- ┆ --- ┆ ---
1330
+ # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
1331
+ # # ╞═════════════════════╪════════════╪═════════════════════════════════╡
1332
+ # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-…
1333
+ # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-…
1334
+ # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-…
1335
+ # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
1336
+ # # └─────────────────────┴────────────┴─────────────────────────────────┘
1462
1337
  #
1463
1338
  # @example When closed="both" the time values at the window boundaries belong to 2 groups.
1464
1339
  # df.group_by_dynamic("time", every: "1h", closed: "both").agg(
@@ -1481,10 +1356,12 @@ module Polars
1481
1356
  # @example Dynamic group bys can also be combined with grouping on normal keys.
1482
1357
  # df = Polars::DataFrame.new(
1483
1358
  # {
1484
- # "time" => Polars.date_range(
1359
+ # "time" => Polars.datetime_range(
1485
1360
  # DateTime.new(2021, 12, 16),
1486
1361
  # DateTime.new(2021, 12, 16, 3),
1487
- # "30m"
1362
+ # "30m",
1363
+ # time_unit: "us",
1364
+ # eager: true
1488
1365
  # ),
1489
1366
  # "groups" => ["a", "a", "a", "b", "b", "a", "a"]
1490
1367
  # }
@@ -1548,14 +1425,13 @@ module Polars
1548
1425
  closed: "left",
1549
1426
  label: "left",
1550
1427
  by: nil,
1551
- start_by: "window",
1552
- check_sorted: true
1428
+ start_by: "window"
1553
1429
  )
1554
1430
  if !truncate.nil?
1555
1431
  label = truncate ? "left" : "datapoint"
1556
1432
  end
1557
1433
 
1558
- index_column = Utils.expr_to_lit_or_expr(index_column, str_to_lit: false)
1434
+ index_column = Utils.parse_into_expression(index_column, str_as_lit: false)
1559
1435
  if offset.nil?
1560
1436
  offset = period.nil? ? "-#{every}" : "0ns"
1561
1437
  end
@@ -1564,13 +1440,13 @@ module Polars
1564
1440
  period = every
1565
1441
  end
1566
1442
 
1567
- period = Utils._timedelta_to_pl_duration(period)
1568
- offset = Utils._timedelta_to_pl_duration(offset)
1569
- every = Utils._timedelta_to_pl_duration(every)
1443
+ period = Utils.parse_as_duration_string(period)
1444
+ offset = Utils.parse_as_duration_string(offset)
1445
+ every = Utils.parse_as_duration_string(every)
1570
1446
 
1571
- rbexprs_by = by.nil? ? [] : Utils.selection_to_rbexpr_list(by)
1447
+ rbexprs_by = by.nil? ? [] : Utils.parse_into_list_of_expressions(by)
1572
1448
  lgb = _ldf.group_by_dynamic(
1573
- index_column._rbexpr,
1449
+ index_column,
1574
1450
  every,
1575
1451
  period,
1576
1452
  offset,
@@ -1578,8 +1454,7 @@ module Polars
1578
1454
  include_boundaries,
1579
1455
  closed,
1580
1456
  rbexprs_by,
1581
- start_by,
1582
- check_sorted
1457
+ start_by
1583
1458
  )
1584
1459
  LazyGroupBy.new(lgb)
1585
1460
  end
@@ -1730,7 +1605,7 @@ module Polars
1730
1605
  # @param on Object
1731
1606
  # Join column of both DataFrames. If set, `left_on` and `right_on` should be
1732
1607
  # None.
1733
- # @param how ["inner", "left", "outer", "semi", "anti", "cross"]
1608
+ # @param how ["inner", "left", "full", "semi", "anti", "cross"]
1734
1609
  # Join strategy.
1735
1610
  # @param suffix [String]
1736
1611
  # Suffix to append to columns with a duplicate name.
@@ -1772,7 +1647,7 @@ module Polars
1772
1647
  # # └─────┴─────┴─────┴───────┘
1773
1648
  #
1774
1649
  # @example
1775
- # df.join(other_df, on: "ham", how: "outer").collect
1650
+ # df.join(other_df, on: "ham", how: "full").collect
1776
1651
  # # =>
1777
1652
  # # shape: (4, 5)
1778
1653
  # # ┌──────┬──────┬──────┬───────┬───────────┐
@@ -1839,7 +1714,9 @@ module Polars
1839
1714
  raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
1840
1715
  end
1841
1716
 
1842
- if how == "cross"
1717
+ if how == "outer"
1718
+ how = "full"
1719
+ elsif how == "cross"
1843
1720
  return _from_rbldf(
1844
1721
  _ldf.join(
1845
1722
  other._ldf, [], [], allow_parallel, join_nulls, force_parallel, how, suffix
@@ -1848,12 +1725,12 @@ module Polars
1848
1725
  end
1849
1726
 
1850
1727
  if !on.nil?
1851
- rbexprs = Utils.selection_to_rbexpr_list(on)
1728
+ rbexprs = Utils.parse_into_list_of_expressions(on)
1852
1729
  rbexprs_left = rbexprs
1853
1730
  rbexprs_right = rbexprs
1854
1731
  elsif !left_on.nil? && !right_on.nil?
1855
- rbexprs_left = Utils.selection_to_rbexpr_list(left_on)
1856
- rbexprs_right = Utils.selection_to_rbexpr_list(right_on)
1732
+ rbexprs_left = Utils.parse_into_list_of_expressions(left_on)
1733
+ rbexprs_right = Utils.parse_into_list_of_expressions(right_on)
1857
1734
  else
1858
1735
  raise ArgumentError, "must specify `on` OR `left_on` and `right_on`"
1859
1736
  end
@@ -1908,7 +1785,8 @@ module Polars
1908
1785
  # # └─────┴──────┴───────┴─────┴──────┴───────┘
1909
1786
  def with_columns(*exprs, **named_exprs)
1910
1787
  structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
1911
- rbexprs = Utils.parse_as_list_of_expressions(*exprs, **named_exprs, __structify: structify)
1788
+
1789
+ rbexprs = Utils.parse_into_list_of_expressions(*exprs, **named_exprs, __structify: structify)
1912
1790
 
1913
1791
  _from_rbldf(_ldf.with_columns(rbexprs))
1914
1792
  end
@@ -2069,9 +1947,9 @@ module Polars
2069
1947
  # # └──────┴──────┘
2070
1948
  def shift(n, fill_value: nil)
2071
1949
  if !fill_value.nil?
2072
- fill_value = Utils.parse_as_expression(fill_value, str_as_lit: true)
1950
+ fill_value = Utils.parse_into_expression(fill_value, str_as_lit: true)
2073
1951
  end
2074
- n = Utils.parse_as_expression(n)
1952
+ n = Utils.parse_into_expression(n)
2075
1953
  _from_rbldf(_ldf.shift(n, fill_value))
2076
1954
  end
2077
1955
 
@@ -2236,16 +2114,16 @@ module Polars
2236
2114
  # df.with_row_index.collect
2237
2115
  # # =>
2238
2116
  # # shape: (3, 3)
2239
- # # ┌────────┬─────┬─────┐
2240
- # # │ row_nr ┆ a ┆ b │
2241
- # # │ --- ┆ --- ┆ --- │
2242
- # # │ u32 ┆ i64 ┆ i64 │
2243
- # # ╞════════╪═════╪═════╡
2244
- # # │ 0 ┆ 1 ┆ 2 │
2245
- # # │ 1 ┆ 3 ┆ 4 │
2246
- # # │ 2 ┆ 5 ┆ 6 │
2247
- # # └────────┴─────┴─────┘
2248
- def with_row_index(name: "row_nr", offset: 0)
2117
+ # # ┌───────┬─────┬─────┐
2118
+ # # │ index ┆ a ┆ b │
2119
+ # # │ --- ┆ --- ┆ --- │
2120
+ # # │ u32 ┆ i64 ┆ i64 │
2121
+ # # ╞═══════╪═════╪═════╡
2122
+ # # │ 0 ┆ 1 ┆ 2 │
2123
+ # # │ 1 ┆ 3 ┆ 4 │
2124
+ # # │ 2 ┆ 5 ┆ 6 │
2125
+ # # └───────┴─────┴─────┘
2126
+ def with_row_index(name: "index", offset: 0)
2249
2127
  _from_rbldf(_ldf.with_row_index(name, offset))
2250
2128
  end
2251
2129
  alias_method :with_row_count, :with_row_index
@@ -2268,7 +2146,7 @@ module Polars
2268
2146
  # # │ 3 ┆ 7 │
2269
2147
  # # └─────┴─────┘
2270
2148
  def take_every(n)
2271
- select(Utils.col("*").take_every(n))
2149
+ select(F.col("*").take_every(n))
2272
2150
  end
2273
2151
 
2274
2152
  # Fill null values using the specified value or strategy.
@@ -2311,7 +2189,7 @@ module Polars
2311
2189
  # # └──────┴──────┘
2312
2190
  def fill_nan(fill_value)
2313
2191
  if !fill_value.is_a?(Expr)
2314
- fill_value = Utils.lit(fill_value)
2192
+ fill_value = F.lit(fill_value)
2315
2193
  end
2316
2194
  _from_rbldf(_ldf.fill_nan(fill_value._rbexpr))
2317
2195
  end
@@ -2502,8 +2380,8 @@ module Polars
2502
2380
  # # │ 3.0 ┆ 1.0 │
2503
2381
  # # └─────┴─────┘
2504
2382
  def quantile(quantile, interpolation: "nearest")
2505
- quantile = Utils.expr_to_lit_or_expr(quantile, str_to_lit: false)
2506
- _from_rbldf(_ldf.quantile(quantile._rbexpr, interpolation))
2383
+ quantile = Utils.parse_into_expression(quantile, str_as_lit: false)
2384
+ _from_rbldf(_ldf.quantile(quantile, interpolation))
2507
2385
  end
2508
2386
 
2509
2387
  # Explode lists to long format.
@@ -2535,7 +2413,7 @@ module Polars
2535
2413
  # # │ c ┆ 8 │
2536
2414
  # # └─────────┴─────────┘
2537
2415
  def explode(columns)
2538
- columns = Utils.selection_to_rbexpr_list(columns)
2416
+ columns = Utils.parse_into_list_of_expressions(columns)
2539
2417
  _from_rbldf(_ldf.explode(columns))
2540
2418
  end
2541
2419
 
@@ -2598,35 +2476,35 @@ module Polars
2598
2476
  # Optionally leaves identifiers set.
2599
2477
  #
2600
2478
  # This function is useful to massage a DataFrame into a format where one or more
2601
- # columns are identifier variables (id_vars), while all other columns, considered
2602
- # measured variables (value_vars), are "unpivoted" to the row axis, leaving just
2479
+ # columns are identifier variables (index) while all other columns, considered
2480
+ # measured variables (on), are "unpivoted" to the row axis leaving just
2603
2481
  # two non-identifier columns, 'variable' and 'value'.
2604
2482
  #
2605
- # @param id_vars [Object]
2606
- # Columns to use as identifier variables.
2607
- # @param value_vars [Object]
2608
- # Values to use as identifier variables.
2609
- # If `value_vars` is empty all columns that are not in `id_vars` will be used.
2483
+ # @param on [Object]
2484
+ # Column(s) or selector(s) to use as values variables; if `on`
2485
+ # is empty all columns that are not in `index` will be used.
2486
+ # @param index [Object]
2487
+ # Column(s) or selector(s) to use as identifier variables.
2610
2488
  # @param variable_name [String]
2611
- # Name to give to the `value` column. Defaults to "variable"
2489
+ # Name to give to the `variable` column. Defaults to "variable"
2612
2490
  # @param value_name [String]
2613
2491
  # Name to give to the `value` column. Defaults to "value"
2614
2492
  # @param streamable [Boolean]
2615
2493
  # Allow this node to run in the streaming engine.
2616
- # If this runs in streaming, the output of the melt operation
2494
+ # If this runs in streaming, the output of the unpivot operation
2617
2495
  # will not have a stable ordering.
2618
2496
  #
2619
2497
  # @return [LazyFrame]
2620
2498
  #
2621
2499
  # @example
2622
- # df = Polars::DataFrame.new(
2500
+ # lf = Polars::LazyFrame.new(
2623
2501
  # {
2624
2502
  # "a" => ["x", "y", "z"],
2625
2503
  # "b" => [1, 3, 5],
2626
2504
  # "c" => [2, 4, 6]
2627
2505
  # }
2628
- # ).lazy
2629
- # df.melt(id_vars: "a", value_vars: ["b", "c"]).collect
2506
+ # )
2507
+ # lf.unpivot(Polars::Selectors.numeric, index: "a").collect
2630
2508
  # # =>
2631
2509
  # # shape: (6, 3)
2632
2510
  # # ┌─────┬──────────┬───────┐
@@ -2641,23 +2519,21 @@ module Polars
2641
2519
  # # │ y ┆ c ┆ 4 │
2642
2520
  # # │ z ┆ c ┆ 6 │
2643
2521
  # # └─────┴──────────┴───────┘
2644
- def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil, streamable: true)
2645
- if value_vars.is_a?(::String)
2646
- value_vars = [value_vars]
2647
- end
2648
- if id_vars.is_a?(::String)
2649
- id_vars = [id_vars]
2650
- end
2651
- if value_vars.nil?
2652
- value_vars = []
2653
- end
2654
- if id_vars.nil?
2655
- id_vars = []
2656
- end
2522
+ def unpivot(
2523
+ on,
2524
+ index: nil,
2525
+ variable_name: nil,
2526
+ value_name: nil,
2527
+ streamable: true
2528
+ )
2529
+ on = on.nil? ? [] : Utils._expand_selectors(self, on)
2530
+ index = index.nil? ? [] : Utils._expand_selectors(self, index)
2531
+
2657
2532
  _from_rbldf(
2658
- _ldf.melt(id_vars, value_vars, value_name, variable_name, streamable)
2533
+ _ldf.unpivot(on, index, value_name, variable_name, streamable)
2659
2534
  )
2660
2535
  end
2536
+ alias_method :melt, :unpivot
2661
2537
 
2662
2538
  # def map
2663
2539
  # end
@@ -2688,7 +2564,7 @@ module Polars
2688
2564
  # # │ 10.0 ┆ null ┆ 9.0 │
2689
2565
  # # └──────┴──────┴──────────┘
2690
2566
  def interpolate
2691
- select(Utils.col("*").interpolate)
2567
+ select(F.col("*").interpolate)
2692
2568
  end
2693
2569
 
2694
2570
  # Decompose a struct into its fields.
@@ -2795,24 +2671,19 @@ module Polars
2795
2671
  #
2796
2672
  # @param column [Object]
2797
2673
  # Columns that are sorted
2798
- # @param more_columns [Object]
2799
- # Additional columns that are sorted, specified as positional arguments.
2800
2674
  # @param descending [Boolean]
2801
2675
  # Whether the columns are sorted in descending order.
2802
2676
  #
2803
2677
  # @return [LazyFrame]
2804
2678
  def set_sorted(
2805
2679
  column,
2806
- *more_columns,
2807
2680
  descending: false
2808
2681
  )
2809
- columns = Utils.selection_to_rbexpr_list(column)
2810
- if more_columns.any?
2811
- columns.concat(Utils.selection_to_rbexpr_list(more_columns))
2682
+ if !Utils.strlike?(column)
2683
+ msg = "expected a 'str' for argument 'column' in 'set_sorted'"
2684
+ raise TypeError, msg
2812
2685
  end
2813
- with_columns(
2814
- columns.map { |e| Utils.wrap_expr(e).set_sorted(descending: descending) }
2815
- )
2686
+ with_columns(F.col(column).set_sorted(descending: descending))
2816
2687
  end
2817
2688
 
2818
2689
  # TODO