polars-df 0.10.0-x86_64-linux → 0.12.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +27 -0
  3. data/Cargo.lock +392 -351
  4. data/LICENSE-THIRD-PARTY.txt +1125 -865
  5. data/README.md +6 -6
  6. data/lib/polars/3.1/polars.so +0 -0
  7. data/lib/polars/3.2/polars.so +0 -0
  8. data/lib/polars/3.3/polars.so +0 -0
  9. data/lib/polars/array_expr.rb +4 -4
  10. data/lib/polars/batched_csv_reader.rb +11 -5
  11. data/lib/polars/cat_expr.rb +0 -36
  12. data/lib/polars/cat_name_space.rb +0 -37
  13. data/lib/polars/convert.rb +6 -1
  14. data/lib/polars/data_frame.rb +176 -403
  15. data/lib/polars/data_types.rb +1 -1
  16. data/lib/polars/date_time_expr.rb +525 -572
  17. data/lib/polars/date_time_name_space.rb +263 -460
  18. data/lib/polars/dynamic_group_by.rb +5 -5
  19. data/lib/polars/exceptions.rb +7 -0
  20. data/lib/polars/expr.rb +1394 -243
  21. data/lib/polars/expr_dispatch.rb +1 -1
  22. data/lib/polars/functions/aggregation/horizontal.rb +8 -8
  23. data/lib/polars/functions/as_datatype.rb +63 -40
  24. data/lib/polars/functions/lazy.rb +63 -14
  25. data/lib/polars/functions/lit.rb +1 -1
  26. data/lib/polars/functions/range/date_range.rb +90 -57
  27. data/lib/polars/functions/range/datetime_range.rb +149 -0
  28. data/lib/polars/functions/range/int_range.rb +2 -2
  29. data/lib/polars/functions/range/time_range.rb +141 -0
  30. data/lib/polars/functions/repeat.rb +1 -1
  31. data/lib/polars/functions/whenthen.rb +1 -1
  32. data/lib/polars/group_by.rb +88 -23
  33. data/lib/polars/io/avro.rb +24 -0
  34. data/lib/polars/{io.rb → io/csv.rb} +299 -493
  35. data/lib/polars/io/database.rb +73 -0
  36. data/lib/polars/io/ipc.rb +247 -0
  37. data/lib/polars/io/json.rb +29 -0
  38. data/lib/polars/io/ndjson.rb +80 -0
  39. data/lib/polars/io/parquet.rb +227 -0
  40. data/lib/polars/lazy_frame.rb +143 -272
  41. data/lib/polars/lazy_group_by.rb +100 -3
  42. data/lib/polars/list_expr.rb +11 -11
  43. data/lib/polars/list_name_space.rb +5 -1
  44. data/lib/polars/rolling_group_by.rb +7 -9
  45. data/lib/polars/series.rb +103 -187
  46. data/lib/polars/string_expr.rb +78 -102
  47. data/lib/polars/string_name_space.rb +5 -4
  48. data/lib/polars/testing.rb +2 -2
  49. data/lib/polars/utils/constants.rb +9 -0
  50. data/lib/polars/utils/convert.rb +97 -0
  51. data/lib/polars/utils/parse.rb +89 -0
  52. data/lib/polars/utils/various.rb +76 -0
  53. data/lib/polars/utils/wrap.rb +19 -0
  54. data/lib/polars/utils.rb +8 -300
  55. data/lib/polars/version.rb +1 -1
  56. data/lib/polars/whenthen.rb +6 -6
  57. data/lib/polars.rb +20 -1
  58. metadata +17 -4
@@ -46,271 +46,6 @@ module Polars
46
46
  df
47
47
  end
48
48
 
49
- # @private
50
- def self._from_hashes(data, infer_schema_length: 100, schema: nil, schema_overrides: nil)
51
- rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
52
- _from_rbdf(rbdf)
53
- end
54
-
55
- # @private
56
- def self._from_hash(data, schema: nil, schema_overrides: nil)
57
- _from_rbdf(hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides))
58
- end
59
-
60
- # def self._from_records
61
- # end
62
-
63
- # def self._from_numo
64
- # end
65
-
66
- # no self._from_arrow
67
-
68
- # no self._from_pandas
69
-
70
- # @private
71
- def self._read_csv(
72
- file,
73
- has_header: true,
74
- columns: nil,
75
- sep: str = ",",
76
- comment_char: nil,
77
- quote_char: '"',
78
- skip_rows: 0,
79
- dtypes: nil,
80
- null_values: nil,
81
- ignore_errors: false,
82
- parse_dates: false,
83
- n_threads: nil,
84
- infer_schema_length: 100,
85
- batch_size: 8192,
86
- n_rows: nil,
87
- encoding: "utf8",
88
- low_memory: false,
89
- rechunk: true,
90
- skip_rows_after_header: 0,
91
- row_count_name: nil,
92
- row_count_offset: 0,
93
- sample_size: 1024,
94
- eol_char: "\n",
95
- truncate_ragged_lines: false
96
- )
97
- if Utils.pathlike?(file)
98
- path = Utils.normalise_filepath(file)
99
- else
100
- path = nil
101
- # if defined?(StringIO) && file.is_a?(StringIO)
102
- # file = file.string
103
- # end
104
- end
105
-
106
- dtype_list = nil
107
- dtype_slice = nil
108
- if !dtypes.nil?
109
- if dtypes.is_a?(Hash)
110
- dtype_list = []
111
- dtypes.each do|k, v|
112
- dtype_list << [k, Utils.rb_type_to_dtype(v)]
113
- end
114
- elsif dtypes.is_a?(::Array)
115
- dtype_slice = dtypes
116
- else
117
- raise ArgumentError, "dtype arg should be list or dict"
118
- end
119
- end
120
-
121
- processed_null_values = Utils._process_null_values(null_values)
122
-
123
- if columns.is_a?(::String)
124
- columns = [columns]
125
- end
126
- if file.is_a?(::String) && file.include?("*")
127
- dtypes_dict = nil
128
- if !dtype_list.nil?
129
- dtypes_dict = dtype_list.to_h
130
- end
131
- if !dtype_slice.nil?
132
- raise ArgumentError, "cannot use glob patterns and unnamed dtypes as `dtypes` argument; Use dtypes: Mapping[str, Type[DataType]"
133
- end
134
- scan = Polars.scan_csv(
135
- file,
136
- has_header: has_header,
137
- sep: sep,
138
- comment_char: comment_char,
139
- quote_char: quote_char,
140
- skip_rows: skip_rows,
141
- dtypes: dtypes_dict,
142
- null_values: null_values,
143
- ignore_errors: ignore_errors,
144
- infer_schema_length: infer_schema_length,
145
- n_rows: n_rows,
146
- low_memory: low_memory,
147
- rechunk: rechunk,
148
- skip_rows_after_header: skip_rows_after_header,
149
- row_count_name: row_count_name,
150
- row_count_offset: row_count_offset,
151
- eol_char: eol_char,
152
- truncate_ragged_lines: truncate_ragged_lines
153
- )
154
- if columns.nil?
155
- return _from_rbdf(scan.collect._df)
156
- elsif is_str_sequence(columns, allow_str: false)
157
- return _from_rbdf(scan.select(columns).collect._df)
158
- else
159
- raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: List[str]"
160
- end
161
- end
162
-
163
- projection, columns = Utils.handle_projection_columns(columns)
164
-
165
- _from_rbdf(
166
- RbDataFrame.read_csv(
167
- file,
168
- infer_schema_length,
169
- batch_size,
170
- has_header,
171
- ignore_errors,
172
- n_rows,
173
- skip_rows,
174
- projection,
175
- sep,
176
- rechunk,
177
- columns,
178
- encoding,
179
- n_threads,
180
- path,
181
- dtype_list,
182
- dtype_slice,
183
- low_memory,
184
- comment_char,
185
- quote_char,
186
- processed_null_values,
187
- parse_dates,
188
- skip_rows_after_header,
189
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
190
- sample_size,
191
- eol_char,
192
- truncate_ragged_lines
193
- )
194
- )
195
- end
196
-
197
- # @private
198
- def self._read_parquet(
199
- source,
200
- columns: nil,
201
- n_rows: nil,
202
- parallel: "auto",
203
- row_count_name: nil,
204
- row_count_offset: 0,
205
- low_memory: false,
206
- use_statistics: true,
207
- rechunk: true
208
- )
209
- if Utils.pathlike?(source)
210
- source = Utils.normalise_filepath(source)
211
- end
212
- if columns.is_a?(::String)
213
- columns = [columns]
214
- end
215
-
216
- if source.is_a?(::String) && source.include?("*") && Utils.local_file?(source)
217
- scan =
218
- Polars.scan_parquet(
219
- source,
220
- n_rows: n_rows,
221
- rechunk: true,
222
- parallel: parallel,
223
- row_count_name: row_count_name,
224
- row_count_offset: row_count_offset,
225
- low_memory: low_memory
226
- )
227
-
228
- if columns.nil?
229
- return self._from_rbdf(scan.collect._df)
230
- elsif Utils.is_str_sequence(columns, allow_str: false)
231
- return self._from_rbdf(scan.select(columns).collect._df)
232
- else
233
- raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: Array[String]"
234
- end
235
- end
236
-
237
- projection, columns = Utils.handle_projection_columns(columns)
238
- _from_rbdf(
239
- RbDataFrame.read_parquet(
240
- source,
241
- columns,
242
- projection,
243
- n_rows,
244
- parallel,
245
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
246
- low_memory,
247
- use_statistics,
248
- rechunk
249
- )
250
- )
251
- end
252
-
253
- # @private
254
- def self._read_avro(file, columns: nil, n_rows: nil)
255
- if Utils.pathlike?(file)
256
- file = Utils.normalise_filepath(file)
257
- end
258
- projection, columns = Utils.handle_projection_columns(columns)
259
- _from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
260
- end
261
-
262
- # @private
263
- def self._read_ipc(
264
- file,
265
- columns: nil,
266
- n_rows: nil,
267
- row_count_name: nil,
268
- row_count_offset: 0,
269
- rechunk: true,
270
- memory_map: true
271
- )
272
- if Utils.pathlike?(file)
273
- file = Utils.normalise_filepath(file)
274
- end
275
- if columns.is_a?(::String)
276
- columns = [columns]
277
- end
278
-
279
- if file.is_a?(::String) && file.include?("*")
280
- raise Todo
281
- end
282
-
283
- projection, columns = Utils.handle_projection_columns(columns)
284
- _from_rbdf(
285
- RbDataFrame.read_ipc(
286
- file,
287
- columns,
288
- projection,
289
- n_rows,
290
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
291
- memory_map
292
- )
293
- )
294
- end
295
-
296
- # @private
297
- def self._read_json(file)
298
- if Utils.pathlike?(file)
299
- file = Utils.normalise_filepath(file)
300
- end
301
-
302
- _from_rbdf(RbDataFrame.read_json(file))
303
- end
304
-
305
- # @private
306
- def self._read_ndjson(file)
307
- if Utils.pathlike?(file)
308
- file = Utils.normalise_filepath(file)
309
- end
310
-
311
- _from_rbdf(RbDataFrame.read_ndjson(file))
312
- end
313
-
314
49
  # Get the shape of the DataFrame.
315
50
  #
316
51
  # @return [Array]
@@ -419,6 +154,13 @@ module Polars
419
154
  _df.dtypes
420
155
  end
421
156
 
157
+ # Get flags that are set on the columns of this DataFrame.
158
+ #
159
+ # @return [Hash]
160
+ def flags
161
+ columns.to_h { |name| [name, self[name].flags] }
162
+ end
163
+
422
164
  # Get the schema.
423
165
  #
424
166
  # @return [Hash]
@@ -845,7 +587,7 @@ module Polars
845
587
  row_oriented: false
846
588
  )
847
589
  if Utils.pathlike?(file)
848
- file = Utils.normalise_filepath(file)
590
+ file = Utils.normalize_filepath(file)
849
591
  end
850
592
  to_string_io = !file.nil? && file.is_a?(StringIO)
851
593
  if file.nil? || to_string_io
@@ -880,11 +622,11 @@ module Polars
880
622
  # "bar" => [6, 7, 8]
881
623
  # }
882
624
  # )
883
- # df.write_ndjson()
625
+ # df.write_ndjson
884
626
  # # => "{\"foo\":1,\"bar\":6}\n{\"foo\":2,\"bar\":7}\n{\"foo\":3,\"bar\":8}\n"
885
627
  def write_ndjson(file = nil)
886
628
  if Utils.pathlike?(file)
887
- file = Utils.normalise_filepath(file)
629
+ file = Utils.normalize_filepath(file)
888
630
  end
889
631
  to_string_io = !file.nil? && file.is_a?(StringIO)
890
632
  if file.nil? || to_string_io
@@ -991,7 +733,7 @@ module Polars
991
733
  end
992
734
 
993
735
  if Utils.pathlike?(file)
994
- file = Utils.normalise_filepath(file)
736
+ file = Utils.normalize_filepath(file)
995
737
  end
996
738
 
997
739
  _df.write_csv(
@@ -1029,7 +771,7 @@ module Polars
1029
771
  compression = "uncompressed"
1030
772
  end
1031
773
  if Utils.pathlike?(file)
1032
- file = Utils.normalise_filepath(file)
774
+ file = Utils.normalize_filepath(file)
1033
775
  end
1034
776
 
1035
777
  _df.write_avro(file, compression)
@@ -1050,7 +792,7 @@ module Polars
1050
792
  file.set_encoding(Encoding::BINARY)
1051
793
  end
1052
794
  if Utils.pathlike?(file)
1053
- file = Utils.normalise_filepath(file)
795
+ file = Utils.normalize_filepath(file)
1054
796
  end
1055
797
 
1056
798
  if compression.nil?
@@ -1061,6 +803,47 @@ module Polars
1061
803
  return_bytes ? file.string : nil
1062
804
  end
1063
805
 
806
+ # Write to Arrow IPC record batch stream.
807
+ #
808
+ # See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html.
809
+ #
810
+ # @param file [Object]
811
+ # Path or writable file-like object to which the IPC record batch data will
812
+ # be written. If set to `nil`, the output is returned as a binary String.
813
+ # @param compression ['uncompressed', 'lz4', 'zstd']
814
+ # Compression method. Defaults to "uncompressed".
815
+ #
816
+ # @return [Object]
817
+ #
818
+ # @example
819
+ # df = Polars::DataFrame.new(
820
+ # {
821
+ # "foo" => [1, 2, 3, 4, 5],
822
+ # "bar" => [6, 7, 8, 9, 10],
823
+ # "ham" => ["a", "b", "c", "d", "e"]
824
+ # }
825
+ # )
826
+ # df.write_ipc_stream("new_file.arrow")
827
+ def write_ipc_stream(
828
+ file,
829
+ compression: "uncompressed"
830
+ )
831
+ return_bytes = file.nil?
832
+ if return_bytes
833
+ file = StringIO.new
834
+ file.set_encoding(Encoding::BINARY)
835
+ elsif Utils.pathlike?(file)
836
+ file = Utils.normalize_filepath(file)
837
+ end
838
+
839
+ if compression.nil?
840
+ compression = "uncompressed"
841
+ end
842
+
843
+ _df.write_ipc_stream(file, compression)
844
+ return_bytes ? file.string : nil
845
+ end
846
+
1064
847
  # Write to Apache Parquet file.
1065
848
  #
1066
849
  # @param file [String, Pathname, StringIO]
@@ -1097,7 +880,25 @@ module Polars
1097
880
  compression = "uncompressed"
1098
881
  end
1099
882
  if Utils.pathlike?(file)
1100
- file = Utils.normalise_filepath(file)
883
+ file = Utils.normalize_filepath(file)
884
+ end
885
+
886
+ if statistics == true
887
+ statistics = {
888
+ min: true,
889
+ max: true,
890
+ distinct_count: false,
891
+ null_count: true
892
+ }
893
+ elsif statistics == false
894
+ statistics = {}
895
+ elsif statistics == "full"
896
+ statistics = {
897
+ min: true,
898
+ max: true,
899
+ distinct_count: true,
900
+ null_count: true
901
+ }
1101
902
  end
1102
903
 
1103
904
  _df.write_parquet(
@@ -1773,10 +1574,7 @@ module Polars
1773
1574
  # # │ 3 ┆ 8 ┆ c │
1774
1575
  # # └─────┴─────┴─────┘
1775
1576
  def drop_nulls(subset: nil)
1776
- if subset.is_a?(::String)
1777
- subset = [subset]
1778
- end
1779
- _from_rbdf(_df.drop_nulls(subset))
1577
+ lazy.drop_nulls(subset: subset).collect(_eager: true)
1780
1578
  end
1781
1579
 
1782
1580
  # Offers a structured way to apply a sequence of user-defined functions (UDFs).
@@ -1838,16 +1636,16 @@ module Polars
1838
1636
  # df.with_row_index
1839
1637
  # # =>
1840
1638
  # # shape: (3, 3)
1841
- # # ┌────────┬─────┬─────┐
1842
- # # │ row_nr ┆ a ┆ b │
1843
- # # │ --- ┆ --- ┆ --- │
1844
- # # │ u32 ┆ i64 ┆ i64 │
1845
- # # ╞════════╪═════╪═════╡
1846
- # # │ 0 ┆ 1 ┆ 2 │
1847
- # # │ 1 ┆ 3 ┆ 4 │
1848
- # # │ 2 ┆ 5 ┆ 6 │
1849
- # # └────────┴─────┴─────┘
1850
- def with_row_index(name: "row_nr", offset: 0)
1639
+ # # ┌───────┬─────┬─────┐
1640
+ # # │ index ┆ a ┆ b │
1641
+ # # │ --- ┆ --- ┆ --- │
1642
+ # # │ u32 ┆ i64 ┆ i64 │
1643
+ # # ╞═══════╪═════╪═════╡
1644
+ # # │ 0 ┆ 1 ┆ 2 │
1645
+ # # │ 1 ┆ 3 ┆ 4 │
1646
+ # # │ 2 ┆ 5 ┆ 6 │
1647
+ # # └───────┴─────┴─────┘
1648
+ def with_row_index(name: "index", offset: 0)
1851
1649
  _from_rbdf(_df.with_row_index(name, offset))
1852
1650
  end
1853
1651
  alias_method :with_row_count, :with_row_index
@@ -1944,12 +1742,6 @@ module Polars
1944
1742
  # Define whether the temporal window interval is closed or not.
1945
1743
  # @param by [Object]
1946
1744
  # Also group by this column/these columns.
1947
- # @param check_sorted [Boolean]
1948
- # When the `by` argument is given, polars can not check sortedness
1949
- # by the metadata and has to do a full scan on the index column to
1950
- # verify data is sorted. This is expensive. If you are sure the
1951
- # data within the by groups is sorted, you can set this to `false`.
1952
- # Doing so incorrectly will lead to incorrect output
1953
1745
  #
1954
1746
  # @return [RollingGroupBy]
1955
1747
  #
@@ -1965,7 +1757,7 @@ module Polars
1965
1757
  # df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
1966
1758
  # Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
1967
1759
  # )
1968
- # df.group_by_rolling(index_column: "dt", period: "2d").agg(
1760
+ # df.rolling(index_column: "dt", period: "2d").agg(
1969
1761
  # [
1970
1762
  # Polars.sum("a").alias("sum_a"),
1971
1763
  # Polars.min("a").alias("min_a"),
@@ -1986,17 +1778,17 @@ module Polars
1986
1778
  # # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
1987
1779
  # # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
1988
1780
  # # └─────────────────────┴───────┴───────┴───────┘
1989
- def group_by_rolling(
1781
+ def rolling(
1990
1782
  index_column:,
1991
1783
  period:,
1992
1784
  offset: nil,
1993
1785
  closed: "right",
1994
- by: nil,
1995
- check_sorted: true
1786
+ by: nil
1996
1787
  )
1997
- RollingGroupBy.new(self, index_column, period, offset, closed, by, check_sorted)
1788
+ RollingGroupBy.new(self, index_column, period, offset, closed, by)
1998
1789
  end
1999
- alias_method :groupby_rolling, :group_by_rolling
1790
+ alias_method :groupby_rolling, :rolling
1791
+ alias_method :group_by_rolling, :rolling
2000
1792
 
2001
1793
  # Group based on a time value (or index value of type `:i32`, `:i64`).
2002
1794
  #
@@ -2066,10 +1858,12 @@ module Polars
2066
1858
  # @example
2067
1859
  # df = Polars::DataFrame.new(
2068
1860
  # {
2069
- # "time" => Polars.date_range(
1861
+ # "time" => Polars.datetime_range(
2070
1862
  # DateTime.new(2021, 12, 16),
2071
1863
  # DateTime.new(2021, 12, 16, 3),
2072
- # "30m"
1864
+ # "30m",
1865
+ # time_unit: "us",
1866
+ # eager: true
2073
1867
  # ),
2074
1868
  # "n" => 0..6
2075
1869
  # }
@@ -2136,16 +1930,16 @@ module Polars
2136
1930
  # )
2137
1931
  # # =>
2138
1932
  # # shape: (4, 3)
2139
- # # ┌─────────────────────┬────────────┬───────────────────────────────────┐
2140
- # # │ time ┆ time_count ┆ time_agg_list │
2141
- # # │ --- ┆ --- ┆ --- │
2142
- # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
2143
- # # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
2144
- # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │
2145
- # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │
2146
- # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │
2147
- # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
2148
- # # └─────────────────────┴────────────┴───────────────────────────────────┘
1933
+ # # ┌─────────────────────┬────────────┬─────────────────────────────────┐
1934
+ # # │ time ┆ time_count ┆ time_agg_list │
1935
+ # # │ --- ┆ --- ┆ --- │
1936
+ # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
1937
+ # # ╞═════════════════════╪════════════╪═════════════════════════════════╡
1938
+ # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-… │
1939
+ # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-… │
1940
+ # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-… │
1941
+ # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
1942
+ # # └─────────────────────┴────────────┴─────────────────────────────────┘
2149
1943
  #
2150
1944
  # @example When closed="both" the time values at the window boundaries belong to 2 groups.
2151
1945
  # df.group_by_dynamic("time", every: "1h", closed: "both").agg(
@@ -2168,10 +1962,12 @@ module Polars
2168
1962
  # @example Dynamic group bys can also be combined with grouping on normal keys.
2169
1963
  # df = Polars::DataFrame.new(
2170
1964
  # {
2171
- # "time" => Polars.date_range(
1965
+ # "time" => Polars.datetime_range(
2172
1966
  # DateTime.new(2021, 12, 16),
2173
1967
  # DateTime.new(2021, 12, 16, 3),
2174
- # "30m"
1968
+ # "30m",
1969
+ # time_unit: "us",
1970
+ # eager: true
2175
1971
  # ),
2176
1972
  # "groups" => ["a", "a", "a", "b", "b", "a", "a"]
2177
1973
  # }
@@ -2258,8 +2054,6 @@ module Polars
2258
2054
  # Note that this column has to be sorted for the output to make sense.
2259
2055
  # @param every [String]
2260
2056
  # interval will start 'every' duration
2261
- # @param offset [String]
2262
- # change the start of the date_range by this offset.
2263
2057
  # @param by [Object]
2264
2058
  # First group by these columns and then upsample for every group
2265
2059
  # @param maintain_order [Boolean]
@@ -2319,7 +2113,6 @@ module Polars
2319
2113
  def upsample(
2320
2114
  time_column:,
2321
2115
  every:,
2322
- offset: nil,
2323
2116
  by: nil,
2324
2117
  maintain_order: false
2325
2118
  )
@@ -2329,15 +2122,11 @@ module Polars
2329
2122
  if by.is_a?(::String)
2330
2123
  by = [by]
2331
2124
  end
2332
- if offset.nil?
2333
- offset = "0ns"
2334
- end
2335
2125
 
2336
- every = Utils._timedelta_to_pl_duration(every)
2337
- offset = Utils._timedelta_to_pl_duration(offset)
2126
+ every = Utils.parse_as_duration_string(every)
2338
2127
 
2339
2128
  _from_rbdf(
2340
- _df.upsample(by, time_column, every, offset, maintain_order)
2129
+ _df.upsample(by, time_column, every, maintain_order)
2341
2130
  )
2342
2131
  end
2343
2132
 
@@ -2484,7 +2273,7 @@ module Polars
2484
2273
  # Name(s) of the right join column(s).
2485
2274
  # @param on [Object]
2486
2275
  # Name(s) of the join columns in both DataFrames.
2487
- # @param how ["inner", "left", "outer", "semi", "anti", "cross"]
2276
+ # @param how ["inner", "left", "full", "semi", "anti", "cross"]
2488
2277
  # Join strategy.
2489
2278
  # @param suffix [String]
2490
2279
  # Suffix to append to columns with a duplicate name.
@@ -2520,7 +2309,7 @@ module Polars
2520
2309
  # # └─────┴─────┴─────┴───────┘
2521
2310
  #
2522
2311
  # @example
2523
- # df.join(other_df, on: "ham", how: "outer")
2312
+ # df.join(other_df, on: "ham", how: "full")
2524
2313
  # # =>
2525
2314
  # # shape: (4, 5)
2526
2315
  # # ┌──────┬──────┬──────┬───────┬───────────┐
@@ -2620,7 +2409,7 @@ module Polars
2620
2409
  # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})
2621
2410
  #
2622
2411
  # @example Return a DataFrame by mapping each row to a tuple:
2623
- # df.apply { |t| [t[0] * 2, t[1] * 3] }
2412
+ # df.map_rows { |t| [t[0] * 2, t[1] * 3] }
2624
2413
  # # =>
2625
2414
  # # shape: (3, 2)
2626
2415
  # # ┌──────────┬──────────┐
@@ -2634,7 +2423,7 @@ module Polars
2634
2423
  # # └──────────┴──────────┘
2635
2424
  #
2636
2425
  # @example Return a Series by mapping each row to a scalar:
2637
- # df.apply { |t| t[0] * 2 + t[1] }
2426
+ # df.map_rows { |t| t[0] * 2 + t[1] }
2638
2427
  # # =>
2639
2428
  # # shape: (3, 1)
2640
2429
  # # ┌───────┐
@@ -2646,14 +2435,15 @@ module Polars
2646
2435
  # # │ 9 │
2647
2436
  # # │ 14 │
2648
2437
  # # └───────┘
2649
- def apply(return_dtype: nil, inference_size: 256, &f)
2650
- out, is_df = _df.apply(f, return_dtype, inference_size)
2438
+ def map_rows(return_dtype: nil, inference_size: 256, &f)
2439
+ out, is_df = _df.map_rows(f, return_dtype, inference_size)
2651
2440
  if is_df
2652
2441
  _from_rbdf(out)
2653
2442
  else
2654
2443
  _from_rbdf(Utils.wrap_s(out).to_frame._df)
2655
2444
  end
2656
2445
  end
2446
+ alias_method :apply, :map_rows
2657
2447
 
2658
2448
  # Return a new DataFrame with the column added or replaced.
2659
2449
  #
@@ -3176,9 +2966,9 @@ module Polars
3176
2966
  # arguments contains multiple columns as well
3177
2967
  # @param index [Object]
3178
2968
  # One or multiple keys to group by
3179
- # @param columns [Object]
2969
+ # @param on [Object]
3180
2970
  # Columns whose values will be used as the header of the output DataFrame
3181
- # @param aggregate_fn ["first", "sum", "max", "min", "mean", "median", "last", "count"]
2971
+ # @param aggregate_function ["first", "sum", "max", "min", "mean", "median", "last", "len"]
3182
2972
  # A predefined aggregate function str or an expression. "count" is still accepted but deprecated in favor of "len".
3183
2973
  # @param maintain_order [Object]
3184
2974
  # Sort the grouped keys so that the output order is predictable.
@@ -3190,66 +2980,62 @@ module Polars
3190
2980
  # @example
3191
2981
  # df = Polars::DataFrame.new(
3192
2982
  # {
3193
- # "foo" => ["one", "one", "one", "two", "two", "two"],
3194
- # "bar" => ["A", "B", "C", "A", "B", "C"],
2983
+ # "foo" => ["one", "one", "two", "two", "one", "two"],
2984
+ # "bar" => ["y", "y", "y", "x", "x", "x"],
3195
2985
  # "baz" => [1, 2, 3, 4, 5, 6]
3196
2986
  # }
3197
2987
  # )
3198
- # df.pivot(values: "baz", index: "foo", columns: "bar")
2988
+ # df.pivot("bar", index: "foo", values: "baz", aggregate_function: "sum")
3199
2989
  # # =>
3200
- # # shape: (2, 4)
3201
- # # ┌─────┬─────┬─────┬─────┐
3202
- # # │ foo ┆ A ┆ B ┆ C │
3203
- # # │ --- ┆ --- ┆ --- ┆ --- │
3204
- # # │ str ┆ i64 ┆ i64 ┆ i64 │
3205
- # # ╞═════╪═════╪═════╪═════╡
3206
- # # │ one ┆ 1 ┆ 2 ┆ 3 │
3207
- # # │ two ┆ 4 ┆ 5 ┆ 6 │
3208
- # # └─────┴─────┴─────┴─────┘
2990
+ # # shape: (2, 3)
2991
+ # # ┌─────┬─────┬─────┐
2992
+ # # │ foo ┆ y ┆ x │
2993
+ # # │ --- ┆ --- ┆ --- │
2994
+ # # │ str ┆ i64 ┆ i64 │
2995
+ # # ╞═════╪═════╪═════╡
2996
+ # # │ one ┆ 3 ┆ 5 │
2997
+ # # │ two ┆ 3 ┆ 10 │
2998
+ # # └─────┴─────┴─────┘
3209
2999
  def pivot(
3210
- values:,
3211
- index:,
3212
- columns:,
3213
- aggregate_fn: "first",
3000
+ on,
3001
+ index: nil,
3002
+ values: nil,
3003
+ aggregate_function: nil,
3214
3004
  maintain_order: true,
3215
3005
  sort_columns: false,
3216
3006
  separator: "_"
3217
3007
  )
3218
- if values.is_a?(::String)
3219
- values = [values]
3220
- end
3221
- if index.is_a?(::String)
3222
- index = [index]
3223
- end
3224
- if columns.is_a?(::String)
3225
- columns = [columns]
3008
+ index = Utils._expand_selectors(self, index)
3009
+ on = Utils._expand_selectors(self, on)
3010
+ if !values.nil?
3011
+ values = Utils._expand_selectors(self, values)
3226
3012
  end
3227
3013
 
3228
- if aggregate_fn.is_a?(::String)
3229
- case aggregate_fn
3014
+ if aggregate_function.is_a?(::String)
3015
+ case aggregate_function
3230
3016
  when "first"
3231
- aggregate_expr = Polars.element.first._rbexpr
3017
+ aggregate_expr = F.element.first._rbexpr
3232
3018
  when "sum"
3233
- aggregate_expr = Polars.element.sum._rbexpr
3019
+ aggregate_expr = F.element.sum._rbexpr
3234
3020
  when "max"
3235
- aggregate_expr = Polars.element.max._rbexpr
3021
+ aggregate_expr = F.element.max._rbexpr
3236
3022
  when "min"
3237
- aggregate_expr = Polars.element.min._rbexpr
3023
+ aggregate_expr = F.element.min._rbexpr
3238
3024
  when "mean"
3239
- aggregate_expr = Polars.element.mean._rbexpr
3025
+ aggregate_expr = F.element.mean._rbexpr
3240
3026
  when "median"
3241
- aggregate_expr = Polars.element.median._rbexpr
3027
+ aggregate_expr = F.element.median._rbexpr
3242
3028
  when "last"
3243
- aggregate_expr = Polars.element.last._rbexpr
3029
+ aggregate_expr = F.element.last._rbexpr
3244
3030
  when "len"
3245
- aggregate_expr = Polars.len._rbexpr
3031
+ aggregate_expr = F.len._rbexpr
3246
3032
  when "count"
3247
3033
  warn "`aggregate_function: \"count\"` input for `pivot` is deprecated. Use `aggregate_function: \"len\"` instead."
3248
- aggregate_expr = Polars.len._rbexpr
3034
+ aggregate_expr = F.len._rbexpr
3249
3035
  else
3250
3036
  raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
3251
3037
  end
3252
- elsif aggregate_fn.nil?
3038
+ elsif aggregate_function.nil?
3253
3039
  aggregate_expr = nil
3254
3040
  else
3255
3041
  aggregate_expr = aggregate_function._rbexpr
@@ -3257,8 +3043,8 @@ module Polars
3257
3043
 
3258
3044
  _from_rbdf(
3259
3045
  _df.pivot_expr(
3046
+ on,
3260
3047
  index,
3261
- columns,
3262
3048
  values,
3263
3049
  maintain_order,
3264
3050
  sort_columns,
@@ -3273,18 +3059,18 @@ module Polars
3273
3059
  # Optionally leaves identifiers set.
3274
3060
  #
3275
3061
  # This function is useful to massage a DataFrame into a format where one or more
3276
- # columns are identifier variables (id_vars), while all other columns, considered
3277
- # measured variables (value_vars), are "unpivoted" to the row axis, leaving just
3062
+ # columns are identifier variables (index) while all other columns, considered
3063
+ # measured variables (on), are "unpivoted" to the row axis leaving just
3278
3064
  # two non-identifier columns, 'variable' and 'value'.
3279
3065
  #
3280
- # @param id_vars [Object]
3281
- # Columns to use as identifier variables.
3282
- # @param value_vars [Object]
3283
- # Values to use as identifier variables.
3284
- # If `value_vars` is empty all columns that are not in `id_vars` will be used.
3285
- # @param variable_name [String]
3286
- # Name to give to the `value` column. Defaults to "variable"
3287
- # @param value_name [String]
3066
+ # @param on [Object]
3067
+ # Column(s) or selector(s) to use as value variables; if `on`
3068
+ # is empty all columns that are not in `index` will be used.
3069
+ # @param index [Object]
3070
+ # Column(s) or selector(s) to use as identifier variables.
3071
+ # @param variable_name [Object]
3072
+ # Name to give to the `variable` column. Defaults to "variable"
3073
+ # @param value_name [Object]
3288
3074
  # Name to give to the `value` column. Defaults to "value"
3289
3075
  #
3290
3076
  # @return [DataFrame]
@@ -3297,7 +3083,7 @@ module Polars
3297
3083
  # "c" => [2, 4, 6]
3298
3084
  # }
3299
3085
  # )
3300
- # df.melt(id_vars: "a", value_vars: ["b", "c"])
3086
+ # df.unpivot(Polars::Selectors.numeric, index: "a")
3301
3087
  # # =>
3302
3088
  # # shape: (6, 3)
3303
3089
  # # ┌─────┬──────────┬───────┐
@@ -3312,23 +3098,13 @@ module Polars
3312
3098
  # # │ y ┆ c ┆ 4 │
3313
3099
  # # │ z ┆ c ┆ 6 │
3314
3100
  # # └─────┴──────────┴───────┘
3315
- def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
3316
- if value_vars.is_a?(::String)
3317
- value_vars = [value_vars]
3318
- end
3319
- if id_vars.is_a?(::String)
3320
- id_vars = [id_vars]
3321
- end
3322
- if value_vars.nil?
3323
- value_vars = []
3324
- end
3325
- if id_vars.nil?
3326
- id_vars = []
3327
- end
3328
- _from_rbdf(
3329
- _df.melt(id_vars, value_vars, value_name, variable_name)
3330
- )
3101
+ def unpivot(on, index: nil, variable_name: nil, value_name: nil)
3102
+ on = on.nil? ? [] : Utils._expand_selectors(self, on)
3103
+ index = index.nil? ? [] : Utils._expand_selectors(self, index)
3104
+
3105
+ _from_rbdf(_df.unpivot(on, index, value_name, variable_name))
3331
3106
  end
3107
+ alias_method :melt, :unpivot
3332
3108
 
3333
3109
  # Unstack a long table to a wide form without doing an aggregation.
3334
3110
  #
@@ -3774,7 +3550,7 @@ module Polars
3774
3550
  # # ┌─────────┐
3775
3551
  # # │ literal │
3776
3552
  # # │ --- │
3777
- # # │ i64 │
3553
+ # # │ i32 │
3778
3554
  # # ╞═════════╡
3779
3555
  # # │ 0 │
3780
3556
  # # │ 0 │
@@ -4362,7 +4138,7 @@ module Polars
4362
4138
  end
4363
4139
 
4364
4140
  if subset.is_a?(::Array) && subset.length == 1
4365
- expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
4141
+ expr = Utils.wrap_expr(Utils.parse_into_expression(subset[0], str_as_lit: false))
4366
4142
  else
4367
4143
  struct_fields = subset.nil? ? Polars.all : subset
4368
4144
  expr = Polars.struct(struct_fields)
@@ -4780,7 +4556,7 @@ module Polars
4780
4556
  # # │ 3 ┆ 7 │
4781
4557
  # # └─────┴─────┘
4782
4558
  def gather_every(n, offset = 0)
4783
- select(Utils.col("*").gather_every(n, offset))
4559
+ select(F.col("*").gather_every(n, offset))
4784
4560
  end
4785
4561
  alias_method :take_every, :gather_every
4786
4562
 
@@ -4850,7 +4626,7 @@ module Polars
4850
4626
  # # │ 10.0 ┆ null ┆ 9.0 │
4851
4627
  # # └──────┴──────┴──────────┘
4852
4628
  def interpolate
4853
- select(Utils.col("*").interpolate)
4629
+ select(F.col("*").interpolate)
4854
4630
  end
4855
4631
 
4856
4632
  # Check if the dataframe is empty.
@@ -4986,19 +4762,16 @@ module Polars
4986
4762
  #
4987
4763
  # @param column [Object]
4988
4764
  # Columns that are sorted
4989
- # @param more_columns [Object]
4990
- # Additional columns that are sorted, specified as positional arguments.
4991
4765
  # @param descending [Boolean]
4992
4766
  # Whether the columns are sorted in descending order.
4993
4767
  #
4994
4768
  # @return [DataFrame]
4995
4769
  def set_sorted(
4996
4770
  column,
4997
- *more_columns,
4998
4771
  descending: false
4999
4772
  )
5000
4773
  lazy
5001
- .set_sorted(column, *more_columns, descending: descending)
4774
+ .set_sorted(column, descending: descending)
5002
4775
  .collect(no_optimization: true)
5003
4776
  end
5004
4777
 
@@ -5255,7 +5028,7 @@ module Polars
5255
5028
  elsif data[0].is_a?(Hash)
5256
5029
  column_names, dtypes = _unpack_schema(columns)
5257
5030
  schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
5258
- rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
5031
+ rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, false, infer_schema_length)
5259
5032
  if column_names
5260
5033
  rbdf = _post_apply_columns(rbdf, column_names)
5261
5034
  end
@@ -5289,7 +5062,7 @@ module Polars
5289
5062
  if unpack_nested
5290
5063
  raise Todo
5291
5064
  else
5292
- rbdf = RbDataFrame.read_rows(
5065
+ rbdf = RbDataFrame.from_rows(
5293
5066
  data,
5294
5067
  infer_schema_length,
5295
5068
  local_schema_override.any? ? local_schema_override : nil