polars-df 0.10.0-x86_64-linux-musl → 0.12.0-x86_64-linux-musl

Files changed (58)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +27 -0
  3. data/Cargo.lock +392 -351
  4. data/LICENSE-THIRD-PARTY.txt +1125 -865
  5. data/README.md +6 -6
  6. data/lib/polars/3.1/polars.so +0 -0
  7. data/lib/polars/3.2/polars.so +0 -0
  8. data/lib/polars/3.3/polars.so +0 -0
  9. data/lib/polars/array_expr.rb +4 -4
  10. data/lib/polars/batched_csv_reader.rb +11 -5
  11. data/lib/polars/cat_expr.rb +0 -36
  12. data/lib/polars/cat_name_space.rb +0 -37
  13. data/lib/polars/convert.rb +6 -1
  14. data/lib/polars/data_frame.rb +176 -403
  15. data/lib/polars/data_types.rb +1 -1
  16. data/lib/polars/date_time_expr.rb +525 -572
  17. data/lib/polars/date_time_name_space.rb +263 -460
  18. data/lib/polars/dynamic_group_by.rb +5 -5
  19. data/lib/polars/exceptions.rb +7 -0
  20. data/lib/polars/expr.rb +1394 -243
  21. data/lib/polars/expr_dispatch.rb +1 -1
  22. data/lib/polars/functions/aggregation/horizontal.rb +8 -8
  23. data/lib/polars/functions/as_datatype.rb +63 -40
  24. data/lib/polars/functions/lazy.rb +63 -14
  25. data/lib/polars/functions/lit.rb +1 -1
  26. data/lib/polars/functions/range/date_range.rb +90 -57
  27. data/lib/polars/functions/range/datetime_range.rb +149 -0
  28. data/lib/polars/functions/range/int_range.rb +2 -2
  29. data/lib/polars/functions/range/time_range.rb +141 -0
  30. data/lib/polars/functions/repeat.rb +1 -1
  31. data/lib/polars/functions/whenthen.rb +1 -1
  32. data/lib/polars/group_by.rb +88 -23
  33. data/lib/polars/io/avro.rb +24 -0
  34. data/lib/polars/{io.rb → io/csv.rb} +299 -493
  35. data/lib/polars/io/database.rb +73 -0
  36. data/lib/polars/io/ipc.rb +247 -0
  37. data/lib/polars/io/json.rb +29 -0
  38. data/lib/polars/io/ndjson.rb +80 -0
  39. data/lib/polars/io/parquet.rb +227 -0
  40. data/lib/polars/lazy_frame.rb +143 -272
  41. data/lib/polars/lazy_group_by.rb +100 -3
  42. data/lib/polars/list_expr.rb +11 -11
  43. data/lib/polars/list_name_space.rb +5 -1
  44. data/lib/polars/rolling_group_by.rb +7 -9
  45. data/lib/polars/series.rb +103 -187
  46. data/lib/polars/string_expr.rb +78 -102
  47. data/lib/polars/string_name_space.rb +5 -4
  48. data/lib/polars/testing.rb +2 -2
  49. data/lib/polars/utils/constants.rb +9 -0
  50. data/lib/polars/utils/convert.rb +97 -0
  51. data/lib/polars/utils/parse.rb +89 -0
  52. data/lib/polars/utils/various.rb +76 -0
  53. data/lib/polars/utils/wrap.rb +19 -0
  54. data/lib/polars/utils.rb +8 -300
  55. data/lib/polars/version.rb +1 -1
  56. data/lib/polars/whenthen.rb +6 -6
  57. data/lib/polars.rb +20 -1
  58. metadata +17 -4
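All hunks shown below come from data/lib/polars/data_frame.rb (item 14 in the list above); the diffs for the other files are omitted here.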
@@ -46,271 +46,6 @@ module Polars
       df
     end

-    # @private
-    def self._from_hashes(data, infer_schema_length: 100, schema: nil, schema_overrides: nil)
-      rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
-      _from_rbdf(rbdf)
-    end
-
-    # @private
-    def self._from_hash(data, schema: nil, schema_overrides: nil)
-      _from_rbdf(hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides))
-    end
-
-    # def self._from_records
-    # end
-
-    # def self._from_numo
-    # end
-
-    # no self._from_arrow
-
-    # no self._from_pandas
-
-    # @private
-    def self._read_csv(
-      file,
-      has_header: true,
-      columns: nil,
-      sep: str = ",",
-      comment_char: nil,
-      quote_char: '"',
-      skip_rows: 0,
-      dtypes: nil,
-      null_values: nil,
-      ignore_errors: false,
-      parse_dates: false,
-      n_threads: nil,
-      infer_schema_length: 100,
-      batch_size: 8192,
-      n_rows: nil,
-      encoding: "utf8",
-      low_memory: false,
-      rechunk: true,
-      skip_rows_after_header: 0,
-      row_count_name: nil,
-      row_count_offset: 0,
-      sample_size: 1024,
-      eol_char: "\n",
-      truncate_ragged_lines: false
-    )
-      if Utils.pathlike?(file)
-        path = Utils.normalise_filepath(file)
-      else
-        path = nil
-        # if defined?(StringIO) && file.is_a?(StringIO)
-        #   file = file.string
-        # end
-      end
-
-      dtype_list = nil
-      dtype_slice = nil
-      if !dtypes.nil?
-        if dtypes.is_a?(Hash)
-          dtype_list = []
-          dtypes.each do |k, v|
-            dtype_list << [k, Utils.rb_type_to_dtype(v)]
-          end
-        elsif dtypes.is_a?(::Array)
-          dtype_slice = dtypes
-        else
-          raise ArgumentError, "dtype arg should be list or dict"
-        end
-      end
-
-      processed_null_values = Utils._process_null_values(null_values)
-
-      if columns.is_a?(::String)
-        columns = [columns]
-      end
-      if file.is_a?(::String) && file.include?("*")
-        dtypes_dict = nil
-        if !dtype_list.nil?
-          dtypes_dict = dtype_list.to_h
-        end
-        if !dtype_slice.nil?
-          raise ArgumentError, "cannot use glob patterns and unnamed dtypes as `dtypes` argument; Use dtypes: Mapping[str, Type[DataType]"
-        end
-        scan = Polars.scan_csv(
-          file,
-          has_header: has_header,
-          sep: sep,
-          comment_char: comment_char,
-          quote_char: quote_char,
-          skip_rows: skip_rows,
-          dtypes: dtypes_dict,
-          null_values: null_values,
-          ignore_errors: ignore_errors,
-          infer_schema_length: infer_schema_length,
-          n_rows: n_rows,
-          low_memory: low_memory,
-          rechunk: rechunk,
-          skip_rows_after_header: skip_rows_after_header,
-          row_count_name: row_count_name,
-          row_count_offset: row_count_offset,
-          eol_char: eol_char,
-          truncate_ragged_lines: truncate_ragged_lines
-        )
-        if columns.nil?
-          return _from_rbdf(scan.collect._df)
-        elsif is_str_sequence(columns, allow_str: false)
-          return _from_rbdf(scan.select(columns).collect._df)
-        else
-          raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: List[str]"
-        end
-      end
-
-      projection, columns = Utils.handle_projection_columns(columns)
-
-      _from_rbdf(
-        RbDataFrame.read_csv(
-          file,
-          infer_schema_length,
-          batch_size,
-          has_header,
-          ignore_errors,
-          n_rows,
-          skip_rows,
-          projection,
-          sep,
-          rechunk,
-          columns,
-          encoding,
-          n_threads,
-          path,
-          dtype_list,
-          dtype_slice,
-          low_memory,
-          comment_char,
-          quote_char,
-          processed_null_values,
-          parse_dates,
-          skip_rows_after_header,
-          Utils._prepare_row_count_args(row_count_name, row_count_offset),
-          sample_size,
-          eol_char,
-          truncate_ragged_lines
-        )
-      )
-    end
-
-    # @private
-    def self._read_parquet(
-      source,
-      columns: nil,
-      n_rows: nil,
-      parallel: "auto",
-      row_count_name: nil,
-      row_count_offset: 0,
-      low_memory: false,
-      use_statistics: true,
-      rechunk: true
-    )
-      if Utils.pathlike?(source)
-        source = Utils.normalise_filepath(source)
-      end
-      if columns.is_a?(::String)
-        columns = [columns]
-      end
-
-      if source.is_a?(::String) && source.include?("*") && Utils.local_file?(source)
-        scan =
-          Polars.scan_parquet(
-            source,
-            n_rows: n_rows,
-            rechunk: true,
-            parallel: parallel,
-            row_count_name: row_count_name,
-            row_count_offset: row_count_offset,
-            low_memory: low_memory
-          )
-
-        if columns.nil?
-          return self._from_rbdf(scan.collect._df)
-        elsif Utils.is_str_sequence(columns, allow_str: false)
-          return self._from_rbdf(scan.select(columns).collect._df)
-        else
-          raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: Array[String]"
-        end
-      end
-
-      projection, columns = Utils.handle_projection_columns(columns)
-      _from_rbdf(
-        RbDataFrame.read_parquet(
-          source,
-          columns,
-          projection,
-          n_rows,
-          parallel,
-          Utils._prepare_row_count_args(row_count_name, row_count_offset),
-          low_memory,
-          use_statistics,
-          rechunk
-        )
-      )
-    end
-
-    # @private
-    def self._read_avro(file, columns: nil, n_rows: nil)
-      if Utils.pathlike?(file)
-        file = Utils.normalise_filepath(file)
-      end
-      projection, columns = Utils.handle_projection_columns(columns)
-      _from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
-    end
-
-    # @private
-    def self._read_ipc(
-      file,
-      columns: nil,
-      n_rows: nil,
-      row_count_name: nil,
-      row_count_offset: 0,
-      rechunk: true,
-      memory_map: true
-    )
-      if Utils.pathlike?(file)
-        file = Utils.normalise_filepath(file)
-      end
-      if columns.is_a?(::String)
-        columns = [columns]
-      end
-
-      if file.is_a?(::String) && file.include?("*")
-        raise Todo
-      end
-
-      projection, columns = Utils.handle_projection_columns(columns)
-      _from_rbdf(
-        RbDataFrame.read_ipc(
-          file,
-          columns,
-          projection,
-          n_rows,
-          Utils._prepare_row_count_args(row_count_name, row_count_offset),
-          memory_map
-        )
-      )
-    end
-
-    # @private
-    def self._read_json(file)
-      if Utils.pathlike?(file)
-        file = Utils.normalise_filepath(file)
-      end
-
-      _from_rbdf(RbDataFrame.read_json(file))
-    end
-
-    # @private
-    def self._read_ndjson(file)
-      if Utils.pathlike?(file)
-        file = Utils.normalise_filepath(file)
-      end
-
-      _from_rbdf(RbDataFrame.read_ndjson(file))
-    end
-
     # Get the shape of the DataFrame.
     #
     # @return [Array]
@@ -419,6 +154,13 @@ module Polars
       _df.dtypes
     end

+    # Get flags that are set on the columns of this DataFrame.
+    #
+    # @return [Hash]
+    def flags
+      columns.to_h { |name| [name, self[name].flags] }
+    end
+
     # Get the schema.
     #
     # @return [Hash]
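A minimal sketch of the new accessor. The per-column hashes come from `Series#flags`, so the exact keys shown here ("SORTED_ASC"/"SORTED_DESC") are an assumption based on the series flags:

    require "polars-df"

    df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [3.0, 2.0, 1.0]})

    # One flags hash per column, keyed by column name, e.g.
    # {"a" => {"SORTED_ASC" => false, "SORTED_DESC" => false}, "b" => {...}}
    df.flags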
@@ -845,7 +587,7 @@ module Polars
       row_oriented: false
     )
       if Utils.pathlike?(file)
-        file = Utils.normalise_filepath(file)
+        file = Utils.normalize_filepath(file)
       end
       to_string_io = !file.nil? && file.is_a?(StringIO)
       if file.nil? || to_string_io
@@ -880,11 +622,11 @@ module Polars
    #       "bar" => [6, 7, 8]
    #     }
    #   )
-    #   df.write_ndjson()
+    #   df.write_ndjson
    #   # => "{\"foo\":1,\"bar\":6}\n{\"foo\":2,\"bar\":7}\n{\"foo\":3,\"bar\":8}\n"
    def write_ndjson(file = nil)
      if Utils.pathlike?(file)
-        file = Utils.normalise_filepath(file)
+        file = Utils.normalize_filepath(file)
      end
      to_string_io = !file.nil? && file.is_a?(StringIO)
      if file.nil? || to_string_io
@@ -991,7 +733,7 @@ module Polars
       end

       if Utils.pathlike?(file)
-        file = Utils.normalise_filepath(file)
+        file = Utils.normalize_filepath(file)
       end

       _df.write_csv(
@@ -1029,7 +771,7 @@ module Polars
         compression = "uncompressed"
       end
       if Utils.pathlike?(file)
-        file = Utils.normalise_filepath(file)
+        file = Utils.normalize_filepath(file)
       end

       _df.write_avro(file, compression)
@@ -1050,7 +792,7 @@ module Polars
         file.set_encoding(Encoding::BINARY)
       end
       if Utils.pathlike?(file)
-        file = Utils.normalise_filepath(file)
+        file = Utils.normalize_filepath(file)
       end

       if compression.nil?
@@ -1061,6 +803,47 @@ module Polars
       return_bytes ? file.string : nil
     end

+    # Write to Arrow IPC record batch stream.
+    #
+    # See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html.
+    #
+    # @param file [Object]
+    #   Path or writable file-like object to which the IPC record batch data will
+    #   be written. If set to `nil`, the output is returned as a string instead.
+    # @param compression ["uncompressed", "lz4", "zstd"]
+    #   Compression method. Defaults to "uncompressed".
+    #
+    # @return [Object]
+    #
+    # @example
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "foo" => [1, 2, 3, 4, 5],
+    #       "bar" => [6, 7, 8, 9, 10],
+    #       "ham" => ["a", "b", "c", "d", "e"]
+    #     }
+    #   )
+    #   df.write_ipc_stream("new_file.arrow")
+    def write_ipc_stream(
+      file,
+      compression: "uncompressed"
+    )
+      return_bytes = file.nil?
+      if return_bytes
+        file = StringIO.new
+        file.set_encoding(Encoding::BINARY)
+      elsif Utils.pathlike?(file)
+        file = Utils.normalize_filepath(file)
+      end
+
+      if compression.nil?
+        compression = "uncompressed"
+      end
+
+      _df.write_ipc_stream(file, compression)
+      return_bytes ? file.string : nil
+    end
+
     # Write to Apache Parquet file.
     #
     # @param file [String, Pathname, StringIO]
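A short usage sketch for the new `write_ipc_stream`, based directly on the method body above (with a `nil` file the stream is built in a StringIO and the encoded bytes come back as a binary String):

    require "polars-df"

    df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => ["a", "b", "c"]})

    # Write the record batch stream to disk, optionally compressed.
    df.write_ipc_stream("frame.arrows", compression: "zstd")

    # Pass nil to get the encoded stream back instead of writing a file.
    bytes = df.write_ipc_stream(nil)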
@@ -1097,7 +880,25 @@ module Polars
         compression = "uncompressed"
       end
       if Utils.pathlike?(file)
-        file = Utils.normalise_filepath(file)
+        file = Utils.normalize_filepath(file)
+      end
+
+      if statistics == true
+        statistics = {
+          min: true,
+          max: true,
+          distinct_count: false,
+          null_count: true
+        }
+      elsif statistics == false
+        statistics = {}
+      elsif statistics == "full"
+        statistics = {
+          min: true,
+          max: true,
+          distinct_count: true,
+          null_count: true
+        }
       end

       _df.write_parquet(
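A sketch of the expanded statistics handling above, assuming `write_parquet` accepts a `statistics:` keyword (the normalization shows the accepted shapes; the per-key Hash form is inferred from it):

    require "polars-df"

    df = Polars::DataFrame.new({"a" => [1, 2, 3]})

    # true expands to min/max/null_count, with distinct_count left off ...
    df.write_parquet("a.parquet", statistics: true)

    # ... "full" turns everything on, and a Hash picks statistics individually.
    df.write_parquet("a_full.parquet", statistics: "full")
    df.write_parquet("a_custom.parquet", statistics: {min: true, null_count: true})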
@@ -1773,10 +1574,7 @@ module Polars
    #   # │ 3   ┆ 8   ┆ c   │
    #   # └─────┴─────┴─────┘
    def drop_nulls(subset: nil)
-      if subset.is_a?(::String)
-        subset = [subset]
-      end
-      _from_rbdf(_df.drop_nulls(subset))
+      lazy.drop_nulls(subset: subset).collect(_eager: true)
    end

    # Offers a structured way to apply a sequence of user-defined functions (UDFs).
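`drop_nulls` now routes through the lazy engine instead of wrapping a lone String itself; behavior should be unchanged. A sketch (the single-name call assumes `LazyFrame#drop_nulls` handles the wrapping, which the removed guard suggests):

    require "polars-df"

    df = Polars::DataFrame.new({"foo" => [1, nil, 3], "bar" => [6, 7, nil]})

    df.drop_nulls                # keeps only the fully populated first row
    df.drop_nulls(subset: "bar") # a single column name still works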
@@ -1838,16 +1636,16 @@ module Polars
    #   df.with_row_index
    #   # =>
    #   # shape: (3, 3)
-    #   # ┌────────┬─────┬─────┐
-    #   # │ row_nr ┆ a   ┆ b   │
-    #   # │ ---    ┆ --- ┆ --- │
-    #   # │ u32    ┆ i64 ┆ i64 │
-    #   # ╞════════╪═════╪═════╡
-    #   # │ 0      ┆ 1   ┆ 2   │
-    #   # │ 1      ┆ 3   ┆ 4   │
-    #   # │ 2      ┆ 5   ┆ 6   │
-    #   # └────────┴─────┴─────┘
-    def with_row_index(name: "row_nr", offset: 0)
+    #   # ┌───────┬─────┬─────┐
+    #   # │ index ┆ a   ┆ b   │
+    #   # │ ---   ┆ --- ┆ --- │
+    #   # │ u32   ┆ i64 ┆ i64 │
+    #   # ╞═══════╪═════╪═════╡
+    #   # │ 0     ┆ 1   ┆ 2   │
+    #   # │ 1     ┆ 3   ┆ 4   │
+    #   # │ 2     ┆ 5   ┆ 6   │
+    #   # └───────┴─────┴─────┘
+    def with_row_index(name: "index", offset: 0)
       _from_rbdf(_df.with_row_index(name, offset))
     end
     alias_method :with_row_count, :with_row_index
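The default column name changes from "row_nr" to "index"; a migration sketch:

    require "polars-df"

    df = Polars::DataFrame.new({"a" => [1, 3, 5], "b" => [2, 4, 6]})

    df.with_row_index                  # 0.12: adds a u32 "index" column
    df.with_row_index(name: "row_nr")  # pass the old name to keep 0.10-style output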
@@ -1944,12 +1742,6 @@ module Polars
    #   Define whether the temporal window interval is closed or not.
    # @param by [Object]
    #   Also group by this column/these columns.
-    # @param check_sorted [Boolean]
-    #   When the `by` argument is given, polars can not check sortedness
-    #   by the metadata and has to do a full scan on the index column to
-    #   verify data is sorted. This is expensive. If you are sure the
-    #   data within the by groups is sorted, you can set this to `false`.
-    #   Doing so incorrectly will lead to incorrect output
    #
    # @return [RollingGroupBy]
    #
@@ -1965,7 +1757,7 @@ module Polars
    #   df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
    #     Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
    #   )
-    #   df.group_by_rolling(index_column: "dt", period: "2d").agg(
+    #   df.rolling(index_column: "dt", period: "2d").agg(
    #     [
    #       Polars.sum("a").alias("sum_a"),
    #       Polars.min("a").alias("min_a"),
@@ -1986,17 +1778,17 @@ module Polars
    #   # │ 2020-01-03 19:45:32 ┆ 11    ┆ 2     ┆ 9     │
    #   # │ 2020-01-08 23:16:43 ┆ 1     ┆ 1     ┆ 1     │
    #   # └─────────────────────┴───────┴───────┴───────┘
-    def group_by_rolling(
+    def rolling(
      index_column:,
      period:,
      offset: nil,
      closed: "right",
-      by: nil,
-      check_sorted: true
+      by: nil
    )
-      RollingGroupBy.new(self, index_column, period, offset, closed, by, check_sorted)
+      RollingGroupBy.new(self, index_column, period, offset, closed, by)
    end
-    alias_method :groupby_rolling, :group_by_rolling
+    alias_method :groupby_rolling, :rolling
+    alias_method :group_by_rolling, :rolling

    # Group based on a time value (or index value of type `:i32`, `:i64`).
    #
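`group_by_rolling` becomes `rolling`, with both old names kept as aliases; the `check_sorted:` keyword is gone, so calls passing it must drop it. A sketch built from the example above:

    require "polars-df"

    dates = ["2020-01-01 13:45:48", "2020-01-01 16:42:13", "2020-01-02 18:12:48"]
    df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5]}).with_column(
      Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
    )

    # 0.10: df.group_by_rolling(index_column: "dt", period: "2d", check_sorted: false)
    df.rolling(index_column: "dt", period: "2d").agg(Polars.sum("a").alias("sum_a"))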
@@ -2066,10 +1858,12 @@ module Polars
    # @example
    #   df = Polars::DataFrame.new(
    #     {
-    #       "time" => Polars.date_range(
+    #       "time" => Polars.datetime_range(
    #         DateTime.new(2021, 12, 16),
    #         DateTime.new(2021, 12, 16, 3),
-    #         "30m"
+    #         "30m",
+    #         time_unit: "us",
+    #         eager: true
    #       ),
    #       "n" => 0..6
    #     }
@@ -2136,16 +1930,16 @@ module Polars
    #   )
    #   # =>
    #   # shape: (4, 3)
-    #   # ┌─────────────────────┬────────────┬───────────────────────────────────┐
-    #   # │ time                ┆ time_count ┆ time_agg_list                     │
-    #   # │ ---                 ┆ ---        ┆ ---                               │
-    #   # │ datetime[μs]        ┆ u32        ┆ list[datetime[μs]]                │
-    #   # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
-    #   # │ 2021-12-16 00:00:00 ┆ 2          ┆ [2021-12-16 00:00:00, 2021-12-16… │
-    #   # │ 2021-12-16 01:00:00 ┆ 2          ┆ [2021-12-16 01:00:00, 2021-12-16… │
-    #   # │ 2021-12-16 02:00:00 ┆ 2          ┆ [2021-12-16 02:00:00, 2021-12-16… │
-    #   # │ 2021-12-16 03:00:00 ┆ 1          ┆ [2021-12-16 03:00:00]             │
-    #   # └─────────────────────┴────────────┴───────────────────────────────────┘
+    #   # ┌─────────────────────┬────────────┬─────────────────────────────────┐
+    #   # │ time                ┆ time_count ┆ time_agg_list                   │
+    #   # │ ---                 ┆ ---        ┆ ---                             │
+    #   # │ datetime[μs]        ┆ u32        ┆ list[datetime[μs]]              │
+    #   # ╞═════════════════════╪════════════╪═════════════════════════════════╡
+    #   # │ 2021-12-16 00:00:00 ┆ 2          ┆ [2021-12-16 00:00:00, 2021-12-… │
+    #   # │ 2021-12-16 01:00:00 ┆ 2          ┆ [2021-12-16 01:00:00, 2021-12-… │
+    #   # │ 2021-12-16 02:00:00 ┆ 2          ┆ [2021-12-16 02:00:00, 2021-12-… │
+    #   # │ 2021-12-16 03:00:00 ┆ 1          ┆ [2021-12-16 03:00:00]           │
+    #   # └─────────────────────┴────────────┴─────────────────────────────────┘
    #
    # @example When closed="both" the time values at the window boundaries belong to 2 groups.
    #   df.group_by_dynamic("time", every: "1h", closed: "both").agg(
@@ -2168,10 +1962,12 @@ module Polars
    # @example Dynamic group bys can also be combined with grouping on normal keys.
    #   df = Polars::DataFrame.new(
    #     {
-    #       "time" => Polars.date_range(
+    #       "time" => Polars.datetime_range(
    #         DateTime.new(2021, 12, 16),
    #         DateTime.new(2021, 12, 16, 3),
-    #         "30m"
+    #         "30m",
+    #         time_unit: "us",
+    #         eager: true
    #       ),
    #       "groups" => ["a", "a", "a", "b", "b", "a", "a"]
    #     }
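Both examples move from `Polars.date_range` to the new `Polars.datetime_range` (added in data/lib/polars/functions/range/datetime_range.rb, item 27 above) for datetime endpoints; a side-by-side sketch:

    require "polars-df"
    require "date"

    # 0.10:
    # Polars.date_range(DateTime.new(2021, 12, 16), DateTime.new(2021, 12, 16, 3), "30m")

    # 0.12: eager: true returns a Series instead of an expression
    Polars.datetime_range(
      DateTime.new(2021, 12, 16),
      DateTime.new(2021, 12, 16, 3),
      "30m",
      time_unit: "us",
      eager: true
    )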
@@ -2258,8 +2054,6 @@ module Polars
    #   Note that this column has to be sorted for the output to make sense.
    # @param every [String]
    #   interval will start 'every' duration
-    # @param offset [String]
-    #   change the start of the date_range by this offset.
    # @param by [Object]
    #   First group by these columns and then upsample for every group
    # @param maintain_order [Boolean]
@@ -2319,7 +2113,6 @@ module Polars
    def upsample(
      time_column:,
      every:,
-      offset: nil,
      by: nil,
      maintain_order: false
    )
@@ -2329,15 +2122,11 @@ module Polars
      if by.is_a?(::String)
        by = [by]
      end
-      if offset.nil?
-        offset = "0ns"
-      end

-      every = Utils._timedelta_to_pl_duration(every)
-      offset = Utils._timedelta_to_pl_duration(offset)
+      every = Utils.parse_as_duration_string(every)

      _from_rbdf(
-        _df.upsample(by, time_column, every, offset, maintain_order)
+        _df.upsample(by, time_column, every, maintain_order)
      )
    end

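`upsample` loses its `offset:` keyword entirely; passing it in 0.12 raises an ArgumentError (unknown keyword), so shift the time column yourself beforehand if an offset is needed. A sketch:

    require "polars-df"
    require "date"

    df = Polars::DataFrame.new(
      {"time" => [DateTime.new(2021, 1, 1), DateTime.new(2021, 4, 1)], "n" => [1, 2]}
    )

    df.upsample(time_column: "time", every: "1mo")
    # df.upsample(time_column: "time", every: "1mo", offset: "1d")  # ArgumentError in 0.12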
@@ -2484,7 +2273,7 @@ module Polars
    #   Name(s) of the right join column(s).
    # @param on [Object]
    #   Name(s) of the join columns in both DataFrames.
-    # @param how ["inner", "left", "outer", "semi", "anti", "cross"]
+    # @param how ["inner", "left", "full", "semi", "anti", "cross"]
    #   Join strategy.
    # @param suffix [String]
    #   Suffix to append to columns with a duplicate name.
@@ -2520,7 +2309,7 @@ module Polars
    #   # └─────┴─────┴─────┴───────┘
    #
    # @example
-    #   df.join(other_df, on: "ham", how: "outer")
+    #   df.join(other_df, on: "ham", how: "full")
    #   # =>
    #   # shape: (4, 5)
    #   # ┌──────┬──────┬──────┬───────┬───────────┐
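The outer-join strategy is now spelled "full", matching upstream Polars; a migration sketch:

    require "polars-df"

    df = Polars::DataFrame.new({"ham" => ["a", "b", "c"], "foo" => [1, 2, 3]})
    other_df = Polars::DataFrame.new({"ham" => ["a", "b", "d"], "apple" => ["x", "y", "z"]})

    # 0.10: df.join(other_df, on: "ham", how: "outer")
    df.join(other_df, on: "ham", how: "full")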
@@ -2620,7 +2409,7 @@ module Polars
    #   df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})
    #
    # @example Return a DataFrame by mapping each row to a tuple:
-    #   df.apply { |t| [t[0] * 2, t[1] * 3] }
+    #   df.map_rows { |t| [t[0] * 2, t[1] * 3] }
    #   # =>
    #   # shape: (3, 2)
    #   # ┌──────────┬──────────┐
@@ -2634,7 +2423,7 @@ module Polars
    #   # └──────────┴──────────┘
    #
    # @example Return a Series by mapping each row to a scalar:
-    #   df.apply { |t| t[0] * 2 + t[1] }
+    #   df.map_rows { |t| t[0] * 2 + t[1] }
    #   # =>
    #   # shape: (3, 1)
    #   # ┌───────┐
@@ -2646,14 +2435,15 @@ module Polars
    #   # │ 9     │
    #   # │ 14    │
    #   # └───────┘
-    def apply(return_dtype: nil, inference_size: 256, &f)
-      out, is_df = _df.apply(f, return_dtype, inference_size)
+    def map_rows(return_dtype: nil, inference_size: 256, &f)
+      out, is_df = _df.map_rows(f, return_dtype, inference_size)
      if is_df
        _from_rbdf(out)
      else
        _from_rbdf(Utils.wrap_s(out).to_frame._df)
      end
    end
+    alias_method :apply, :map_rows

    # Return a new DataFrame with the column added or replaced.
    #
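`apply` becomes `map_rows`, with the old name aliased to the new one:

    require "polars-df"

    df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})

    df.map_rows { |t| [t[0] * 2, t[1] * 3] }  # row -> tuple gives a DataFrame
    df.apply { |t| t[0] * 2 + t[1] }          # alias_method keeps the old spelling working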
@@ -3176,9 +2966,9 @@ module Polars
    #   arguments contains multiple columns as well
    # @param index [Object]
    #   One or multiple keys to group by
-    # @param columns [Object]
+    # @param on [Object]
    #   Columns whose values will be used as the header of the output DataFrame
-    # @param aggregate_fn ["first", "sum", "max", "min", "mean", "median", "last", "count"]
+    # @param aggregate_function ["first", "sum", "max", "min", "mean", "median", "last", "count"]
    #   A predefined aggregate function str or an expression.
    # @param maintain_order [Object]
    #   Sort the grouped keys so that the output order is predictable.
@@ -3190,66 +2980,62 @@ module Polars
    # @example
    #   df = Polars::DataFrame.new(
    #     {
-    #       "foo" => ["one", "one", "one", "two", "two", "two"],
-    #       "bar" => ["A", "B", "C", "A", "B", "C"],
+    #       "foo" => ["one", "one", "two", "two", "one", "two"],
+    #       "bar" => ["y", "y", "y", "x", "x", "x"],
    #       "baz" => [1, 2, 3, 4, 5, 6]
    #     }
    #   )
-    #   df.pivot(values: "baz", index: "foo", columns: "bar")
+    #   df.pivot("bar", index: "foo", values: "baz", aggregate_function: "sum")
    #   # =>
-    #   # shape: (2, 4)
-    #   # ┌─────┬─────┬─────┬─────┐
-    #   # │ foo ┆ A   ┆ B   ┆ C   │
-    #   # │ --- ┆ --- ┆ --- ┆ --- │
-    #   # │ str ┆ i64 ┆ i64 ┆ i64 │
-    #   # ╞═════╪═════╪═════╪═════╡
-    #   # │ one ┆ 1   ┆ 2   ┆ 3   │
-    #   # │ two ┆ 4   ┆ 5   ┆ 6   │
-    #   # └─────┴─────┴─────┴─────┘
+    #   # shape: (2, 3)
+    #   # ┌─────┬─────┬─────┐
+    #   # │ foo ┆ y   ┆ x   │
+    #   # │ --- ┆ --- ┆ --- │
+    #   # │ str ┆ i64 ┆ i64 │
+    #   # ╞═════╪═════╪═════╡
+    #   # │ one ┆ 3   ┆ 5   │
+    #   # │ two ┆ 3   ┆ 10  │
+    #   # └─────┴─────┴─────┘
    def pivot(
-      values:,
-      index:,
-      columns:,
-      aggregate_fn: "first",
+      on,
+      index: nil,
+      values: nil,
+      aggregate_function: nil,
      maintain_order: true,
      sort_columns: false,
      separator: "_"
    )
-      if values.is_a?(::String)
-        values = [values]
-      end
-      if index.is_a?(::String)
-        index = [index]
-      end
-      if columns.is_a?(::String)
-        columns = [columns]
+      index = Utils._expand_selectors(self, index)
+      on = Utils._expand_selectors(self, on)
+      if !values.nil?
+        values = Utils._expand_selectors(self, values)
      end

-      if aggregate_fn.is_a?(::String)
-        case aggregate_fn
+      if aggregate_function.is_a?(::String)
+        case aggregate_function
        when "first"
-          aggregate_expr = Polars.element.first._rbexpr
+          aggregate_expr = F.element.first._rbexpr
        when "sum"
-          aggregate_expr = Polars.element.sum._rbexpr
+          aggregate_expr = F.element.sum._rbexpr
        when "max"
-          aggregate_expr = Polars.element.max._rbexpr
+          aggregate_expr = F.element.max._rbexpr
        when "min"
-          aggregate_expr = Polars.element.min._rbexpr
+          aggregate_expr = F.element.min._rbexpr
        when "mean"
-          aggregate_expr = Polars.element.mean._rbexpr
+          aggregate_expr = F.element.mean._rbexpr
        when "median"
-          aggregate_expr = Polars.element.median._rbexpr
+          aggregate_expr = F.element.median._rbexpr
        when "last"
-          aggregate_expr = Polars.element.last._rbexpr
+          aggregate_expr = F.element.last._rbexpr
        when "len"
-          aggregate_expr = Polars.len._rbexpr
+          aggregate_expr = F.len._rbexpr
        when "count"
          warn "`aggregate_function: \"count\"` input for `pivot` is deprecated. Use `aggregate_function: \"len\"` instead."
-          aggregate_expr = Polars.len._rbexpr
+          aggregate_expr = F.len._rbexpr
        else
          raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
        end
-      elsif aggregate_fn.nil?
+      elsif aggregate_function.nil?
        aggregate_expr = nil
      else
        aggregate_expr = aggregate_function._rbexpr
@@ -3257,8 +3043,8 @@ module Polars

      _from_rbdf(
        _df.pivot_expr(
+          on,
          index,
-          columns,
          values,
          maintain_order,
          sort_columns,
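The pivot signature changes shape: `on` is now the first positional argument, `index`/`values` are optional keywords, and there is no default aggregation anymore (the old `aggregate_fn: "first"` default is gone), so pass `aggregate_function:` explicitly when duplicates must be combined. A migration sketch using the example data above:

    require "polars-df"

    df = Polars::DataFrame.new(
      {
        "foo" => ["one", "one", "two", "two", "one", "two"],
        "bar" => ["y", "y", "y", "x", "x", "x"],
        "baz" => [1, 2, 3, 4, 5, 6]
      }
    )

    # 0.10: df.pivot(values: "baz", index: "foo", columns: "bar")
    df.pivot("bar", index: "foo", values: "baz", aggregate_function: "sum")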
@@ -3273,18 +3059,18 @@ module Polars
    # Optionally leaves identifiers set.
    #
    # This function is useful to massage a DataFrame into a format where one or more
-    # columns are identifier variables (id_vars), while all other columns, considered
-    # measured variables (value_vars), are "unpivoted" to the row axis, leaving just
+    # columns are identifier variables (index) while all other columns, considered
+    # measured variables (on), are "unpivoted" to the row axis leaving just
    # two non-identifier columns, 'variable' and 'value'.
    #
-    # @param id_vars [Object]
-    #   Columns to use as identifier variables.
-    # @param value_vars [Object]
-    #   Values to use as identifier variables.
-    #   If `value_vars` is empty all columns that are not in `id_vars` will be used.
-    # @param variable_name [String]
-    #   Name to give to the `variable` column. Defaults to "variable"
-    # @param value_name [String]
+    # @param on [Object]
+    #   Column(s) or selector(s) to use as values variables; if `on`
+    #   is empty all columns that are not in `index` will be used.
+    # @param index [Object]
+    #   Column(s) or selector(s) to use as identifier variables.
+    # @param variable_name [Object]
+    #   Name to give to the `variable` column. Defaults to "variable"
+    # @param value_name [Object]
    #   Name to give to the `value` column. Defaults to "value"
    #
    # @return [DataFrame]
@@ -3297,7 +3083,7 @@ module Polars
    #       "c" => [2, 4, 6]
    #     }
    #   )
-    #   df.melt(id_vars: "a", value_vars: ["b", "c"])
+    #   df.unpivot(Polars::Selectors.numeric, index: "a")
    #   # =>
    #   # shape: (6, 3)
    #   # ┌─────┬──────────┬───────┐
@@ -3312,23 +3098,13 @@ module Polars
    #   # │ y   ┆ c        ┆ 4     │
    #   # │ z   ┆ c        ┆ 6     │
    #   # └─────┴──────────┴───────┘
-    def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
-      if value_vars.is_a?(::String)
-        value_vars = [value_vars]
-      end
-      if id_vars.is_a?(::String)
-        id_vars = [id_vars]
-      end
-      if value_vars.nil?
-        value_vars = []
-      end
-      if id_vars.nil?
-        id_vars = []
-      end
-      _from_rbdf(
-        _df.melt(id_vars, value_vars, value_name, variable_name)
-      )
+    def unpivot(on, index: nil, variable_name: nil, value_name: nil)
+      on = on.nil? ? [] : Utils._expand_selectors(self, on)
+      index = index.nil? ? [] : Utils._expand_selectors(self, index)
+
+      _from_rbdf(_df.unpivot(on, index, value_name, variable_name))
    end
+    alias_method :melt, :unpivot

    # Unstack a long table to a wide form without doing an aggregation.
    #
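`melt` is renamed to `unpivot`, with `id_vars`/`value_vars` replaced by `index`/`on`. Note that the `melt` alias survives but takes the new signature (positional `on`), not the old keyword form:

    require "polars-df"

    df = Polars::DataFrame.new(
      {"a" => ["x", "y", "z"], "b" => [1, 3, 5], "c" => [2, 4, 6]}
    )

    # 0.10: df.melt(id_vars: "a", value_vars: ["b", "c"])
    df.unpivot(["b", "c"], index: "a")
    df.melt(["b", "c"], index: "a")  # the alias uses the unpivot signature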
@@ -3774,7 +3550,7 @@ module Polars
    #   # ┌─────────┐
    #   # │ literal │
    #   # │ ---     │
-    #   # │ i64     │
+    #   # │ i32     │
    #   # ╞═════════╡
    #   # │ 0       │
    #   # │ 0       │
@@ -4362,7 +4138,7 @@ module Polars
      end

      if subset.is_a?(::Array) && subset.length == 1
-        expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
+        expr = Utils.wrap_expr(Utils.parse_into_expression(subset[0], str_as_lit: false))
      else
        struct_fields = subset.nil? ? Polars.all : subset
        expr = Polars.struct(struct_fields)
@@ -4780,7 +4556,7 @@ module Polars
    #   # │ 3   ┆ 7   │
    #   # └─────┴─────┘
    def gather_every(n, offset = 0)
-      select(Utils.col("*").gather_every(n, offset))
+      select(F.col("*").gather_every(n, offset))
    end
    alias_method :take_every, :gather_every

@@ -4850,7 +4626,7 @@ module Polars
    #   # │ 10.0 ┆ null ┆ 9.0      │
    #   # └──────┴──────┴──────────┘
    def interpolate
-      select(Utils.col("*").interpolate)
+      select(F.col("*").interpolate)
    end

    # Check if the dataframe is empty.
@@ -4986,19 +4762,16 @@ module Polars
    #
    # @param column [Object]
    #   Columns that are sorted
-    # @param more_columns [Object]
-    #   Additional columns that are sorted, specified as positional arguments.
    # @param descending [Boolean]
    #   Whether the columns are sorted in descending order.
    #
    # @return [DataFrame]
    def set_sorted(
      column,
-      *more_columns,
      descending: false
    )
      lazy
-        .set_sorted(column, *more_columns, descending: descending)
+        .set_sorted(column, descending: descending)
        .collect(no_optimization: true)
    end

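`set_sorted` now takes a single column; the `*more_columns` splat is gone. A sketch of marking several columns by chaining per-column calls instead:

    require "polars-df"

    df = Polars::DataFrame.new({"dt" => [1, 2, 3], "x" => [9, 9, 9]})

    # 0.10: df.set_sorted("dt", "x")
    df.set_sorted("dt").set_sorted("x")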
@@ -5255,7 +5028,7 @@ module Polars
      elsif data[0].is_a?(Hash)
        column_names, dtypes = _unpack_schema(columns)
        schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
-        rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
+        rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, false, infer_schema_length)
        if column_names
          rbdf = _post_apply_columns(rbdf, column_names)
        end
@@ -5289,7 +5062,7 @@ module Polars
        if unpack_nested
          raise Todo
        else
-          rbdf = RbDataFrame.read_rows(
+          rbdf = RbDataFrame.from_rows(
            data,
            infer_schema_length,
            local_schema_override.any? ? local_schema_override : nil