polars-df 0.10.0-arm64-darwin → 0.12.0-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/Cargo.lock +392 -351
- data/LICENSE-THIRD-PARTY.txt +1127 -867
- data/README.md +6 -6
- data/lib/polars/3.1/polars.bundle +0 -0
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/3.3/polars.bundle +0 -0
- data/lib/polars/array_expr.rb +4 -4
- data/lib/polars/batched_csv_reader.rb +11 -5
- data/lib/polars/cat_expr.rb +0 -36
- data/lib/polars/cat_name_space.rb +0 -37
- data/lib/polars/convert.rb +6 -1
- data/lib/polars/data_frame.rb +176 -403
- data/lib/polars/data_types.rb +1 -1
- data/lib/polars/date_time_expr.rb +525 -572
- data/lib/polars/date_time_name_space.rb +263 -460
- data/lib/polars/dynamic_group_by.rb +5 -5
- data/lib/polars/exceptions.rb +7 -0
- data/lib/polars/expr.rb +1394 -243
- data/lib/polars/expr_dispatch.rb +1 -1
- data/lib/polars/functions/aggregation/horizontal.rb +8 -8
- data/lib/polars/functions/as_datatype.rb +63 -40
- data/lib/polars/functions/lazy.rb +63 -14
- data/lib/polars/functions/lit.rb +1 -1
- data/lib/polars/functions/range/date_range.rb +90 -57
- data/lib/polars/functions/range/datetime_range.rb +149 -0
- data/lib/polars/functions/range/int_range.rb +2 -2
- data/lib/polars/functions/range/time_range.rb +141 -0
- data/lib/polars/functions/repeat.rb +1 -1
- data/lib/polars/functions/whenthen.rb +1 -1
- data/lib/polars/group_by.rb +88 -23
- data/lib/polars/io/avro.rb +24 -0
- data/lib/polars/{io.rb → io/csv.rb} +299 -493
- data/lib/polars/io/database.rb +73 -0
- data/lib/polars/io/ipc.rb +247 -0
- data/lib/polars/io/json.rb +29 -0
- data/lib/polars/io/ndjson.rb +80 -0
- data/lib/polars/io/parquet.rb +227 -0
- data/lib/polars/lazy_frame.rb +143 -272
- data/lib/polars/lazy_group_by.rb +100 -3
- data/lib/polars/list_expr.rb +11 -11
- data/lib/polars/list_name_space.rb +5 -1
- data/lib/polars/rolling_group_by.rb +7 -9
- data/lib/polars/series.rb +103 -187
- data/lib/polars/string_expr.rb +78 -102
- data/lib/polars/string_name_space.rb +5 -4
- data/lib/polars/testing.rb +2 -2
- data/lib/polars/utils/constants.rb +9 -0
- data/lib/polars/utils/convert.rb +97 -0
- data/lib/polars/utils/parse.rb +89 -0
- data/lib/polars/utils/various.rb +76 -0
- data/lib/polars/utils/wrap.rb +19 -0
- data/lib/polars/utils.rb +8 -300
- data/lib/polars/version.rb +1 -1
- data/lib/polars/whenthen.rb +6 -6
- data/lib/polars.rb +20 -1
- metadata +17 -4
data/lib/polars/data_frame.rb
CHANGED
@@ -46,271 +46,6 @@ module Polars
|
|
46
46
|
df
|
47
47
|
end
|
48
48
|
|
49
|
-
# @private
|
50
|
-
def self._from_hashes(data, infer_schema_length: 100, schema: nil, schema_overrides: nil)
|
51
|
-
rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
|
52
|
-
_from_rbdf(rbdf)
|
53
|
-
end
|
54
|
-
|
55
|
-
# @private
|
56
|
-
def self._from_hash(data, schema: nil, schema_overrides: nil)
|
57
|
-
_from_rbdf(hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides))
|
58
|
-
end
|
59
|
-
|
60
|
-
# def self._from_records
|
61
|
-
# end
|
62
|
-
|
63
|
-
# def self._from_numo
|
64
|
-
# end
|
65
|
-
|
66
|
-
# no self._from_arrow
|
67
|
-
|
68
|
-
# no self._from_pandas
|
69
|
-
|
70
|
-
# @private
|
71
|
-
def self._read_csv(
|
72
|
-
file,
|
73
|
-
has_header: true,
|
74
|
-
columns: nil,
|
75
|
-
sep: str = ",",
|
76
|
-
comment_char: nil,
|
77
|
-
quote_char: '"',
|
78
|
-
skip_rows: 0,
|
79
|
-
dtypes: nil,
|
80
|
-
null_values: nil,
|
81
|
-
ignore_errors: false,
|
82
|
-
parse_dates: false,
|
83
|
-
n_threads: nil,
|
84
|
-
infer_schema_length: 100,
|
85
|
-
batch_size: 8192,
|
86
|
-
n_rows: nil,
|
87
|
-
encoding: "utf8",
|
88
|
-
low_memory: false,
|
89
|
-
rechunk: true,
|
90
|
-
skip_rows_after_header: 0,
|
91
|
-
row_count_name: nil,
|
92
|
-
row_count_offset: 0,
|
93
|
-
sample_size: 1024,
|
94
|
-
eol_char: "\n",
|
95
|
-
truncate_ragged_lines: false
|
96
|
-
)
|
97
|
-
if Utils.pathlike?(file)
|
98
|
-
path = Utils.normalise_filepath(file)
|
99
|
-
else
|
100
|
-
path = nil
|
101
|
-
# if defined?(StringIO) && file.is_a?(StringIO)
|
102
|
-
# file = file.string
|
103
|
-
# end
|
104
|
-
end
|
105
|
-
|
106
|
-
dtype_list = nil
|
107
|
-
dtype_slice = nil
|
108
|
-
if !dtypes.nil?
|
109
|
-
if dtypes.is_a?(Hash)
|
110
|
-
dtype_list = []
|
111
|
-
dtypes.each do|k, v|
|
112
|
-
dtype_list << [k, Utils.rb_type_to_dtype(v)]
|
113
|
-
end
|
114
|
-
elsif dtypes.is_a?(::Array)
|
115
|
-
dtype_slice = dtypes
|
116
|
-
else
|
117
|
-
raise ArgumentError, "dtype arg should be list or dict"
|
118
|
-
end
|
119
|
-
end
|
120
|
-
|
121
|
-
processed_null_values = Utils._process_null_values(null_values)
|
122
|
-
|
123
|
-
if columns.is_a?(::String)
|
124
|
-
columns = [columns]
|
125
|
-
end
|
126
|
-
if file.is_a?(::String) && file.include?("*")
|
127
|
-
dtypes_dict = nil
|
128
|
-
if !dtype_list.nil?
|
129
|
-
dtypes_dict = dtype_list.to_h
|
130
|
-
end
|
131
|
-
if !dtype_slice.nil?
|
132
|
-
raise ArgumentError, "cannot use glob patterns and unnamed dtypes as `dtypes` argument; Use dtypes: Mapping[str, Type[DataType]"
|
133
|
-
end
|
134
|
-
scan = Polars.scan_csv(
|
135
|
-
file,
|
136
|
-
has_header: has_header,
|
137
|
-
sep: sep,
|
138
|
-
comment_char: comment_char,
|
139
|
-
quote_char: quote_char,
|
140
|
-
skip_rows: skip_rows,
|
141
|
-
dtypes: dtypes_dict,
|
142
|
-
null_values: null_values,
|
143
|
-
ignore_errors: ignore_errors,
|
144
|
-
infer_schema_length: infer_schema_length,
|
145
|
-
n_rows: n_rows,
|
146
|
-
low_memory: low_memory,
|
147
|
-
rechunk: rechunk,
|
148
|
-
skip_rows_after_header: skip_rows_after_header,
|
149
|
-
row_count_name: row_count_name,
|
150
|
-
row_count_offset: row_count_offset,
|
151
|
-
eol_char: eol_char,
|
152
|
-
truncate_ragged_lines: truncate_ragged_lines
|
153
|
-
)
|
154
|
-
if columns.nil?
|
155
|
-
return _from_rbdf(scan.collect._df)
|
156
|
-
elsif is_str_sequence(columns, allow_str: false)
|
157
|
-
return _from_rbdf(scan.select(columns).collect._df)
|
158
|
-
else
|
159
|
-
raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: List[str]"
|
160
|
-
end
|
161
|
-
end
|
162
|
-
|
163
|
-
projection, columns = Utils.handle_projection_columns(columns)
|
164
|
-
|
165
|
-
_from_rbdf(
|
166
|
-
RbDataFrame.read_csv(
|
167
|
-
file,
|
168
|
-
infer_schema_length,
|
169
|
-
batch_size,
|
170
|
-
has_header,
|
171
|
-
ignore_errors,
|
172
|
-
n_rows,
|
173
|
-
skip_rows,
|
174
|
-
projection,
|
175
|
-
sep,
|
176
|
-
rechunk,
|
177
|
-
columns,
|
178
|
-
encoding,
|
179
|
-
n_threads,
|
180
|
-
path,
|
181
|
-
dtype_list,
|
182
|
-
dtype_slice,
|
183
|
-
low_memory,
|
184
|
-
comment_char,
|
185
|
-
quote_char,
|
186
|
-
processed_null_values,
|
187
|
-
parse_dates,
|
188
|
-
skip_rows_after_header,
|
189
|
-
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
190
|
-
sample_size,
|
191
|
-
eol_char,
|
192
|
-
truncate_ragged_lines
|
193
|
-
)
|
194
|
-
)
|
195
|
-
end
|
196
|
-
|
197
|
-
# @private
|
198
|
-
def self._read_parquet(
|
199
|
-
source,
|
200
|
-
columns: nil,
|
201
|
-
n_rows: nil,
|
202
|
-
parallel: "auto",
|
203
|
-
row_count_name: nil,
|
204
|
-
row_count_offset: 0,
|
205
|
-
low_memory: false,
|
206
|
-
use_statistics: true,
|
207
|
-
rechunk: true
|
208
|
-
)
|
209
|
-
if Utils.pathlike?(source)
|
210
|
-
source = Utils.normalise_filepath(source)
|
211
|
-
end
|
212
|
-
if columns.is_a?(::String)
|
213
|
-
columns = [columns]
|
214
|
-
end
|
215
|
-
|
216
|
-
if source.is_a?(::String) && source.include?("*") && Utils.local_file?(source)
|
217
|
-
scan =
|
218
|
-
Polars.scan_parquet(
|
219
|
-
source,
|
220
|
-
n_rows: n_rows,
|
221
|
-
rechunk: true,
|
222
|
-
parallel: parallel,
|
223
|
-
row_count_name: row_count_name,
|
224
|
-
row_count_offset: row_count_offset,
|
225
|
-
low_memory: low_memory
|
226
|
-
)
|
227
|
-
|
228
|
-
if columns.nil?
|
229
|
-
return self._from_rbdf(scan.collect._df)
|
230
|
-
elsif Utils.is_str_sequence(columns, allow_str: false)
|
231
|
-
return self._from_rbdf(scan.select(columns).collect._df)
|
232
|
-
else
|
233
|
-
raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: Array[String]"
|
234
|
-
end
|
235
|
-
end
|
236
|
-
|
237
|
-
projection, columns = Utils.handle_projection_columns(columns)
|
238
|
-
_from_rbdf(
|
239
|
-
RbDataFrame.read_parquet(
|
240
|
-
source,
|
241
|
-
columns,
|
242
|
-
projection,
|
243
|
-
n_rows,
|
244
|
-
parallel,
|
245
|
-
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
246
|
-
low_memory,
|
247
|
-
use_statistics,
|
248
|
-
rechunk
|
249
|
-
)
|
250
|
-
)
|
251
|
-
end
|
252
|
-
|
253
|
-
# @private
|
254
|
-
def self._read_avro(file, columns: nil, n_rows: nil)
|
255
|
-
if Utils.pathlike?(file)
|
256
|
-
file = Utils.normalise_filepath(file)
|
257
|
-
end
|
258
|
-
projection, columns = Utils.handle_projection_columns(columns)
|
259
|
-
_from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
|
260
|
-
end
|
261
|
-
|
262
|
-
# @private
|
263
|
-
def self._read_ipc(
|
264
|
-
file,
|
265
|
-
columns: nil,
|
266
|
-
n_rows: nil,
|
267
|
-
row_count_name: nil,
|
268
|
-
row_count_offset: 0,
|
269
|
-
rechunk: true,
|
270
|
-
memory_map: true
|
271
|
-
)
|
272
|
-
if Utils.pathlike?(file)
|
273
|
-
file = Utils.normalise_filepath(file)
|
274
|
-
end
|
275
|
-
if columns.is_a?(::String)
|
276
|
-
columns = [columns]
|
277
|
-
end
|
278
|
-
|
279
|
-
if file.is_a?(::String) && file.include?("*")
|
280
|
-
raise Todo
|
281
|
-
end
|
282
|
-
|
283
|
-
projection, columns = Utils.handle_projection_columns(columns)
|
284
|
-
_from_rbdf(
|
285
|
-
RbDataFrame.read_ipc(
|
286
|
-
file,
|
287
|
-
columns,
|
288
|
-
projection,
|
289
|
-
n_rows,
|
290
|
-
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
291
|
-
memory_map
|
292
|
-
)
|
293
|
-
)
|
294
|
-
end
|
295
|
-
|
296
|
-
# @private
|
297
|
-
def self._read_json(file)
|
298
|
-
if Utils.pathlike?(file)
|
299
|
-
file = Utils.normalise_filepath(file)
|
300
|
-
end
|
301
|
-
|
302
|
-
_from_rbdf(RbDataFrame.read_json(file))
|
303
|
-
end
|
304
|
-
|
305
|
-
# @private
|
306
|
-
def self._read_ndjson(file)
|
307
|
-
if Utils.pathlike?(file)
|
308
|
-
file = Utils.normalise_filepath(file)
|
309
|
-
end
|
310
|
-
|
311
|
-
_from_rbdf(RbDataFrame.read_ndjson(file))
|
312
|
-
end
|
313
|
-
|
314
49
|
# Get the shape of the DataFrame.
|
315
50
|
#
|
316
51
|
# @return [Array]
|
@@ -419,6 +154,13 @@ module Polars
|
|
419
154
|
_df.dtypes
|
420
155
|
end
|
421
156
|
|
157
|
+
# Get flags that are set on the columns of this DataFrame.
|
158
|
+
#
|
159
|
+
# @return [Hash]
|
160
|
+
def flags
|
161
|
+
columns.to_h { |name| [name, self[name].flags] }
|
162
|
+
end
|
163
|
+
|
422
164
|
# Get the schema.
|
423
165
|
#
|
424
166
|
# @return [Hash]
|
@@ -845,7 +587,7 @@ module Polars
|
|
845
587
|
row_oriented: false
|
846
588
|
)
|
847
589
|
if Utils.pathlike?(file)
|
848
|
-
file = Utils.normalise_filepath(file)
|
590
|
+
file = Utils.normalize_filepath(file)
|
849
591
|
end
|
850
592
|
to_string_io = !file.nil? && file.is_a?(StringIO)
|
851
593
|
if file.nil? || to_string_io
|
@@ -880,11 +622,11 @@ module Polars
|
|
880
622
|
# "bar" => [6, 7, 8]
|
881
623
|
# }
|
882
624
|
# )
|
883
|
-
# df.write_ndjson
|
625
|
+
# df.write_ndjson
|
884
626
|
# # => "{\"foo\":1,\"bar\":6}\n{\"foo\":2,\"bar\":7}\n{\"foo\":3,\"bar\":8}\n"
|
885
627
|
def write_ndjson(file = nil)
|
886
628
|
if Utils.pathlike?(file)
|
887
|
-
file = Utils.normalise_filepath(file)
|
629
|
+
file = Utils.normalize_filepath(file)
|
888
630
|
end
|
889
631
|
to_string_io = !file.nil? && file.is_a?(StringIO)
|
890
632
|
if file.nil? || to_string_io
|
@@ -991,7 +733,7 @@ module Polars
|
|
991
733
|
end
|
992
734
|
|
993
735
|
if Utils.pathlike?(file)
|
994
|
-
file = Utils.normalise_filepath(file)
|
736
|
+
file = Utils.normalize_filepath(file)
|
995
737
|
end
|
996
738
|
|
997
739
|
_df.write_csv(
|
@@ -1029,7 +771,7 @@ module Polars
|
|
1029
771
|
compression = "uncompressed"
|
1030
772
|
end
|
1031
773
|
if Utils.pathlike?(file)
|
1032
|
-
file = Utils.normalise_filepath(file)
|
774
|
+
file = Utils.normalize_filepath(file)
|
1033
775
|
end
|
1034
776
|
|
1035
777
|
_df.write_avro(file, compression)
|
@@ -1050,7 +792,7 @@ module Polars
|
|
1050
792
|
file.set_encoding(Encoding::BINARY)
|
1051
793
|
end
|
1052
794
|
if Utils.pathlike?(file)
|
1053
|
-
file = Utils.normalise_filepath(file)
|
795
|
+
file = Utils.normalize_filepath(file)
|
1054
796
|
end
|
1055
797
|
|
1056
798
|
if compression.nil?
|
@@ -1061,6 +803,47 @@ module Polars
|
|
1061
803
|
return_bytes ? file.string : nil
|
1062
804
|
end
|
1063
805
|
|
806
|
+
# Write to Arrow IPC record batch stream.
|
807
|
+
#
|
808
|
+
# See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html.
|
809
|
+
#
|
810
|
+
# @param file [Object]
|
811
|
+
# Path or writable file-like object to which the IPC record batch data will
|
812
|
+
# be written. If set to `nil`, the output is returned as a binary String.
|
813
|
+
# @param compression ['uncompressed', 'lz4', 'zstd']
|
814
|
+
# Compression method. Defaults to "uncompressed".
|
815
|
+
#
|
816
|
+
# @return [Object]
|
817
|
+
#
|
818
|
+
# @example
|
819
|
+
# df = Polars::DataFrame.new(
|
820
|
+
# {
|
821
|
+
# "foo" => [1, 2, 3, 4, 5],
|
822
|
+
# "bar" => [6, 7, 8, 9, 10],
|
823
|
+
# "ham" => ["a", "b", "c", "d", "e"]
|
824
|
+
# }
|
825
|
+
# )
|
826
|
+
# df.write_ipc_stream("new_file.arrow")
|
827
|
+
def write_ipc_stream(
|
828
|
+
file,
|
829
|
+
compression: "uncompressed"
|
830
|
+
)
|
831
|
+
return_bytes = file.nil?
|
832
|
+
if return_bytes
|
833
|
+
file = StringIO.new
|
834
|
+
file.set_encoding(Encoding::BINARY)
|
835
|
+
elsif Utils.pathlike?(file)
|
836
|
+
file = Utils.normalize_filepath(file)
|
837
|
+
end
|
838
|
+
|
839
|
+
if compression.nil?
|
840
|
+
compression = "uncompressed"
|
841
|
+
end
|
842
|
+
|
843
|
+
_df.write_ipc_stream(file, compression)
|
844
|
+
return_bytes ? file.string : nil
|
845
|
+
end
|
846
|
+
|
1064
847
|
# Write to Apache Parquet file.
|
1065
848
|
#
|
1066
849
|
# @param file [String, Pathname, StringIO]
|
@@ -1097,7 +880,25 @@ module Polars
|
|
1097
880
|
compression = "uncompressed"
|
1098
881
|
end
|
1099
882
|
if Utils.pathlike?(file)
|
1100
|
-
file = Utils.normalise_filepath(file)
|
883
|
+
file = Utils.normalize_filepath(file)
|
884
|
+
end
|
885
|
+
|
886
|
+
if statistics == true
|
887
|
+
statistics = {
|
888
|
+
min: true,
|
889
|
+
max: true,
|
890
|
+
distinct_count: false,
|
891
|
+
null_count: true
|
892
|
+
}
|
893
|
+
elsif statistics == false
|
894
|
+
statistics = {}
|
895
|
+
elsif statistics == "full"
|
896
|
+
statistics = {
|
897
|
+
min: true,
|
898
|
+
max: true,
|
899
|
+
distinct_count: true,
|
900
|
+
null_count: true
|
901
|
+
}
|
1101
902
|
end
|
1102
903
|
|
1103
904
|
_df.write_parquet(
|
@@ -1773,10 +1574,7 @@ module Polars
|
|
1773
1574
|
# # │ 3 ┆ 8 ┆ c │
|
1774
1575
|
# # └─────┴─────┴─────┘
|
1775
1576
|
def drop_nulls(subset: nil)
|
1776
|
-
|
1777
|
-
subset = [subset]
|
1778
|
-
end
|
1779
|
-
_from_rbdf(_df.drop_nulls(subset))
|
1577
|
+
lazy.drop_nulls(subset: subset).collect(_eager: true)
|
1780
1578
|
end
|
1781
1579
|
|
1782
1580
|
# Offers a structured way to apply a sequence of user-defined functions (UDFs).
|
@@ -1838,16 +1636,16 @@ module Polars
|
|
1838
1636
|
# df.with_row_index
|
1839
1637
|
# # =>
|
1840
1638
|
# # shape: (3, 3)
|
1841
|
-
# #
|
1842
|
-
# # │
|
1843
|
-
# # │ ---
|
1844
|
-
# # │ u32
|
1845
|
-
# #
|
1846
|
-
# # │ 0
|
1847
|
-
# # │ 1
|
1848
|
-
# # │ 2
|
1849
|
-
# #
|
1850
|
-
def with_row_index(name: "
|
1639
|
+
# # ┌───────┬─────┬─────┐
|
1640
|
+
# # │ index ┆ a ┆ b │
|
1641
|
+
# # │ --- ┆ --- ┆ --- │
|
1642
|
+
# # │ u32 ┆ i64 ┆ i64 │
|
1643
|
+
# # ╞═══════╪═════╪═════╡
|
1644
|
+
# # │ 0 ┆ 1 ┆ 2 │
|
1645
|
+
# # │ 1 ┆ 3 ┆ 4 │
|
1646
|
+
# # │ 2 ┆ 5 ┆ 6 │
|
1647
|
+
# # └───────┴─────┴─────┘
|
1648
|
+
def with_row_index(name: "index", offset: 0)
|
1851
1649
|
_from_rbdf(_df.with_row_index(name, offset))
|
1852
1650
|
end
|
1853
1651
|
alias_method :with_row_count, :with_row_index
|
@@ -1944,12 +1742,6 @@ module Polars
|
|
1944
1742
|
# Define whether the temporal window interval is closed or not.
|
1945
1743
|
# @param by [Object]
|
1946
1744
|
# Also group by this column/these columns.
|
1947
|
-
# @param check_sorted [Boolean]
|
1948
|
-
# When the `by` argument is given, polars can not check sortedness
|
1949
|
-
# by the metadata and has to do a full scan on the index column to
|
1950
|
-
# verify data is sorted. This is expensive. If you are sure the
|
1951
|
-
# data within the by groups is sorted, you can set this to `false`.
|
1952
|
-
# Doing so incorrectly will lead to incorrect output
|
1953
1745
|
#
|
1954
1746
|
# @return [RollingGroupBy]
|
1955
1747
|
#
|
@@ -1965,7 +1757,7 @@ module Polars
|
|
1965
1757
|
# df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
|
1966
1758
|
# Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
|
1967
1759
|
# )
|
1968
|
-
# df.
|
1760
|
+
# df.rolling(index_column: "dt", period: "2d").agg(
|
1969
1761
|
# [
|
1970
1762
|
# Polars.sum("a").alias("sum_a"),
|
1971
1763
|
# Polars.min("a").alias("min_a"),
|
@@ -1986,17 +1778,17 @@ module Polars
|
|
1986
1778
|
# # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
|
1987
1779
|
# # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
|
1988
1780
|
# # └─────────────────────┴───────┴───────┴───────┘
|
1989
|
-
def
|
1781
|
+
def rolling(
|
1990
1782
|
index_column:,
|
1991
1783
|
period:,
|
1992
1784
|
offset: nil,
|
1993
1785
|
closed: "right",
|
1994
|
-
by: nil
|
1995
|
-
check_sorted: true
|
1786
|
+
by: nil
|
1996
1787
|
)
|
1997
|
-
RollingGroupBy.new(self, index_column, period, offset, closed, by
|
1788
|
+
RollingGroupBy.new(self, index_column, period, offset, closed, by)
|
1998
1789
|
end
|
1999
|
-
alias_method :groupby_rolling, :
|
1790
|
+
alias_method :groupby_rolling, :rolling
|
1791
|
+
alias_method :group_by_rolling, :rolling
|
2000
1792
|
|
2001
1793
|
# Group based on a time value (or index value of type `:i32`, `:i64`).
|
2002
1794
|
#
|
@@ -2066,10 +1858,12 @@ module Polars
|
|
2066
1858
|
# @example
|
2067
1859
|
# df = Polars::DataFrame.new(
|
2068
1860
|
# {
|
2069
|
-
# "time" => Polars.
|
1861
|
+
# "time" => Polars.datetime_range(
|
2070
1862
|
# DateTime.new(2021, 12, 16),
|
2071
1863
|
# DateTime.new(2021, 12, 16, 3),
|
2072
|
-
# "30m"
|
1864
|
+
# "30m",
|
1865
|
+
# time_unit: "us",
|
1866
|
+
# eager: true
|
2073
1867
|
# ),
|
2074
1868
|
# "n" => 0..6
|
2075
1869
|
# }
|
@@ -2136,16 +1930,16 @@ module Polars
|
|
2136
1930
|
# )
|
2137
1931
|
# # =>
|
2138
1932
|
# # shape: (4, 3)
|
2139
|
-
# #
|
2140
|
-
# # │ time ┆ time_count ┆ time_agg_list
|
2141
|
-
# # │ --- ┆ --- ┆ ---
|
2142
|
-
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
|
2143
|
-
# #
|
2144
|
-
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12
|
2145
|
-
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12
|
2146
|
-
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12
|
2147
|
-
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
|
2148
|
-
# #
|
1933
|
+
# # ┌─────────────────────┬────────────┬─────────────────────────────────┐
|
1934
|
+
# # │ time ┆ time_count ┆ time_agg_list │
|
1935
|
+
# # │ --- ┆ --- ┆ --- │
|
1936
|
+
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
|
1937
|
+
# # ╞═════════════════════╪════════════╪═════════════════════════════════╡
|
1938
|
+
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-… │
|
1939
|
+
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-… │
|
1940
|
+
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-… │
|
1941
|
+
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
|
1942
|
+
# # └─────────────────────┴────────────┴─────────────────────────────────┘
|
2149
1943
|
#
|
2150
1944
|
# @example When closed="both" the time values at the window boundaries belong to 2 groups.
|
2151
1945
|
# df.group_by_dynamic("time", every: "1h", closed: "both").agg(
|
@@ -2168,10 +1962,12 @@ module Polars
|
|
2168
1962
|
# @example Dynamic group bys can also be combined with grouping on normal keys.
|
2169
1963
|
# df = Polars::DataFrame.new(
|
2170
1964
|
# {
|
2171
|
-
# "time" => Polars.
|
1965
|
+
# "time" => Polars.datetime_range(
|
2172
1966
|
# DateTime.new(2021, 12, 16),
|
2173
1967
|
# DateTime.new(2021, 12, 16, 3),
|
2174
|
-
# "30m"
|
1968
|
+
# "30m",
|
1969
|
+
# time_unit: "us",
|
1970
|
+
# eager: true
|
2175
1971
|
# ),
|
2176
1972
|
# "groups" => ["a", "a", "a", "b", "b", "a", "a"]
|
2177
1973
|
# }
|
@@ -2258,8 +2054,6 @@ module Polars
|
|
2258
2054
|
# Note that this column has to be sorted for the output to make sense.
|
2259
2055
|
# @param every [String]
|
2260
2056
|
# interval will start 'every' duration
|
2261
|
-
# @param offset [String]
|
2262
|
-
# change the start of the date_range by this offset.
|
2263
2057
|
# @param by [Object]
|
2264
2058
|
# First group by these columns and then upsample for every group
|
2265
2059
|
# @param maintain_order [Boolean]
|
@@ -2319,7 +2113,6 @@ module Polars
|
|
2319
2113
|
def upsample(
|
2320
2114
|
time_column:,
|
2321
2115
|
every:,
|
2322
|
-
offset: nil,
|
2323
2116
|
by: nil,
|
2324
2117
|
maintain_order: false
|
2325
2118
|
)
|
@@ -2329,15 +2122,11 @@ module Polars
|
|
2329
2122
|
if by.is_a?(::String)
|
2330
2123
|
by = [by]
|
2331
2124
|
end
|
2332
|
-
if offset.nil?
|
2333
|
-
offset = "0ns"
|
2334
|
-
end
|
2335
2125
|
|
2336
|
-
every = Utils._timedelta_to_pl_duration(every)
|
2337
|
-
offset = Utils._timedelta_to_pl_duration(offset)
|
2126
|
+
every = Utils.parse_as_duration_string(every)
|
2338
2127
|
|
2339
2128
|
_from_rbdf(
|
2340
|
-
_df.upsample(by, time_column, every, offset, maintain_order)
|
2129
|
+
_df.upsample(by, time_column, every, maintain_order)
|
2341
2130
|
)
|
2342
2131
|
end
|
2343
2132
|
|
@@ -2484,7 +2273,7 @@ module Polars
|
|
2484
2273
|
# Name(s) of the right join column(s).
|
2485
2274
|
# @param on [Object]
|
2486
2275
|
# Name(s) of the join columns in both DataFrames.
|
2487
|
-
# @param how ["inner", "left", "
|
2276
|
+
# @param how ["inner", "left", "full", "semi", "anti", "cross"]
|
2488
2277
|
# Join strategy.
|
2489
2278
|
# @param suffix [String]
|
2490
2279
|
# Suffix to append to columns with a duplicate name.
|
@@ -2520,7 +2309,7 @@ module Polars
|
|
2520
2309
|
# # └─────┴─────┴─────┴───────┘
|
2521
2310
|
#
|
2522
2311
|
# @example
|
2523
|
-
# df.join(other_df, on: "ham", how: "
|
2312
|
+
# df.join(other_df, on: "ham", how: "full")
|
2524
2313
|
# # =>
|
2525
2314
|
# # shape: (4, 5)
|
2526
2315
|
# # ┌──────┬──────┬──────┬───────┬───────────┐
|
@@ -2620,7 +2409,7 @@ module Polars
|
|
2620
2409
|
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})
|
2621
2410
|
#
|
2622
2411
|
# @example Return a DataFrame by mapping each row to a tuple:
|
2623
|
-
# df.
|
2412
|
+
# df.map_rows { |t| [t[0] * 2, t[1] * 3] }
|
2624
2413
|
# # =>
|
2625
2414
|
# # shape: (3, 2)
|
2626
2415
|
# # ┌──────────┬──────────┐
|
@@ -2634,7 +2423,7 @@ module Polars
|
|
2634
2423
|
# # └──────────┴──────────┘
|
2635
2424
|
#
|
2636
2425
|
# @example Return a Series by mapping each row to a scalar:
|
2637
|
-
# df.
|
2426
|
+
# df.map_rows { |t| t[0] * 2 + t[1] }
|
2638
2427
|
# # =>
|
2639
2428
|
# # shape: (3, 1)
|
2640
2429
|
# # ┌───────┐
|
@@ -2646,14 +2435,15 @@ module Polars
|
|
2646
2435
|
# # │ 9 │
|
2647
2436
|
# # │ 14 │
|
2648
2437
|
# # └───────┘
|
2649
|
-
def
|
2650
|
-
out, is_df = _df.
|
2438
|
+
def map_rows(return_dtype: nil, inference_size: 256, &f)
|
2439
|
+
out, is_df = _df.map_rows(f, return_dtype, inference_size)
|
2651
2440
|
if is_df
|
2652
2441
|
_from_rbdf(out)
|
2653
2442
|
else
|
2654
2443
|
_from_rbdf(Utils.wrap_s(out).to_frame._df)
|
2655
2444
|
end
|
2656
2445
|
end
|
2446
|
+
alias_method :apply, :map_rows
|
2657
2447
|
|
2658
2448
|
# Return a new DataFrame with the column added or replaced.
|
2659
2449
|
#
|
@@ -3176,9 +2966,9 @@ module Polars
|
|
3176
2966
|
# arguments contains multiple columns as well
|
3177
2967
|
# @param index [Object]
|
3178
2968
|
# One or multiple keys to group by
|
3179
|
-
# @param
|
2969
|
+
# @param on [Object]
|
3180
2970
|
# Columns whose values will be used as the header of the output DataFrame
|
3181
|
-
# @param
|
2971
|
+
# @param aggregate_function ["first", "sum", "max", "min", "mean", "median", "last", "len", "count"]
|
3182
2972
|
# A predefined aggregate function str or an expression.
|
3183
2973
|
# @param maintain_order [Object]
|
3184
2974
|
# Sort the grouped keys so that the output order is predictable.
|
@@ -3190,66 +2980,62 @@ module Polars
|
|
3190
2980
|
# @example
|
3191
2981
|
# df = Polars::DataFrame.new(
|
3192
2982
|
# {
|
3193
|
-
# "foo" => ["one", "one", "
|
3194
|
-
# "bar" => ["
|
2983
|
+
# "foo" => ["one", "one", "two", "two", "one", "two"],
|
2984
|
+
# "bar" => ["y", "y", "y", "x", "x", "x"],
|
3195
2985
|
# "baz" => [1, 2, 3, 4, 5, 6]
|
3196
2986
|
# }
|
3197
2987
|
# )
|
3198
|
-
# df.pivot(
|
2988
|
+
# df.pivot("bar", index: "foo", values: "baz", aggregate_function: "sum")
|
3199
2989
|
# # =>
|
3200
|
-
# # shape: (2,
|
3201
|
-
# #
|
3202
|
-
# # │ foo ┆
|
3203
|
-
# # │ --- ┆ --- ┆ ---
|
3204
|
-
# # │ str ┆ i64 ┆ i64
|
3205
|
-
# #
|
3206
|
-
# # │ one ┆
|
3207
|
-
# # │ two ┆
|
3208
|
-
# #
|
2990
|
+
# # shape: (2, 3)
|
2991
|
+
# # ┌─────┬─────┬─────┐
|
2992
|
+
# # │ foo ┆ y ┆ x │
|
2993
|
+
# # │ --- ┆ --- ┆ --- │
|
2994
|
+
# # │ str ┆ i64 ┆ i64 │
|
2995
|
+
# # ╞═════╪═════╪═════╡
|
2996
|
+
# # │ one ┆ 3 ┆ 5 │
|
2997
|
+
# # │ two ┆ 3 ┆ 10 │
|
2998
|
+
# # └─────┴─────┴─────┘
|
3209
2999
|
def pivot(
|
3210
|
-
|
3211
|
-
index
|
3212
|
-
|
3213
|
-
|
3000
|
+
on,
|
3001
|
+
index: nil,
|
3002
|
+
values: nil,
|
3003
|
+
aggregate_function: nil,
|
3214
3004
|
maintain_order: true,
|
3215
3005
|
sort_columns: false,
|
3216
3006
|
separator: "_"
|
3217
3007
|
)
|
3218
|
-
|
3219
|
-
|
3220
|
-
|
3221
|
-
|
3222
|
-
index = [index]
|
3223
|
-
end
|
3224
|
-
if columns.is_a?(::String)
|
3225
|
-
columns = [columns]
|
3008
|
+
index = Utils._expand_selectors(self, index)
|
3009
|
+
on = Utils._expand_selectors(self, on)
|
3010
|
+
if !values.nil?
|
3011
|
+
values = Utils._expand_selectors(self, values)
|
3226
3012
|
end
|
3227
3013
|
|
3228
|
-
if
|
3229
|
-
case
|
3014
|
+
if aggregate_function.is_a?(::String)
|
3015
|
+
case aggregate_function
|
3230
3016
|
when "first"
|
3231
|
-
aggregate_expr =
|
3017
|
+
aggregate_expr = F.element.first._rbexpr
|
3232
3018
|
when "sum"
|
3233
|
-
aggregate_expr =
|
3019
|
+
aggregate_expr = F.element.sum._rbexpr
|
3234
3020
|
when "max"
|
3235
|
-
aggregate_expr =
|
3021
|
+
aggregate_expr = F.element.max._rbexpr
|
3236
3022
|
when "min"
|
3237
|
-
aggregate_expr =
|
3023
|
+
aggregate_expr = F.element.min._rbexpr
|
3238
3024
|
when "mean"
|
3239
|
-
aggregate_expr =
|
3025
|
+
aggregate_expr = F.element.mean._rbexpr
|
3240
3026
|
when "median"
|
3241
|
-
aggregate_expr =
|
3027
|
+
aggregate_expr = F.element.median._rbexpr
|
3242
3028
|
when "last"
|
3243
|
-
aggregate_expr =
|
3029
|
+
aggregate_expr = F.element.last._rbexpr
|
3244
3030
|
when "len"
|
3245
|
-
aggregate_expr =
|
3031
|
+
aggregate_expr = F.len._rbexpr
|
3246
3032
|
when "count"
|
3247
3033
|
warn "`aggregate_function: \"count\"` input for `pivot` is deprecated. Use `aggregate_function: \"len\"` instead."
|
3248
|
-
aggregate_expr =
|
3034
|
+
aggregate_expr = F.len._rbexpr
|
3249
3035
|
else
|
3250
3036
|
raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
|
3251
3037
|
end
|
3252
|
-
elsif
|
3038
|
+
elsif aggregate_function.nil?
|
3253
3039
|
aggregate_expr = nil
|
3254
3040
|
else
|
3255
3041
|
aggregate_expr = aggregate_function._rbexpr
|
@@ -3257,8 +3043,8 @@ module Polars
|
|
3257
3043
|
|
3258
3044
|
_from_rbdf(
|
3259
3045
|
_df.pivot_expr(
|
3046
|
+
on,
|
3260
3047
|
index,
|
3261
|
-
columns,
|
3262
3048
|
values,
|
3263
3049
|
maintain_order,
|
3264
3050
|
sort_columns,
|
@@ -3273,18 +3059,18 @@ module Polars
|
|
3273
3059
|
# Optionally leaves identifiers set.
|
3274
3060
|
#
|
3275
3061
|
# This function is useful to massage a DataFrame into a format where one or more
|
3276
|
-
# columns are identifier variables (
|
3277
|
-
# measured variables (
|
3062
|
+
# columns are identifier variables (index) while all other columns, considered
|
3063
|
+
# measured variables (on), are "unpivoted" to the row axis leaving just
|
3278
3064
|
# two non-identifier columns, 'variable' and 'value'.
|
3279
3065
|
#
|
3280
|
-
# @param
|
3281
|
-
#
|
3282
|
-
#
|
3283
|
-
#
|
3284
|
-
#
|
3285
|
-
# @param variable_name [
|
3286
|
-
# Name to give to the `
|
3287
|
-
# @param value_name [
|
3066
|
+
# @param on [Object]
|
3067
|
+
# Column(s) or selector(s) to use as values variables; if `on`
|
3068
|
+
# is empty all columns that are not in `index` will be used.
|
3069
|
+
# @param index [Object]
|
3070
|
+
# Column(s) or selector(s) to use as identifier variables.
|
3071
|
+
# @param variable_name [Object]
|
3072
|
+
# Name to give to the `variable` column. Defaults to "variable"
|
3073
|
+
# @param value_name [Object]
|
3288
3074
|
# Name to give to the `value` column. Defaults to "value"
|
3289
3075
|
#
|
3290
3076
|
# @return [DataFrame]
|
@@ -3297,7 +3083,7 @@ module Polars
|
|
3297
3083
|
# "c" => [2, 4, 6]
|
3298
3084
|
# }
|
3299
3085
|
# )
|
3300
|
-
# df.
|
3086
|
+
# df.unpivot(Polars::Selectors.numeric, index: "a")
|
3301
3087
|
# # =>
|
3302
3088
|
# # shape: (6, 3)
|
3303
3089
|
# # ┌─────┬──────────┬───────┐
|
@@ -3312,23 +3098,13 @@ module Polars
|
|
3312
3098
|
# # │ y ┆ c ┆ 4 │
|
3313
3099
|
# # │ z ┆ c ┆ 6 │
|
3314
3100
|
# # └─────┴──────────┴───────┘
|
3315
|
-
def
|
3316
|
-
|
3317
|
-
|
3318
|
-
|
3319
|
-
|
3320
|
-
id_vars = [id_vars]
|
3321
|
-
end
|
3322
|
-
if value_vars.nil?
|
3323
|
-
value_vars = []
|
3324
|
-
end
|
3325
|
-
if id_vars.nil?
|
3326
|
-
id_vars = []
|
3327
|
-
end
|
3328
|
-
_from_rbdf(
|
3329
|
-
_df.melt(id_vars, value_vars, value_name, variable_name)
|
3330
|
-
)
|
3101
|
+
def unpivot(on, index: nil, variable_name: nil, value_name: nil)
|
3102
|
+
on = on.nil? ? [] : Utils._expand_selectors(self, on)
|
3103
|
+
index = index.nil? ? [] : Utils._expand_selectors(self, index)
|
3104
|
+
|
3105
|
+
_from_rbdf(_df.unpivot(on, index, value_name, variable_name))
|
3331
3106
|
end
|
3107
|
+
alias_method :melt, :unpivot
|
3332
3108
|
|
3333
3109
|
# Unstack a long table to a wide form without doing an aggregation.
|
3334
3110
|
#
|
@@ -3774,7 +3550,7 @@ module Polars
|
|
3774
3550
|
# # ┌─────────┐
|
3775
3551
|
# # │ literal │
|
3776
3552
|
# # │ --- │
|
3777
|
-
# # │
|
3553
|
+
# # │ i32 │
|
3778
3554
|
# # ╞═════════╡
|
3779
3555
|
# # │ 0 │
|
3780
3556
|
# # │ 0 │
|
@@ -4362,7 +4138,7 @@ module Polars
|
|
4362
4138
|
end
|
4363
4139
|
|
4364
4140
|
if subset.is_a?(::Array) && subset.length == 1
|
4365
|
-
expr = Utils.
|
4141
|
+
expr = Utils.wrap_expr(Utils.parse_into_expression(subset[0], str_as_lit: false))
|
4366
4142
|
else
|
4367
4143
|
struct_fields = subset.nil? ? Polars.all : subset
|
4368
4144
|
expr = Polars.struct(struct_fields)
|
@@ -4780,7 +4556,7 @@ module Polars
|
|
4780
4556
|
# # │ 3 ┆ 7 │
|
4781
4557
|
# # └─────┴─────┘
|
4782
4558
|
def gather_every(n, offset = 0)
|
4783
|
-
select(
|
4559
|
+
select(F.col("*").gather_every(n, offset))
|
4784
4560
|
end
|
4785
4561
|
alias_method :take_every, :gather_every
|
4786
4562
|
|
@@ -4850,7 +4626,7 @@ module Polars
|
|
4850
4626
|
# # │ 10.0 ┆ null ┆ 9.0 │
|
4851
4627
|
# # └──────┴──────┴──────────┘
|
4852
4628
|
def interpolate
|
4853
|
-
select(
|
4629
|
+
select(F.col("*").interpolate)
|
4854
4630
|
end
|
4855
4631
|
|
4856
4632
|
# Check if the dataframe is empty.
|
@@ -4986,19 +4762,16 @@ module Polars
|
|
4986
4762
|
#
|
4987
4763
|
# @param column [Object]
|
4988
4764
|
# Columns that are sorted
|
4989
|
-
# @param more_columns [Object]
|
4990
|
-
# Additional columns that are sorted, specified as positional arguments.
|
4991
4765
|
# @param descending [Boolean]
|
4992
4766
|
# Whether the columns are sorted in descending order.
|
4993
4767
|
#
|
4994
4768
|
# @return [DataFrame]
|
4995
4769
|
def set_sorted(
|
4996
4770
|
column,
|
4997
|
-
*more_columns,
|
4998
4771
|
descending: false
|
4999
4772
|
)
|
5000
4773
|
lazy
|
5001
|
-
.set_sorted(column,
|
4774
|
+
.set_sorted(column, descending: descending)
|
5002
4775
|
.collect(no_optimization: true)
|
5003
4776
|
end
|
5004
4777
|
|
@@ -5255,7 +5028,7 @@ module Polars
|
|
5255
5028
|
elsif data[0].is_a?(Hash)
|
5256
5029
|
column_names, dtypes = _unpack_schema(columns)
|
5257
5030
|
schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
|
5258
|
-
rbdf = RbDataFrame.
|
5031
|
+
rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, false, infer_schema_length)
|
5259
5032
|
if column_names
|
5260
5033
|
rbdf = _post_apply_columns(rbdf, column_names)
|
5261
5034
|
end
|
@@ -5289,7 +5062,7 @@ module Polars
|
|
5289
5062
|
if unpack_nested
|
5290
5063
|
raise Todo
|
5291
5064
|
else
|
5292
|
-
rbdf = RbDataFrame.
|
5065
|
+
rbdf = RbDataFrame.from_rows(
|
5293
5066
|
data,
|
5294
5067
|
infer_schema_length,
|
5295
5068
|
local_schema_override.any? ? local_schema_override : nil
|