polars-df 0.10.0-x86_64-linux → 0.12.0-x86_64-linux
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/Cargo.lock +392 -351
- data/LICENSE-THIRD-PARTY.txt +1125 -865
- data/README.md +6 -6
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/array_expr.rb +4 -4
- data/lib/polars/batched_csv_reader.rb +11 -5
- data/lib/polars/cat_expr.rb +0 -36
- data/lib/polars/cat_name_space.rb +0 -37
- data/lib/polars/convert.rb +6 -1
- data/lib/polars/data_frame.rb +176 -403
- data/lib/polars/data_types.rb +1 -1
- data/lib/polars/date_time_expr.rb +525 -572
- data/lib/polars/date_time_name_space.rb +263 -460
- data/lib/polars/dynamic_group_by.rb +5 -5
- data/lib/polars/exceptions.rb +7 -0
- data/lib/polars/expr.rb +1394 -243
- data/lib/polars/expr_dispatch.rb +1 -1
- data/lib/polars/functions/aggregation/horizontal.rb +8 -8
- data/lib/polars/functions/as_datatype.rb +63 -40
- data/lib/polars/functions/lazy.rb +63 -14
- data/lib/polars/functions/lit.rb +1 -1
- data/lib/polars/functions/range/date_range.rb +90 -57
- data/lib/polars/functions/range/datetime_range.rb +149 -0
- data/lib/polars/functions/range/int_range.rb +2 -2
- data/lib/polars/functions/range/time_range.rb +141 -0
- data/lib/polars/functions/repeat.rb +1 -1
- data/lib/polars/functions/whenthen.rb +1 -1
- data/lib/polars/group_by.rb +88 -23
- data/lib/polars/io/avro.rb +24 -0
- data/lib/polars/{io.rb → io/csv.rb} +299 -493
- data/lib/polars/io/database.rb +73 -0
- data/lib/polars/io/ipc.rb +247 -0
- data/lib/polars/io/json.rb +29 -0
- data/lib/polars/io/ndjson.rb +80 -0
- data/lib/polars/io/parquet.rb +227 -0
- data/lib/polars/lazy_frame.rb +143 -272
- data/lib/polars/lazy_group_by.rb +100 -3
- data/lib/polars/list_expr.rb +11 -11
- data/lib/polars/list_name_space.rb +5 -1
- data/lib/polars/rolling_group_by.rb +7 -9
- data/lib/polars/series.rb +103 -187
- data/lib/polars/string_expr.rb +78 -102
- data/lib/polars/string_name_space.rb +5 -4
- data/lib/polars/testing.rb +2 -2
- data/lib/polars/utils/constants.rb +9 -0
- data/lib/polars/utils/convert.rb +97 -0
- data/lib/polars/utils/parse.rb +89 -0
- data/lib/polars/utils/various.rb +76 -0
- data/lib/polars/utils/wrap.rb +19 -0
- data/lib/polars/utils.rb +8 -300
- data/lib/polars/version.rb +1 -1
- data/lib/polars/whenthen.rb +6 -6
- data/lib/polars.rb +20 -1
- metadata +17 -4
data/lib/polars/data_frame.rb
CHANGED
@@ -46,271 +46,6 @@ module Polars
|
|
46
46
|
df
|
47
47
|
end
|
48
48
|
|
49
|
-
# @private
|
50
|
-
def self._from_hashes(data, infer_schema_length: 100, schema: nil, schema_overrides: nil)
|
51
|
-
rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
|
52
|
-
_from_rbdf(rbdf)
|
53
|
-
end
|
54
|
-
|
55
|
-
# @private
|
56
|
-
def self._from_hash(data, schema: nil, schema_overrides: nil)
|
57
|
-
_from_rbdf(hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides))
|
58
|
-
end
|
59
|
-
|
60
|
-
# def self._from_records
|
61
|
-
# end
|
62
|
-
|
63
|
-
# def self._from_numo
|
64
|
-
# end
|
65
|
-
|
66
|
-
# no self._from_arrow
|
67
|
-
|
68
|
-
# no self._from_pandas
|
69
|
-
|
70
|
-
# @private
|
71
|
-
def self._read_csv(
|
72
|
-
file,
|
73
|
-
has_header: true,
|
74
|
-
columns: nil,
|
75
|
-
sep: str = ",",
|
76
|
-
comment_char: nil,
|
77
|
-
quote_char: '"',
|
78
|
-
skip_rows: 0,
|
79
|
-
dtypes: nil,
|
80
|
-
null_values: nil,
|
81
|
-
ignore_errors: false,
|
82
|
-
parse_dates: false,
|
83
|
-
n_threads: nil,
|
84
|
-
infer_schema_length: 100,
|
85
|
-
batch_size: 8192,
|
86
|
-
n_rows: nil,
|
87
|
-
encoding: "utf8",
|
88
|
-
low_memory: false,
|
89
|
-
rechunk: true,
|
90
|
-
skip_rows_after_header: 0,
|
91
|
-
row_count_name: nil,
|
92
|
-
row_count_offset: 0,
|
93
|
-
sample_size: 1024,
|
94
|
-
eol_char: "\n",
|
95
|
-
truncate_ragged_lines: false
|
96
|
-
)
|
97
|
-
if Utils.pathlike?(file)
|
98
|
-
path = Utils.normalise_filepath(file)
|
99
|
-
else
|
100
|
-
path = nil
|
101
|
-
# if defined?(StringIO) && file.is_a?(StringIO)
|
102
|
-
# file = file.string
|
103
|
-
# end
|
104
|
-
end
|
105
|
-
|
106
|
-
dtype_list = nil
|
107
|
-
dtype_slice = nil
|
108
|
-
if !dtypes.nil?
|
109
|
-
if dtypes.is_a?(Hash)
|
110
|
-
dtype_list = []
|
111
|
-
dtypes.each do|k, v|
|
112
|
-
dtype_list << [k, Utils.rb_type_to_dtype(v)]
|
113
|
-
end
|
114
|
-
elsif dtypes.is_a?(::Array)
|
115
|
-
dtype_slice = dtypes
|
116
|
-
else
|
117
|
-
raise ArgumentError, "dtype arg should be list or dict"
|
118
|
-
end
|
119
|
-
end
|
120
|
-
|
121
|
-
processed_null_values = Utils._process_null_values(null_values)
|
122
|
-
|
123
|
-
if columns.is_a?(::String)
|
124
|
-
columns = [columns]
|
125
|
-
end
|
126
|
-
if file.is_a?(::String) && file.include?("*")
|
127
|
-
dtypes_dict = nil
|
128
|
-
if !dtype_list.nil?
|
129
|
-
dtypes_dict = dtype_list.to_h
|
130
|
-
end
|
131
|
-
if !dtype_slice.nil?
|
132
|
-
raise ArgumentError, "cannot use glob patterns and unnamed dtypes as `dtypes` argument; Use dtypes: Mapping[str, Type[DataType]"
|
133
|
-
end
|
134
|
-
scan = Polars.scan_csv(
|
135
|
-
file,
|
136
|
-
has_header: has_header,
|
137
|
-
sep: sep,
|
138
|
-
comment_char: comment_char,
|
139
|
-
quote_char: quote_char,
|
140
|
-
skip_rows: skip_rows,
|
141
|
-
dtypes: dtypes_dict,
|
142
|
-
null_values: null_values,
|
143
|
-
ignore_errors: ignore_errors,
|
144
|
-
infer_schema_length: infer_schema_length,
|
145
|
-
n_rows: n_rows,
|
146
|
-
low_memory: low_memory,
|
147
|
-
rechunk: rechunk,
|
148
|
-
skip_rows_after_header: skip_rows_after_header,
|
149
|
-
row_count_name: row_count_name,
|
150
|
-
row_count_offset: row_count_offset,
|
151
|
-
eol_char: eol_char,
|
152
|
-
truncate_ragged_lines: truncate_ragged_lines
|
153
|
-
)
|
154
|
-
if columns.nil?
|
155
|
-
return _from_rbdf(scan.collect._df)
|
156
|
-
elsif is_str_sequence(columns, allow_str: false)
|
157
|
-
return _from_rbdf(scan.select(columns).collect._df)
|
158
|
-
else
|
159
|
-
raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: List[str]"
|
160
|
-
end
|
161
|
-
end
|
162
|
-
|
163
|
-
projection, columns = Utils.handle_projection_columns(columns)
|
164
|
-
|
165
|
-
_from_rbdf(
|
166
|
-
RbDataFrame.read_csv(
|
167
|
-
file,
|
168
|
-
infer_schema_length,
|
169
|
-
batch_size,
|
170
|
-
has_header,
|
171
|
-
ignore_errors,
|
172
|
-
n_rows,
|
173
|
-
skip_rows,
|
174
|
-
projection,
|
175
|
-
sep,
|
176
|
-
rechunk,
|
177
|
-
columns,
|
178
|
-
encoding,
|
179
|
-
n_threads,
|
180
|
-
path,
|
181
|
-
dtype_list,
|
182
|
-
dtype_slice,
|
183
|
-
low_memory,
|
184
|
-
comment_char,
|
185
|
-
quote_char,
|
186
|
-
processed_null_values,
|
187
|
-
parse_dates,
|
188
|
-
skip_rows_after_header,
|
189
|
-
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
190
|
-
sample_size,
|
191
|
-
eol_char,
|
192
|
-
truncate_ragged_lines
|
193
|
-
)
|
194
|
-
)
|
195
|
-
end
|
196
|
-
|
197
|
-
# @private
|
198
|
-
def self._read_parquet(
|
199
|
-
source,
|
200
|
-
columns: nil,
|
201
|
-
n_rows: nil,
|
202
|
-
parallel: "auto",
|
203
|
-
row_count_name: nil,
|
204
|
-
row_count_offset: 0,
|
205
|
-
low_memory: false,
|
206
|
-
use_statistics: true,
|
207
|
-
rechunk: true
|
208
|
-
)
|
209
|
-
if Utils.pathlike?(source)
|
210
|
-
source = Utils.normalise_filepath(source)
|
211
|
-
end
|
212
|
-
if columns.is_a?(::String)
|
213
|
-
columns = [columns]
|
214
|
-
end
|
215
|
-
|
216
|
-
if source.is_a?(::String) && source.include?("*") && Utils.local_file?(source)
|
217
|
-
scan =
|
218
|
-
Polars.scan_parquet(
|
219
|
-
source,
|
220
|
-
n_rows: n_rows,
|
221
|
-
rechunk: true,
|
222
|
-
parallel: parallel,
|
223
|
-
row_count_name: row_count_name,
|
224
|
-
row_count_offset: row_count_offset,
|
225
|
-
low_memory: low_memory
|
226
|
-
)
|
227
|
-
|
228
|
-
if columns.nil?
|
229
|
-
return self._from_rbdf(scan.collect._df)
|
230
|
-
elsif Utils.is_str_sequence(columns, allow_str: false)
|
231
|
-
return self._from_rbdf(scan.select(columns).collect._df)
|
232
|
-
else
|
233
|
-
raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: Array[String]"
|
234
|
-
end
|
235
|
-
end
|
236
|
-
|
237
|
-
projection, columns = Utils.handle_projection_columns(columns)
|
238
|
-
_from_rbdf(
|
239
|
-
RbDataFrame.read_parquet(
|
240
|
-
source,
|
241
|
-
columns,
|
242
|
-
projection,
|
243
|
-
n_rows,
|
244
|
-
parallel,
|
245
|
-
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
246
|
-
low_memory,
|
247
|
-
use_statistics,
|
248
|
-
rechunk
|
249
|
-
)
|
250
|
-
)
|
251
|
-
end
|
252
|
-
|
253
|
-
# @private
|
254
|
-
def self._read_avro(file, columns: nil, n_rows: nil)
|
255
|
-
if Utils.pathlike?(file)
|
256
|
-
file = Utils.normalise_filepath(file)
|
257
|
-
end
|
258
|
-
projection, columns = Utils.handle_projection_columns(columns)
|
259
|
-
_from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
|
260
|
-
end
|
261
|
-
|
262
|
-
# @private
|
263
|
-
def self._read_ipc(
|
264
|
-
file,
|
265
|
-
columns: nil,
|
266
|
-
n_rows: nil,
|
267
|
-
row_count_name: nil,
|
268
|
-
row_count_offset: 0,
|
269
|
-
rechunk: true,
|
270
|
-
memory_map: true
|
271
|
-
)
|
272
|
-
if Utils.pathlike?(file)
|
273
|
-
file = Utils.normalise_filepath(file)
|
274
|
-
end
|
275
|
-
if columns.is_a?(::String)
|
276
|
-
columns = [columns]
|
277
|
-
end
|
278
|
-
|
279
|
-
if file.is_a?(::String) && file.include?("*")
|
280
|
-
raise Todo
|
281
|
-
end
|
282
|
-
|
283
|
-
projection, columns = Utils.handle_projection_columns(columns)
|
284
|
-
_from_rbdf(
|
285
|
-
RbDataFrame.read_ipc(
|
286
|
-
file,
|
287
|
-
columns,
|
288
|
-
projection,
|
289
|
-
n_rows,
|
290
|
-
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
291
|
-
memory_map
|
292
|
-
)
|
293
|
-
)
|
294
|
-
end
|
295
|
-
|
296
|
-
# @private
|
297
|
-
def self._read_json(file)
|
298
|
-
if Utils.pathlike?(file)
|
299
|
-
file = Utils.normalise_filepath(file)
|
300
|
-
end
|
301
|
-
|
302
|
-
_from_rbdf(RbDataFrame.read_json(file))
|
303
|
-
end
|
304
|
-
|
305
|
-
# @private
|
306
|
-
def self._read_ndjson(file)
|
307
|
-
if Utils.pathlike?(file)
|
308
|
-
file = Utils.normalise_filepath(file)
|
309
|
-
end
|
310
|
-
|
311
|
-
_from_rbdf(RbDataFrame.read_ndjson(file))
|
312
|
-
end
|
313
|
-
|
314
49
|
# Get the shape of the DataFrame.
|
315
50
|
#
|
316
51
|
# @return [Array]
|
@@ -419,6 +154,13 @@ module Polars
|
|
419
154
|
_df.dtypes
|
420
155
|
end
|
421
156
|
|
157
|
+
# Get flags that are set on the columns of this DataFrame.
|
158
|
+
#
|
159
|
+
# @return [Hash]
|
160
|
+
def flags
|
161
|
+
columns.to_h { |name| [name, self[name].flags] }
|
162
|
+
end
|
163
|
+
|
422
164
|
# Get the schema.
|
423
165
|
#
|
424
166
|
# @return [Hash]
|
@@ -845,7 +587,7 @@ module Polars
|
|
845
587
|
row_oriented: false
|
846
588
|
)
|
847
589
|
if Utils.pathlike?(file)
|
848
|
-
file = Utils.
|
590
|
+
file = Utils.normalize_filepath(file)
|
849
591
|
end
|
850
592
|
to_string_io = !file.nil? && file.is_a?(StringIO)
|
851
593
|
if file.nil? || to_string_io
|
@@ -880,11 +622,11 @@ module Polars
|
|
880
622
|
# "bar" => [6, 7, 8]
|
881
623
|
# }
|
882
624
|
# )
|
883
|
-
# df.write_ndjson
|
625
|
+
# df.write_ndjson
|
884
626
|
# # => "{\"foo\":1,\"bar\":6}\n{\"foo\":2,\"bar\":7}\n{\"foo\":3,\"bar\":8}\n"
|
885
627
|
def write_ndjson(file = nil)
|
886
628
|
if Utils.pathlike?(file)
|
887
|
-
file = Utils.
|
629
|
+
file = Utils.normalize_filepath(file)
|
888
630
|
end
|
889
631
|
to_string_io = !file.nil? && file.is_a?(StringIO)
|
890
632
|
if file.nil? || to_string_io
|
@@ -991,7 +733,7 @@ module Polars
|
|
991
733
|
end
|
992
734
|
|
993
735
|
if Utils.pathlike?(file)
|
994
|
-
file = Utils.
|
736
|
+
file = Utils.normalize_filepath(file)
|
995
737
|
end
|
996
738
|
|
997
739
|
_df.write_csv(
|
@@ -1029,7 +771,7 @@ module Polars
|
|
1029
771
|
compression = "uncompressed"
|
1030
772
|
end
|
1031
773
|
if Utils.pathlike?(file)
|
1032
|
-
file = Utils.
|
774
|
+
file = Utils.normalize_filepath(file)
|
1033
775
|
end
|
1034
776
|
|
1035
777
|
_df.write_avro(file, compression)
|
@@ -1050,7 +792,7 @@ module Polars
|
|
1050
792
|
file.set_encoding(Encoding::BINARY)
|
1051
793
|
end
|
1052
794
|
if Utils.pathlike?(file)
|
1053
|
-
file = Utils.
|
795
|
+
file = Utils.normalize_filepath(file)
|
1054
796
|
end
|
1055
797
|
|
1056
798
|
if compression.nil?
|
@@ -1061,6 +803,47 @@ module Polars
|
|
1061
803
|
return_bytes ? file.string : nil
|
1062
804
|
end
|
1063
805
|
|
806
|
+
# Write to Arrow IPC record batch stream.
|
807
|
+
#
|
808
|
+
# See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html.
|
809
|
+
#
|
810
|
+
# @param file [Object]
|
811
|
+
# Path or writable file-like object to which the IPC record batch data will
|
812
|
+
# be written. If set to `None`, the output is returned as a BytesIO object.
|
813
|
+
# @param compression ['uncompressed', 'lz4', 'zstd']
|
814
|
+
# Compression method. Defaults to "uncompressed".
|
815
|
+
#
|
816
|
+
# @return [Object]
|
817
|
+
#
|
818
|
+
# @example
|
819
|
+
# df = Polars::DataFrame.new(
|
820
|
+
# {
|
821
|
+
# "foo" => [1, 2, 3, 4, 5],
|
822
|
+
# "bar" => [6, 7, 8, 9, 10],
|
823
|
+
# "ham" => ["a", "b", "c", "d", "e"]
|
824
|
+
# }
|
825
|
+
# )
|
826
|
+
# df.write_ipc_stream("new_file.arrow")
|
827
|
+
def write_ipc_stream(
|
828
|
+
file,
|
829
|
+
compression: "uncompressed"
|
830
|
+
)
|
831
|
+
return_bytes = file.nil?
|
832
|
+
if return_bytes
|
833
|
+
file = StringIO.new
|
834
|
+
file.set_encoding(Encoding::BINARY)
|
835
|
+
elsif Utils.pathlike?(file)
|
836
|
+
file = Utils.normalize_filepath(file)
|
837
|
+
end
|
838
|
+
|
839
|
+
if compression.nil?
|
840
|
+
compression = "uncompressed"
|
841
|
+
end
|
842
|
+
|
843
|
+
_df.write_ipc_stream(file, compression)
|
844
|
+
return_bytes ? file.string : nil
|
845
|
+
end
|
846
|
+
|
1064
847
|
# Write to Apache Parquet file.
|
1065
848
|
#
|
1066
849
|
# @param file [String, Pathname, StringIO]
|
@@ -1097,7 +880,25 @@ module Polars
|
|
1097
880
|
compression = "uncompressed"
|
1098
881
|
end
|
1099
882
|
if Utils.pathlike?(file)
|
1100
|
-
file = Utils.
|
883
|
+
file = Utils.normalize_filepath(file)
|
884
|
+
end
|
885
|
+
|
886
|
+
if statistics == true
|
887
|
+
statistics = {
|
888
|
+
min: true,
|
889
|
+
max: true,
|
890
|
+
distinct_count: false,
|
891
|
+
null_count: true
|
892
|
+
}
|
893
|
+
elsif statistics == false
|
894
|
+
statistics = {}
|
895
|
+
elsif statistics == "full"
|
896
|
+
statistics = {
|
897
|
+
min: true,
|
898
|
+
max: true,
|
899
|
+
distinct_count: true,
|
900
|
+
null_count: true
|
901
|
+
}
|
1101
902
|
end
|
1102
903
|
|
1103
904
|
_df.write_parquet(
|
@@ -1773,10 +1574,7 @@ module Polars
|
|
1773
1574
|
# # │ 3 ┆ 8 ┆ c │
|
1774
1575
|
# # └─────┴─────┴─────┘
|
1775
1576
|
def drop_nulls(subset: nil)
|
1776
|
-
|
1777
|
-
subset = [subset]
|
1778
|
-
end
|
1779
|
-
_from_rbdf(_df.drop_nulls(subset))
|
1577
|
+
lazy.drop_nulls(subset: subset).collect(_eager: true)
|
1780
1578
|
end
|
1781
1579
|
|
1782
1580
|
# Offers a structured way to apply a sequence of user-defined functions (UDFs).
|
@@ -1838,16 +1636,16 @@ module Polars
|
|
1838
1636
|
# df.with_row_index
|
1839
1637
|
# # =>
|
1840
1638
|
# # shape: (3, 3)
|
1841
|
-
# #
|
1842
|
-
# # │
|
1843
|
-
# # │ ---
|
1844
|
-
# # │ u32
|
1845
|
-
# #
|
1846
|
-
# # │ 0
|
1847
|
-
# # │ 1
|
1848
|
-
# # │ 2
|
1849
|
-
# #
|
1850
|
-
def with_row_index(name: "
|
1639
|
+
# # ┌───────┬─────┬─────┐
|
1640
|
+
# # │ index ┆ a ┆ b │
|
1641
|
+
# # │ --- ┆ --- ┆ --- │
|
1642
|
+
# # │ u32 ┆ i64 ┆ i64 │
|
1643
|
+
# # ╞═══════╪═════╪═════╡
|
1644
|
+
# # │ 0 ┆ 1 ┆ 2 │
|
1645
|
+
# # │ 1 ┆ 3 ┆ 4 │
|
1646
|
+
# # │ 2 ┆ 5 ┆ 6 │
|
1647
|
+
# # └───────┴─────┴─────┘
|
1648
|
+
def with_row_index(name: "index", offset: 0)
|
1851
1649
|
_from_rbdf(_df.with_row_index(name, offset))
|
1852
1650
|
end
|
1853
1651
|
alias_method :with_row_count, :with_row_index
|
@@ -1944,12 +1742,6 @@ module Polars
|
|
1944
1742
|
# Define whether the temporal window interval is closed or not.
|
1945
1743
|
# @param by [Object]
|
1946
1744
|
# Also group by this column/these columns.
|
1947
|
-
# @param check_sorted [Boolean]
|
1948
|
-
# When the `by` argument is given, polars can not check sortedness
|
1949
|
-
# by the metadata and has to do a full scan on the index column to
|
1950
|
-
# verify data is sorted. This is expensive. If you are sure the
|
1951
|
-
# data within the by groups is sorted, you can set this to `false`.
|
1952
|
-
# Doing so incorrectly will lead to incorrect output
|
1953
1745
|
#
|
1954
1746
|
# @return [RollingGroupBy]
|
1955
1747
|
#
|
@@ -1965,7 +1757,7 @@ module Polars
|
|
1965
1757
|
# df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
|
1966
1758
|
# Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
|
1967
1759
|
# )
|
1968
|
-
# df.
|
1760
|
+
# df.rolling(index_column: "dt", period: "2d").agg(
|
1969
1761
|
# [
|
1970
1762
|
# Polars.sum("a").alias("sum_a"),
|
1971
1763
|
# Polars.min("a").alias("min_a"),
|
@@ -1986,17 +1778,17 @@ module Polars
|
|
1986
1778
|
# # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
|
1987
1779
|
# # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
|
1988
1780
|
# # └─────────────────────┴───────┴───────┴───────┘
|
1989
|
-
def
|
1781
|
+
def rolling(
|
1990
1782
|
index_column:,
|
1991
1783
|
period:,
|
1992
1784
|
offset: nil,
|
1993
1785
|
closed: "right",
|
1994
|
-
by: nil
|
1995
|
-
check_sorted: true
|
1786
|
+
by: nil
|
1996
1787
|
)
|
1997
|
-
RollingGroupBy.new(self, index_column, period, offset, closed, by
|
1788
|
+
RollingGroupBy.new(self, index_column, period, offset, closed, by)
|
1998
1789
|
end
|
1999
|
-
alias_method :groupby_rolling, :
|
1790
|
+
alias_method :groupby_rolling, :rolling
|
1791
|
+
alias_method :group_by_rolling, :rolling
|
2000
1792
|
|
2001
1793
|
# Group based on a time value (or index value of type `:i32`, `:i64`).
|
2002
1794
|
#
|
@@ -2066,10 +1858,12 @@ module Polars
|
|
2066
1858
|
# @example
|
2067
1859
|
# df = Polars::DataFrame.new(
|
2068
1860
|
# {
|
2069
|
-
# "time" => Polars.
|
1861
|
+
# "time" => Polars.datetime_range(
|
2070
1862
|
# DateTime.new(2021, 12, 16),
|
2071
1863
|
# DateTime.new(2021, 12, 16, 3),
|
2072
|
-
# "30m"
|
1864
|
+
# "30m",
|
1865
|
+
# time_unit: "us",
|
1866
|
+
# eager: true
|
2073
1867
|
# ),
|
2074
1868
|
# "n" => 0..6
|
2075
1869
|
# }
|
@@ -2136,16 +1930,16 @@ module Polars
|
|
2136
1930
|
# )
|
2137
1931
|
# # =>
|
2138
1932
|
# # shape: (4, 3)
|
2139
|
-
# #
|
2140
|
-
# # │ time ┆ time_count ┆ time_agg_list
|
2141
|
-
# # │ --- ┆ --- ┆ ---
|
2142
|
-
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
|
2143
|
-
# #
|
2144
|
-
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12
|
2145
|
-
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12
|
2146
|
-
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12
|
2147
|
-
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
|
2148
|
-
# #
|
1933
|
+
# # ┌─────────────────────┬────────────┬─────────────────────────────────┐
|
1934
|
+
# # │ time ┆ time_count ┆ time_agg_list │
|
1935
|
+
# # │ --- ┆ --- ┆ --- │
|
1936
|
+
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
|
1937
|
+
# # ╞═════════════════════╪════════════╪═════════════════════════════════╡
|
1938
|
+
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-… │
|
1939
|
+
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-… │
|
1940
|
+
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-… │
|
1941
|
+
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
|
1942
|
+
# # └─────────────────────┴────────────┴─────────────────────────────────┘
|
2149
1943
|
#
|
2150
1944
|
# @example When closed="both" the time values at the window boundaries belong to 2 groups.
|
2151
1945
|
# df.group_by_dynamic("time", every: "1h", closed: "both").agg(
|
@@ -2168,10 +1962,12 @@ module Polars
|
|
2168
1962
|
# @example Dynamic group bys can also be combined with grouping on normal keys.
|
2169
1963
|
# df = Polars::DataFrame.new(
|
2170
1964
|
# {
|
2171
|
-
# "time" => Polars.
|
1965
|
+
# "time" => Polars.datetime_range(
|
2172
1966
|
# DateTime.new(2021, 12, 16),
|
2173
1967
|
# DateTime.new(2021, 12, 16, 3),
|
2174
|
-
# "30m"
|
1968
|
+
# "30m",
|
1969
|
+
# time_unit: "us",
|
1970
|
+
# eager: true
|
2175
1971
|
# ),
|
2176
1972
|
# "groups" => ["a", "a", "a", "b", "b", "a", "a"]
|
2177
1973
|
# }
|
@@ -2258,8 +2054,6 @@ module Polars
|
|
2258
2054
|
# Note that this column has to be sorted for the output to make sense.
|
2259
2055
|
# @param every [String]
|
2260
2056
|
# interval will start 'every' duration
|
2261
|
-
# @param offset [String]
|
2262
|
-
# change the start of the date_range by this offset.
|
2263
2057
|
# @param by [Object]
|
2264
2058
|
# First group by these columns and then upsample for every group
|
2265
2059
|
# @param maintain_order [Boolean]
|
@@ -2319,7 +2113,6 @@ module Polars
|
|
2319
2113
|
def upsample(
|
2320
2114
|
time_column:,
|
2321
2115
|
every:,
|
2322
|
-
offset: nil,
|
2323
2116
|
by: nil,
|
2324
2117
|
maintain_order: false
|
2325
2118
|
)
|
@@ -2329,15 +2122,11 @@ module Polars
|
|
2329
2122
|
if by.is_a?(::String)
|
2330
2123
|
by = [by]
|
2331
2124
|
end
|
2332
|
-
if offset.nil?
|
2333
|
-
offset = "0ns"
|
2334
|
-
end
|
2335
2125
|
|
2336
|
-
every = Utils.
|
2337
|
-
offset = Utils._timedelta_to_pl_duration(offset)
|
2126
|
+
every = Utils.parse_as_duration_string(every)
|
2338
2127
|
|
2339
2128
|
_from_rbdf(
|
2340
|
-
_df.upsample(by, time_column, every,
|
2129
|
+
_df.upsample(by, time_column, every, maintain_order)
|
2341
2130
|
)
|
2342
2131
|
end
|
2343
2132
|
|
@@ -2484,7 +2273,7 @@ module Polars
|
|
2484
2273
|
# Name(s) of the right join column(s).
|
2485
2274
|
# @param on [Object]
|
2486
2275
|
# Name(s) of the join columns in both DataFrames.
|
2487
|
-
# @param how ["inner", "left", "
|
2276
|
+
# @param how ["inner", "left", "full", "semi", "anti", "cross"]
|
2488
2277
|
# Join strategy.
|
2489
2278
|
# @param suffix [String]
|
2490
2279
|
# Suffix to append to columns with a duplicate name.
|
@@ -2520,7 +2309,7 @@ module Polars
|
|
2520
2309
|
# # └─────┴─────┴─────┴───────┘
|
2521
2310
|
#
|
2522
2311
|
# @example
|
2523
|
-
# df.join(other_df, on: "ham", how: "
|
2312
|
+
# df.join(other_df, on: "ham", how: "full")
|
2524
2313
|
# # =>
|
2525
2314
|
# # shape: (4, 5)
|
2526
2315
|
# # ┌──────┬──────┬──────┬───────┬───────────┐
|
@@ -2620,7 +2409,7 @@ module Polars
|
|
2620
2409
|
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})
|
2621
2410
|
#
|
2622
2411
|
# @example Return a DataFrame by mapping each row to a tuple:
|
2623
|
-
# df.
|
2412
|
+
# df.map_rows { |t| [t[0] * 2, t[1] * 3] }
|
2624
2413
|
# # =>
|
2625
2414
|
# # shape: (3, 2)
|
2626
2415
|
# # ┌──────────┬──────────┐
|
@@ -2634,7 +2423,7 @@ module Polars
|
|
2634
2423
|
# # └──────────┴──────────┘
|
2635
2424
|
#
|
2636
2425
|
# @example Return a Series by mapping each row to a scalar:
|
2637
|
-
# df.
|
2426
|
+
# df.map_rows { |t| t[0] * 2 + t[1] }
|
2638
2427
|
# # =>
|
2639
2428
|
# # shape: (3, 1)
|
2640
2429
|
# # ┌───────┐
|
@@ -2646,14 +2435,15 @@ module Polars
|
|
2646
2435
|
# # │ 9 │
|
2647
2436
|
# # │ 14 │
|
2648
2437
|
# # └───────┘
|
2649
|
-
def
|
2650
|
-
out, is_df = _df.
|
2438
|
+
def map_rows(return_dtype: nil, inference_size: 256, &f)
|
2439
|
+
out, is_df = _df.map_rows(f, return_dtype, inference_size)
|
2651
2440
|
if is_df
|
2652
2441
|
_from_rbdf(out)
|
2653
2442
|
else
|
2654
2443
|
_from_rbdf(Utils.wrap_s(out).to_frame._df)
|
2655
2444
|
end
|
2656
2445
|
end
|
2446
|
+
alias_method :apply, :map_rows
|
2657
2447
|
|
2658
2448
|
# Return a new DataFrame with the column added or replaced.
|
2659
2449
|
#
|
@@ -3176,9 +2966,9 @@ module Polars
|
|
3176
2966
|
# arguments contains multiple columns as well
|
3177
2967
|
# @param index [Object]
|
3178
2968
|
# One or multiple keys to group by
|
3179
|
-
# @param
|
2969
|
+
# @param on [Object]
|
3180
2970
|
# Columns whose values will be used as the header of the output DataFrame
|
3181
|
-
# @param
|
2971
|
+
# @param aggregate_function ["first", "sum", "max", "min", "mean", "median", "last", "count"]
|
3182
2972
|
# A predefined aggregate function str or an expression.
|
3183
2973
|
# @param maintain_order [Object]
|
3184
2974
|
# Sort the grouped keys so that the output order is predictable.
|
@@ -3190,66 +2980,62 @@ module Polars
|
|
3190
2980
|
# @example
|
3191
2981
|
# df = Polars::DataFrame.new(
|
3192
2982
|
# {
|
3193
|
-
# "foo" => ["one", "one", "
|
3194
|
-
# "bar" => ["
|
2983
|
+
# "foo" => ["one", "one", "two", "two", "one", "two"],
|
2984
|
+
# "bar" => ["y", "y", "y", "x", "x", "x"],
|
3195
2985
|
# "baz" => [1, 2, 3, 4, 5, 6]
|
3196
2986
|
# }
|
3197
2987
|
# )
|
3198
|
-
# df.pivot(
|
2988
|
+
# df.pivot("bar", index: "foo", values: "baz", aggregate_function: "sum")
|
3199
2989
|
# # =>
|
3200
|
-
# # shape: (2,
|
3201
|
-
# #
|
3202
|
-
# # │ foo ┆
|
3203
|
-
# # │ --- ┆ --- ┆ ---
|
3204
|
-
# # │ str ┆ i64 ┆ i64
|
3205
|
-
# #
|
3206
|
-
# # │ one ┆
|
3207
|
-
# # │ two ┆
|
3208
|
-
# #
|
2990
|
+
# # shape: (2, 3)
|
2991
|
+
# # ┌─────┬─────┬─────┐
|
2992
|
+
# # │ foo ┆ y ┆ x │
|
2993
|
+
# # │ --- ┆ --- ┆ --- │
|
2994
|
+
# # │ str ┆ i64 ┆ i64 │
|
2995
|
+
# # ╞═════╪═════╪═════╡
|
2996
|
+
# # │ one ┆ 3 ┆ 5 │
|
2997
|
+
# # │ two ┆ 3 ┆ 10 │
|
2998
|
+
# # └─────┴─────┴─────┘
|
3209
2999
|
def pivot(
|
3210
|
-
|
3211
|
-
index
|
3212
|
-
|
3213
|
-
|
3000
|
+
on,
|
3001
|
+
index: nil,
|
3002
|
+
values: nil,
|
3003
|
+
aggregate_function: nil,
|
3214
3004
|
maintain_order: true,
|
3215
3005
|
sort_columns: false,
|
3216
3006
|
separator: "_"
|
3217
3007
|
)
|
3218
|
-
|
3219
|
-
|
3220
|
-
|
3221
|
-
|
3222
|
-
index = [index]
|
3223
|
-
end
|
3224
|
-
if columns.is_a?(::String)
|
3225
|
-
columns = [columns]
|
3008
|
+
index = Utils._expand_selectors(self, index)
|
3009
|
+
on = Utils._expand_selectors(self, on)
|
3010
|
+
if !values.nil?
|
3011
|
+
values = Utils._expand_selectors(self, values)
|
3226
3012
|
end
|
3227
3013
|
|
3228
|
-
if
|
3229
|
-
case
|
3014
|
+
if aggregate_function.is_a?(::String)
|
3015
|
+
case aggregate_function
|
3230
3016
|
when "first"
|
3231
|
-
aggregate_expr =
|
3017
|
+
aggregate_expr = F.element.first._rbexpr
|
3232
3018
|
when "sum"
|
3233
|
-
aggregate_expr =
|
3019
|
+
aggregate_expr = F.element.sum._rbexpr
|
3234
3020
|
when "max"
|
3235
|
-
aggregate_expr =
|
3021
|
+
aggregate_expr = F.element.max._rbexpr
|
3236
3022
|
when "min"
|
3237
|
-
aggregate_expr =
|
3023
|
+
aggregate_expr = F.element.min._rbexpr
|
3238
3024
|
when "mean"
|
3239
|
-
aggregate_expr =
|
3025
|
+
aggregate_expr = F.element.mean._rbexpr
|
3240
3026
|
when "median"
|
3241
|
-
aggregate_expr =
|
3027
|
+
aggregate_expr = F.element.median._rbexpr
|
3242
3028
|
when "last"
|
3243
|
-
aggregate_expr =
|
3029
|
+
aggregate_expr = F.element.last._rbexpr
|
3244
3030
|
when "len"
|
3245
|
-
aggregate_expr =
|
3031
|
+
aggregate_expr = F.len._rbexpr
|
3246
3032
|
when "count"
|
3247
3033
|
warn "`aggregate_function: \"count\"` input for `pivot` is deprecated. Use `aggregate_function: \"len\"` instead."
|
3248
|
-
aggregate_expr =
|
3034
|
+
aggregate_expr = F.len._rbexpr
|
3249
3035
|
else
|
3250
3036
|
raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
|
3251
3037
|
end
|
3252
|
-
elsif
|
3038
|
+
elsif aggregate_function.nil?
|
3253
3039
|
aggregate_expr = nil
|
3254
3040
|
else
|
3255
3041
|
aggregate_expr = aggregate_function._rbexpr
|
@@ -3257,8 +3043,8 @@ module Polars
|
|
3257
3043
|
|
3258
3044
|
_from_rbdf(
|
3259
3045
|
_df.pivot_expr(
|
3046
|
+
on,
|
3260
3047
|
index,
|
3261
|
-
columns,
|
3262
3048
|
values,
|
3263
3049
|
maintain_order,
|
3264
3050
|
sort_columns,
|
@@ -3273,18 +3059,18 @@ module Polars
|
|
3273
3059
|
# Optionally leaves identifiers set.
|
3274
3060
|
#
|
3275
3061
|
# This function is useful to massage a DataFrame into a format where one or more
|
3276
|
-
# columns are identifier variables (
|
3277
|
-
# measured variables (
|
3062
|
+
# columns are identifier variables (index) while all other columns, considered
|
3063
|
+
# measured variables (on), are "unpivoted" to the row axis leaving just
|
3278
3064
|
# two non-identifier columns, 'variable' and 'value'.
|
3279
3065
|
#
|
3280
|
-
# @param
|
3281
|
-
#
|
3282
|
-
#
|
3283
|
-
#
|
3284
|
-
#
|
3285
|
-
# @param variable_name [
|
3286
|
-
# Name to give to the `
|
3287
|
-
# @param value_name [
|
3066
|
+
# @param on [Object]
|
3067
|
+
# Column(s) or selector(s) to use as values variables; if `on`
|
3068
|
+
# is empty all columns that are not in `index` will be used.
|
3069
|
+
# @param index [Object]
|
3070
|
+
# Column(s) or selector(s) to use as identifier variables.
|
3071
|
+
# @param variable_name [Object]
|
3072
|
+
# Name to give to the `variable` column. Defaults to "variable"
|
3073
|
+
# @param value_name [Object]
|
3288
3074
|
# Name to give to the `value` column. Defaults to "value"
|
3289
3075
|
#
|
3290
3076
|
# @return [DataFrame]
|
@@ -3297,7 +3083,7 @@ module Polars
|
|
3297
3083
|
# "c" => [2, 4, 6]
|
3298
3084
|
# }
|
3299
3085
|
# )
|
3300
|
-
# df.
|
3086
|
+
# df.unpivot(Polars::Selectors.numeric, index: "a")
|
3301
3087
|
# # =>
|
3302
3088
|
# # shape: (6, 3)
|
3303
3089
|
# # ┌─────┬──────────┬───────┐
|
@@ -3312,23 +3098,13 @@ module Polars
|
|
3312
3098
|
# # │ y ┆ c ┆ 4 │
|
3313
3099
|
# # │ z ┆ c ┆ 6 │
|
3314
3100
|
# # └─────┴──────────┴───────┘
|
3315
|
-
def
|
3316
|
-
|
3317
|
-
|
3318
|
-
|
3319
|
-
|
3320
|
-
id_vars = [id_vars]
|
3321
|
-
end
|
3322
|
-
if value_vars.nil?
|
3323
|
-
value_vars = []
|
3324
|
-
end
|
3325
|
-
if id_vars.nil?
|
3326
|
-
id_vars = []
|
3327
|
-
end
|
3328
|
-
_from_rbdf(
|
3329
|
-
_df.melt(id_vars, value_vars, value_name, variable_name)
|
3330
|
-
)
|
3101
|
+
def unpivot(on, index: nil, variable_name: nil, value_name: nil)
|
3102
|
+
on = on.nil? ? [] : Utils._expand_selectors(self, on)
|
3103
|
+
index = index.nil? ? [] : Utils._expand_selectors(self, index)
|
3104
|
+
|
3105
|
+
_from_rbdf(_df.unpivot(on, index, value_name, variable_name))
|
3331
3106
|
end
|
3107
|
+
alias_method :melt, :unpivot
|
3332
3108
|
|
3333
3109
|
# Unstack a long table to a wide form without doing an aggregation.
|
3334
3110
|
#
|
@@ -3774,7 +3550,7 @@ module Polars
|
|
3774
3550
|
# # ┌─────────┐
|
3775
3551
|
# # │ literal │
|
3776
3552
|
# # │ --- │
|
3777
|
-
# # │
|
3553
|
+
# # │ i32 │
|
3778
3554
|
# # ╞═════════╡
|
3779
3555
|
# # │ 0 │
|
3780
3556
|
# # │ 0 │
|
@@ -4362,7 +4138,7 @@ module Polars
|
|
4362
4138
|
end
|
4363
4139
|
|
4364
4140
|
if subset.is_a?(::Array) && subset.length == 1
|
4365
|
-
expr = Utils.
|
4141
|
+
expr = Utils.wrap_expr(Utils.parse_into_expression(subset[0], str_as_lit: false))
|
4366
4142
|
else
|
4367
4143
|
struct_fields = subset.nil? ? Polars.all : subset
|
4368
4144
|
expr = Polars.struct(struct_fields)
|
@@ -4780,7 +4556,7 @@ module Polars
|
|
4780
4556
|
# # │ 3 ┆ 7 │
|
4781
4557
|
# # └─────┴─────┘
|
4782
4558
|
def gather_every(n, offset = 0)
|
4783
|
-
select(
|
4559
|
+
select(F.col("*").gather_every(n, offset))
|
4784
4560
|
end
|
4785
4561
|
alias_method :take_every, :gather_every
|
4786
4562
|
|
@@ -4850,7 +4626,7 @@ module Polars
|
|
4850
4626
|
# # │ 10.0 ┆ null ┆ 9.0 │
|
4851
4627
|
# # └──────┴──────┴──────────┘
|
4852
4628
|
def interpolate
|
4853
|
-
select(
|
4629
|
+
select(F.col("*").interpolate)
|
4854
4630
|
end
|
4855
4631
|
|
4856
4632
|
# Check if the dataframe is empty.
|
@@ -4986,19 +4762,16 @@ module Polars
|
|
4986
4762
|
#
|
4987
4763
|
# @param column [Object]
|
4988
4764
|
# Columns that are sorted
|
4989
|
-
# @param more_columns [Object]
|
4990
|
-
# Additional columns that are sorted, specified as positional arguments.
|
4991
4765
|
# @param descending [Boolean]
|
4992
4766
|
# Whether the columns are sorted in descending order.
|
4993
4767
|
#
|
4994
4768
|
# @return [DataFrame]
|
4995
4769
|
def set_sorted(
|
4996
4770
|
column,
|
4997
|
-
*more_columns,
|
4998
4771
|
descending: false
|
4999
4772
|
)
|
5000
4773
|
lazy
|
5001
|
-
.set_sorted(column,
|
4774
|
+
.set_sorted(column, descending: descending)
|
5002
4775
|
.collect(no_optimization: true)
|
5003
4776
|
end
|
5004
4777
|
|
@@ -5255,7 +5028,7 @@ module Polars
|
|
5255
5028
|
elsif data[0].is_a?(Hash)
|
5256
5029
|
column_names, dtypes = _unpack_schema(columns)
|
5257
5030
|
schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
|
5258
|
-
rbdf = RbDataFrame.
|
5031
|
+
rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, false, infer_schema_length)
|
5259
5032
|
if column_names
|
5260
5033
|
rbdf = _post_apply_columns(rbdf, column_names)
|
5261
5034
|
end
|
@@ -5289,7 +5062,7 @@ module Polars
|
|
5289
5062
|
if unpack_nested
|
5290
5063
|
raise Todo
|
5291
5064
|
else
|
5292
|
-
rbdf = RbDataFrame.
|
5065
|
+
rbdf = RbDataFrame.from_rows(
|
5293
5066
|
data,
|
5294
5067
|
infer_schema_length,
|
5295
5068
|
local_schema_override.any? ? local_schema_override : nil
|