polars-df 0.9.0-x86_64-darwin → 0.11.0-x86_64-darwin
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +23 -0
- data/Cargo.lock +144 -57
- data/LICENSE-THIRD-PARTY.txt +629 -29
- data/README.md +7 -6
- data/lib/polars/3.1/polars.bundle +0 -0
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/3.3/polars.bundle +0 -0
- data/lib/polars/array_expr.rb +6 -2
- data/lib/polars/batched_csv_reader.rb +11 -3
- data/lib/polars/convert.rb +6 -1
- data/lib/polars/data_frame.rb +225 -370
- data/lib/polars/date_time_expr.rb +11 -4
- data/lib/polars/date_time_name_space.rb +14 -4
- data/lib/polars/dynamic_group_by.rb +2 -2
- data/lib/polars/exceptions.rb +4 -0
- data/lib/polars/expr.rb +1171 -54
- data/lib/polars/functions/lazy.rb +3 -3
- data/lib/polars/functions/range/date_range.rb +92 -0
- data/lib/polars/functions/range/datetime_range.rb +149 -0
- data/lib/polars/functions/range/time_range.rb +141 -0
- data/lib/polars/functions/whenthen.rb +74 -5
- data/lib/polars/group_by.rb +88 -23
- data/lib/polars/io/avro.rb +24 -0
- data/lib/polars/{io.rb → io/csv.rb} +307 -489
- data/lib/polars/io/database.rb +73 -0
- data/lib/polars/io/ipc.rb +247 -0
- data/lib/polars/io/json.rb +18 -0
- data/lib/polars/io/ndjson.rb +69 -0
- data/lib/polars/io/parquet.rb +226 -0
- data/lib/polars/lazy_frame.rb +55 -195
- data/lib/polars/lazy_group_by.rb +100 -3
- data/lib/polars/list_expr.rb +6 -2
- data/lib/polars/rolling_group_by.rb +2 -2
- data/lib/polars/series.rb +14 -12
- data/lib/polars/string_expr.rb +38 -36
- data/lib/polars/utils.rb +89 -1
- data/lib/polars/version.rb +1 -1
- data/lib/polars/whenthen.rb +83 -0
- data/lib/polars.rb +10 -3
- metadata +13 -6
- data/lib/polars/when.rb +0 -16
- data/lib/polars/when_then.rb +0 -19
data/lib/polars/data_frame.rb
CHANGED
@@ -46,268 +46,6 @@ module Polars
|
|
46
46
|
df
|
47
47
|
end
|
48
48
|
|
49
|
-
# @private
|
50
|
-
def self._from_hashes(data, infer_schema_length: 100, schema: nil, schema_overrides: nil)
|
51
|
-
rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
|
52
|
-
_from_rbdf(rbdf)
|
53
|
-
end
|
54
|
-
|
55
|
-
# @private
|
56
|
-
def self._from_hash(data, schema: nil, schema_overrides: nil)
|
57
|
-
_from_rbdf(hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides))
|
58
|
-
end
|
59
|
-
|
60
|
-
# def self._from_records
|
61
|
-
# end
|
62
|
-
|
63
|
-
# def self._from_numo
|
64
|
-
# end
|
65
|
-
|
66
|
-
# no self._from_arrow
|
67
|
-
|
68
|
-
# no self._from_pandas
|
69
|
-
|
70
|
-
# @private
|
71
|
-
def self._read_csv(
|
72
|
-
file,
|
73
|
-
has_header: true,
|
74
|
-
columns: nil,
|
75
|
-
sep: str = ",",
|
76
|
-
comment_char: nil,
|
77
|
-
quote_char: '"',
|
78
|
-
skip_rows: 0,
|
79
|
-
dtypes: nil,
|
80
|
-
null_values: nil,
|
81
|
-
ignore_errors: false,
|
82
|
-
parse_dates: false,
|
83
|
-
n_threads: nil,
|
84
|
-
infer_schema_length: 100,
|
85
|
-
batch_size: 8192,
|
86
|
-
n_rows: nil,
|
87
|
-
encoding: "utf8",
|
88
|
-
low_memory: false,
|
89
|
-
rechunk: true,
|
90
|
-
skip_rows_after_header: 0,
|
91
|
-
row_count_name: nil,
|
92
|
-
row_count_offset: 0,
|
93
|
-
sample_size: 1024,
|
94
|
-
eol_char: "\n"
|
95
|
-
)
|
96
|
-
if Utils.pathlike?(file)
|
97
|
-
path = Utils.normalise_filepath(file)
|
98
|
-
else
|
99
|
-
path = nil
|
100
|
-
# if defined?(StringIO) && file.is_a?(StringIO)
|
101
|
-
# file = file.string
|
102
|
-
# end
|
103
|
-
end
|
104
|
-
|
105
|
-
dtype_list = nil
|
106
|
-
dtype_slice = nil
|
107
|
-
if !dtypes.nil?
|
108
|
-
if dtypes.is_a?(Hash)
|
109
|
-
dtype_list = []
|
110
|
-
dtypes.each do|k, v|
|
111
|
-
dtype_list << [k, Utils.rb_type_to_dtype(v)]
|
112
|
-
end
|
113
|
-
elsif dtypes.is_a?(::Array)
|
114
|
-
dtype_slice = dtypes
|
115
|
-
else
|
116
|
-
raise ArgumentError, "dtype arg should be list or dict"
|
117
|
-
end
|
118
|
-
end
|
119
|
-
|
120
|
-
processed_null_values = Utils._process_null_values(null_values)
|
121
|
-
|
122
|
-
if columns.is_a?(::String)
|
123
|
-
columns = [columns]
|
124
|
-
end
|
125
|
-
if file.is_a?(::String) && file.include?("*")
|
126
|
-
dtypes_dict = nil
|
127
|
-
if !dtype_list.nil?
|
128
|
-
dtypes_dict = dtype_list.to_h
|
129
|
-
end
|
130
|
-
if !dtype_slice.nil?
|
131
|
-
raise ArgumentError, "cannot use glob patterns and unnamed dtypes as `dtypes` argument; Use dtypes: Mapping[str, Type[DataType]"
|
132
|
-
end
|
133
|
-
scan = Polars.scan_csv(
|
134
|
-
file,
|
135
|
-
has_header: has_header,
|
136
|
-
sep: sep,
|
137
|
-
comment_char: comment_char,
|
138
|
-
quote_char: quote_char,
|
139
|
-
skip_rows: skip_rows,
|
140
|
-
dtypes: dtypes_dict,
|
141
|
-
null_values: null_values,
|
142
|
-
ignore_errors: ignore_errors,
|
143
|
-
infer_schema_length: infer_schema_length,
|
144
|
-
n_rows: n_rows,
|
145
|
-
low_memory: low_memory,
|
146
|
-
rechunk: rechunk,
|
147
|
-
skip_rows_after_header: skip_rows_after_header,
|
148
|
-
row_count_name: row_count_name,
|
149
|
-
row_count_offset: row_count_offset,
|
150
|
-
eol_char: eol_char
|
151
|
-
)
|
152
|
-
if columns.nil?
|
153
|
-
return _from_rbdf(scan.collect._df)
|
154
|
-
elsif is_str_sequence(columns, allow_str: false)
|
155
|
-
return _from_rbdf(scan.select(columns).collect._df)
|
156
|
-
else
|
157
|
-
raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: List[str]"
|
158
|
-
end
|
159
|
-
end
|
160
|
-
|
161
|
-
projection, columns = Utils.handle_projection_columns(columns)
|
162
|
-
|
163
|
-
_from_rbdf(
|
164
|
-
RbDataFrame.read_csv(
|
165
|
-
file,
|
166
|
-
infer_schema_length,
|
167
|
-
batch_size,
|
168
|
-
has_header,
|
169
|
-
ignore_errors,
|
170
|
-
n_rows,
|
171
|
-
skip_rows,
|
172
|
-
projection,
|
173
|
-
sep,
|
174
|
-
rechunk,
|
175
|
-
columns,
|
176
|
-
encoding,
|
177
|
-
n_threads,
|
178
|
-
path,
|
179
|
-
dtype_list,
|
180
|
-
dtype_slice,
|
181
|
-
low_memory,
|
182
|
-
comment_char,
|
183
|
-
quote_char,
|
184
|
-
processed_null_values,
|
185
|
-
parse_dates,
|
186
|
-
skip_rows_after_header,
|
187
|
-
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
188
|
-
sample_size,
|
189
|
-
eol_char
|
190
|
-
)
|
191
|
-
)
|
192
|
-
end
|
193
|
-
|
194
|
-
# @private
|
195
|
-
def self._read_parquet(
|
196
|
-
source,
|
197
|
-
columns: nil,
|
198
|
-
n_rows: nil,
|
199
|
-
parallel: "auto",
|
200
|
-
row_count_name: nil,
|
201
|
-
row_count_offset: 0,
|
202
|
-
low_memory: false,
|
203
|
-
use_statistics: true,
|
204
|
-
rechunk: true
|
205
|
-
)
|
206
|
-
if Utils.pathlike?(source)
|
207
|
-
source = Utils.normalise_filepath(source)
|
208
|
-
end
|
209
|
-
if columns.is_a?(::String)
|
210
|
-
columns = [columns]
|
211
|
-
end
|
212
|
-
|
213
|
-
if source.is_a?(::String) && source.include?("*") && Utils.local_file?(source)
|
214
|
-
scan =
|
215
|
-
Polars.scan_parquet(
|
216
|
-
source,
|
217
|
-
n_rows: n_rows,
|
218
|
-
rechunk: true,
|
219
|
-
parallel: parallel,
|
220
|
-
row_count_name: row_count_name,
|
221
|
-
row_count_offset: row_count_offset,
|
222
|
-
low_memory: low_memory
|
223
|
-
)
|
224
|
-
|
225
|
-
if columns.nil?
|
226
|
-
return self._from_rbdf(scan.collect._df)
|
227
|
-
elsif Utils.is_str_sequence(columns, allow_str: false)
|
228
|
-
return self._from_rbdf(scan.select(columns).collect._df)
|
229
|
-
else
|
230
|
-
raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: Array[String]"
|
231
|
-
end
|
232
|
-
end
|
233
|
-
|
234
|
-
projection, columns = Utils.handle_projection_columns(columns)
|
235
|
-
_from_rbdf(
|
236
|
-
RbDataFrame.read_parquet(
|
237
|
-
source,
|
238
|
-
columns,
|
239
|
-
projection,
|
240
|
-
n_rows,
|
241
|
-
parallel,
|
242
|
-
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
243
|
-
low_memory,
|
244
|
-
use_statistics,
|
245
|
-
rechunk
|
246
|
-
)
|
247
|
-
)
|
248
|
-
end
|
249
|
-
|
250
|
-
# @private
|
251
|
-
def self._read_avro(file, columns: nil, n_rows: nil)
|
252
|
-
if Utils.pathlike?(file)
|
253
|
-
file = Utils.normalise_filepath(file)
|
254
|
-
end
|
255
|
-
projection, columns = Utils.handle_projection_columns(columns)
|
256
|
-
_from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
|
257
|
-
end
|
258
|
-
|
259
|
-
# @private
|
260
|
-
def self._read_ipc(
|
261
|
-
file,
|
262
|
-
columns: nil,
|
263
|
-
n_rows: nil,
|
264
|
-
row_count_name: nil,
|
265
|
-
row_count_offset: 0,
|
266
|
-
rechunk: true,
|
267
|
-
memory_map: true
|
268
|
-
)
|
269
|
-
if Utils.pathlike?(file)
|
270
|
-
file = Utils.normalise_filepath(file)
|
271
|
-
end
|
272
|
-
if columns.is_a?(::String)
|
273
|
-
columns = [columns]
|
274
|
-
end
|
275
|
-
|
276
|
-
if file.is_a?(::String) && file.include?("*")
|
277
|
-
raise Todo
|
278
|
-
end
|
279
|
-
|
280
|
-
projection, columns = Utils.handle_projection_columns(columns)
|
281
|
-
_from_rbdf(
|
282
|
-
RbDataFrame.read_ipc(
|
283
|
-
file,
|
284
|
-
columns,
|
285
|
-
projection,
|
286
|
-
n_rows,
|
287
|
-
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
288
|
-
memory_map
|
289
|
-
)
|
290
|
-
)
|
291
|
-
end
|
292
|
-
|
293
|
-
# @private
|
294
|
-
def self._read_json(file)
|
295
|
-
if Utils.pathlike?(file)
|
296
|
-
file = Utils.normalise_filepath(file)
|
297
|
-
end
|
298
|
-
|
299
|
-
_from_rbdf(RbDataFrame.read_json(file))
|
300
|
-
end
|
301
|
-
|
302
|
-
# @private
|
303
|
-
def self._read_ndjson(file)
|
304
|
-
if Utils.pathlike?(file)
|
305
|
-
file = Utils.normalise_filepath(file)
|
306
|
-
end
|
307
|
-
|
308
|
-
_from_rbdf(RbDataFrame.read_ndjson(file))
|
309
|
-
end
|
310
|
-
|
311
49
|
# Get the shape of the DataFrame.
|
312
50
|
#
|
313
51
|
# @return [Array]
|
@@ -416,6 +154,13 @@ module Polars
|
|
416
154
|
_df.dtypes
|
417
155
|
end
|
418
156
|
|
157
|
+
# Get flags that are set on the columns of this DataFrame.
|
158
|
+
#
|
159
|
+
# @return [Hash]
|
160
|
+
def flags
|
161
|
+
columns.to_h { |name| [name, self[name].flags] }
|
162
|
+
end
|
163
|
+
|
419
164
|
# Get the schema.
|
420
165
|
#
|
421
166
|
# @return [Hash]
|
@@ -814,8 +559,6 @@ module Polars
|
|
814
559
|
|
815
560
|
# Serialize to JSON representation.
|
816
561
|
#
|
817
|
-
# @return [nil]
|
818
|
-
#
|
819
562
|
# @param file [String]
|
820
563
|
# File path to which the result should be written.
|
821
564
|
# @param pretty [Boolean]
|
@@ -823,17 +566,45 @@ module Polars
|
|
823
566
|
# @param row_oriented [Boolean]
|
824
567
|
# Write to row oriented json. This is slower, but more common.
|
825
568
|
#
|
826
|
-
# @
|
569
|
+
# @return [String, nil]
|
570
|
+
#
|
571
|
+
# @example
|
572
|
+
# df = Polars::DataFrame.new(
|
573
|
+
# {
|
574
|
+
# "foo" => [1, 2, 3],
|
575
|
+
# "bar" => [6, 7, 8]
|
576
|
+
# }
|
577
|
+
# )
|
578
|
+
# df.write_json
|
579
|
+
# # => "{\"columns\":[{\"name\":\"foo\",\"datatype\":\"Int64\",\"bit_settings\":\"\",\"values\":[1,2,3]},{\"name\":\"bar\",\"datatype\":\"Int64\",\"bit_settings\":\"\",\"values\":[6,7,8]}]}"
|
580
|
+
#
|
581
|
+
# @example
|
582
|
+
# df.write_json(row_oriented: true)
|
583
|
+
# # => "[{\"foo\":1,\"bar\":6},{\"foo\":2,\"bar\":7},{\"foo\":3,\"bar\":8}]"
|
827
584
|
def write_json(
|
828
|
-
file,
|
585
|
+
file = nil,
|
829
586
|
pretty: false,
|
830
587
|
row_oriented: false
|
831
588
|
)
|
832
589
|
if Utils.pathlike?(file)
|
833
|
-
file = Utils.
|
590
|
+
file = Utils.normalize_filepath(file)
|
591
|
+
end
|
592
|
+
to_string_io = !file.nil? && file.is_a?(StringIO)
|
593
|
+
if file.nil? || to_string_io
|
594
|
+
buf = StringIO.new
|
595
|
+
buf.set_encoding(Encoding::BINARY)
|
596
|
+
_df.write_json(buf, pretty, row_oriented)
|
597
|
+
json_bytes = buf.string
|
598
|
+
|
599
|
+
json_str = json_bytes.force_encoding(Encoding::UTF_8)
|
600
|
+
if to_string_io
|
601
|
+
file.write(json_str)
|
602
|
+
else
|
603
|
+
return json_str
|
604
|
+
end
|
605
|
+
else
|
606
|
+
_df.write_json(file, pretty, row_oriented)
|
834
607
|
end
|
835
|
-
|
836
|
-
_df.write_json(file, pretty, row_oriented)
|
837
608
|
nil
|
838
609
|
end
|
839
610
|
|
@@ -843,12 +614,36 @@ module Polars
|
|
843
614
|
# File path to which the result should be written.
|
844
615
|
#
|
845
616
|
# @return [String, nil]
|
846
|
-
|
617
|
+
#
|
618
|
+
# @example
|
619
|
+
# df = Polars::DataFrame.new(
|
620
|
+
# {
|
621
|
+
# "foo" => [1, 2, 3],
|
622
|
+
# "bar" => [6, 7, 8]
|
623
|
+
# }
|
624
|
+
# )
|
625
|
+
# df.write_ndjson()
|
626
|
+
# # => "{\"foo\":1,\"bar\":6}\n{\"foo\":2,\"bar\":7}\n{\"foo\":3,\"bar\":8}\n"
|
627
|
+
def write_ndjson(file = nil)
|
847
628
|
if Utils.pathlike?(file)
|
848
|
-
file = Utils.
|
629
|
+
file = Utils.normalize_filepath(file)
|
630
|
+
end
|
631
|
+
to_string_io = !file.nil? && file.is_a?(StringIO)
|
632
|
+
if file.nil? || to_string_io
|
633
|
+
buf = StringIO.new
|
634
|
+
buf.set_encoding(Encoding::BINARY)
|
635
|
+
_df.write_ndjson(buf)
|
636
|
+
json_bytes = buf.string
|
637
|
+
|
638
|
+
json_str = json_bytes.force_encoding(Encoding::UTF_8)
|
639
|
+
if to_string_io
|
640
|
+
file.write(json_str)
|
641
|
+
else
|
642
|
+
return json_str
|
643
|
+
end
|
644
|
+
else
|
645
|
+
_df.write_ndjson(file)
|
849
646
|
end
|
850
|
-
|
851
|
-
_df.write_ndjson(file)
|
852
647
|
nil
|
853
648
|
end
|
854
649
|
|
@@ -938,7 +733,7 @@ module Polars
|
|
938
733
|
end
|
939
734
|
|
940
735
|
if Utils.pathlike?(file)
|
941
|
-
file = Utils.
|
736
|
+
file = Utils.normalize_filepath(file)
|
942
737
|
end
|
943
738
|
|
944
739
|
_df.write_csv(
|
@@ -976,7 +771,7 @@ module Polars
|
|
976
771
|
compression = "uncompressed"
|
977
772
|
end
|
978
773
|
if Utils.pathlike?(file)
|
979
|
-
file = Utils.
|
774
|
+
file = Utils.normalize_filepath(file)
|
980
775
|
end
|
981
776
|
|
982
777
|
_df.write_avro(file, compression)
|
@@ -997,7 +792,7 @@ module Polars
|
|
997
792
|
file.set_encoding(Encoding::BINARY)
|
998
793
|
end
|
999
794
|
if Utils.pathlike?(file)
|
1000
|
-
file = Utils.
|
795
|
+
file = Utils.normalize_filepath(file)
|
1001
796
|
end
|
1002
797
|
|
1003
798
|
if compression.nil?
|
@@ -1008,9 +803,50 @@ module Polars
|
|
1008
803
|
return_bytes ? file.string : nil
|
1009
804
|
end
|
1010
805
|
|
806
|
+
# Write to Arrow IPC record batch stream.
|
807
|
+
#
|
808
|
+
# See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html.
|
809
|
+
#
|
810
|
+
# @param file [Object]
|
811
|
+
# Path or writable file-like object to which the IPC record batch data will
|
812
|
+
# be written. If set to `nil`, the output is returned as a binary String.
|
813
|
+
# @param compression ['uncompressed', 'lz4', 'zstd']
|
814
|
+
# Compression method. Defaults to "uncompressed".
|
815
|
+
#
|
816
|
+
# @return [String, nil]
|
817
|
+
#
|
818
|
+
# @example
|
819
|
+
# df = Polars::DataFrame.new(
|
820
|
+
# {
|
821
|
+
# "foo" => [1, 2, 3, 4, 5],
|
822
|
+
# "bar" => [6, 7, 8, 9, 10],
|
823
|
+
# "ham" => ["a", "b", "c", "d", "e"]
|
824
|
+
# }
|
825
|
+
# )
|
826
|
+
# df.write_ipc_stream("new_file.arrow")
|
827
|
+
def write_ipc_stream(
|
828
|
+
file,
|
829
|
+
compression: "uncompressed"
|
830
|
+
)
|
831
|
+
return_bytes = file.nil?
|
832
|
+
if return_bytes
|
833
|
+
file = StringIO.new
|
834
|
+
file.set_encoding(Encoding::BINARY)
|
835
|
+
elsif Utils.pathlike?(file)
|
836
|
+
file = Utils.normalize_filepath(file)
|
837
|
+
end
|
838
|
+
|
839
|
+
if compression.nil?
|
840
|
+
compression = "uncompressed"
|
841
|
+
end
|
842
|
+
|
843
|
+
_df.write_ipc_stream(file, compression)
|
844
|
+
return_bytes ? file.string : nil
|
845
|
+
end
|
846
|
+
|
1011
847
|
# Write to Apache Parquet file.
|
1012
848
|
#
|
1013
|
-
# @param file [String]
|
849
|
+
# @param file [String, Pathname, StringIO]
|
1014
850
|
# File path to which the file should be written.
|
1015
851
|
# @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
|
1016
852
|
# Choose "zstd" for good compression performance.
|
@@ -1027,10 +863,9 @@ module Polars
|
|
1027
863
|
# @param statistics [Boolean]
|
1028
864
|
# Write statistics to the parquet headers. This requires extra compute.
|
1029
865
|
# @param row_group_size [Integer, nil]
|
1030
|
-
# Size of the row groups in number of rows.
|
1031
|
-
#
|
1032
|
-
#
|
1033
|
-
# writing speeds.
|
866
|
+
# Size of the row groups in number of rows. Defaults to 512^2 rows.
|
867
|
+
# @param data_page_size [Integer, nil]
|
868
|
+
# Size of the data page in bytes. Defaults to 1024^2 bytes.
|
1034
869
|
#
|
1035
870
|
# @return [nil]
|
1036
871
|
def write_parquet(
|
@@ -1038,17 +873,18 @@ module Polars
|
|
1038
873
|
compression: "zstd",
|
1039
874
|
compression_level: nil,
|
1040
875
|
statistics: false,
|
1041
|
-
row_group_size: nil
|
876
|
+
row_group_size: nil,
|
877
|
+
data_page_size: nil
|
1042
878
|
)
|
1043
879
|
if compression.nil?
|
1044
880
|
compression = "uncompressed"
|
1045
881
|
end
|
1046
882
|
if Utils.pathlike?(file)
|
1047
|
-
file = Utils.
|
883
|
+
file = Utils.normalize_filepath(file)
|
1048
884
|
end
|
1049
885
|
|
1050
886
|
_df.write_parquet(
|
1051
|
-
file, compression, compression_level, statistics, row_group_size
|
887
|
+
file, compression, compression_level, statistics, row_group_size, data_page_size
|
1052
888
|
)
|
1053
889
|
end
|
1054
890
|
|
@@ -1084,7 +920,7 @@ module Polars
|
|
1084
920
|
# df.estimated_size
|
1085
921
|
# # => 25888898
|
1086
922
|
# df.estimated_size("mb")
|
1087
|
-
# # =>
|
923
|
+
# # => 17.0601749420166
|
1088
924
|
def estimated_size(unit = "b")
|
1089
925
|
sz = _df.estimated_size
|
1090
926
|
Utils.scale_bytes(sz, to: unit)
|
@@ -1720,10 +1556,7 @@ module Polars
|
|
1720
1556
|
# # │ 3 ┆ 8 ┆ c │
|
1721
1557
|
# # └─────┴─────┴─────┘
|
1722
1558
|
def drop_nulls(subset: nil)
|
1723
|
-
|
1724
|
-
subset = [subset]
|
1725
|
-
end
|
1726
|
-
_from_rbdf(_df.drop_nulls(subset))
|
1559
|
+
lazy.drop_nulls(subset: subset).collect(_eager: true)
|
1727
1560
|
end
|
1728
1561
|
|
1729
1562
|
# Offers a structured way to apply a sequence of user-defined functions (UDFs).
|
@@ -1785,16 +1618,16 @@ module Polars
|
|
1785
1618
|
# df.with_row_index
|
1786
1619
|
# # =>
|
1787
1620
|
# # shape: (3, 3)
|
1788
|
-
# #
|
1789
|
-
# # │
|
1790
|
-
# # │ ---
|
1791
|
-
# # │ u32
|
1792
|
-
# #
|
1793
|
-
# # │ 0
|
1794
|
-
# # │ 1
|
1795
|
-
# # │ 2
|
1796
|
-
# #
|
1797
|
-
def with_row_index(name: "
|
1621
|
+
# # ┌───────┬─────┬─────┐
|
1622
|
+
# # │ index ┆ a ┆ b │
|
1623
|
+
# # │ --- ┆ --- ┆ --- │
|
1624
|
+
# # │ u32 ┆ i64 ┆ i64 │
|
1625
|
+
# # ╞═══════╪═════╪═════╡
|
1626
|
+
# # │ 0 ┆ 1 ┆ 2 │
|
1627
|
+
# # │ 1 ┆ 3 ┆ 4 │
|
1628
|
+
# # │ 2 ┆ 5 ┆ 6 │
|
1629
|
+
# # └───────┴─────┴─────┘
|
1630
|
+
def with_row_index(name: "index", offset: 0)
|
1798
1631
|
_from_rbdf(_df.with_row_index(name, offset))
|
1799
1632
|
end
|
1800
1633
|
alias_method :with_row_count, :with_row_index
|
@@ -2083,16 +1916,16 @@ module Polars
|
|
2083
1916
|
# )
|
2084
1917
|
# # =>
|
2085
1918
|
# # shape: (4, 3)
|
2086
|
-
# #
|
2087
|
-
# # │ time ┆ time_count ┆ time_agg_list
|
2088
|
-
# # │ --- ┆ --- ┆ ---
|
2089
|
-
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
|
2090
|
-
# #
|
2091
|
-
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12
|
2092
|
-
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12
|
2093
|
-
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12
|
2094
|
-
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
|
2095
|
-
# #
|
1919
|
+
# # ┌─────────────────────┬────────────┬─────────────────────────────────┐
|
1920
|
+
# # │ time ┆ time_count ┆ time_agg_list │
|
1921
|
+
# # │ --- ┆ --- ┆ --- │
|
1922
|
+
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
|
1923
|
+
# # ╞═════════════════════╪════════════╪═════════════════════════════════╡
|
1924
|
+
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-… │
|
1925
|
+
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-… │
|
1926
|
+
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-… │
|
1927
|
+
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
|
1928
|
+
# # └─────────────────────┴────────────┴─────────────────────────────────┘
|
2096
1929
|
#
|
2097
1930
|
# @example When closed="both" the time values at the window boundaries belong to 2 groups.
|
2098
1931
|
# df.group_by_dynamic("time", every: "1h", closed: "both").agg(
|
@@ -2161,12 +1994,13 @@ module Polars
|
|
2161
1994
|
# closed: "right"
|
2162
1995
|
# ).agg(Polars.col("A").alias("A_agg_list"))
|
2163
1996
|
# # =>
|
2164
|
-
# # shape: (
|
1997
|
+
# # shape: (4, 4)
|
2165
1998
|
# # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
|
2166
1999
|
# # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
|
2167
2000
|
# # │ --- ┆ --- ┆ --- ┆ --- │
|
2168
2001
|
# # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
|
2169
2002
|
# # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
|
2003
|
+
# # │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │
|
2170
2004
|
# # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
|
2171
2005
|
# # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
|
2172
2006
|
# # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
|
@@ -2566,7 +2400,7 @@ module Polars
|
|
2566
2400
|
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})
|
2567
2401
|
#
|
2568
2402
|
# @example Return a DataFrame by mapping each row to a tuple:
|
2569
|
-
# df.
|
2403
|
+
# df.map_rows { |t| [t[0] * 2, t[1] * 3] }
|
2570
2404
|
# # =>
|
2571
2405
|
# # shape: (3, 2)
|
2572
2406
|
# # ┌──────────┬──────────┐
|
@@ -2580,7 +2414,7 @@ module Polars
|
|
2580
2414
|
# # └──────────┴──────────┘
|
2581
2415
|
#
|
2582
2416
|
# @example Return a Series by mapping each row to a scalar:
|
2583
|
-
# df.
|
2417
|
+
# df.map_rows { |t| t[0] * 2 + t[1] }
|
2584
2418
|
# # =>
|
2585
2419
|
# # shape: (3, 1)
|
2586
2420
|
# # ┌───────┐
|
@@ -2592,14 +2426,15 @@ module Polars
|
|
2592
2426
|
# # │ 9 │
|
2593
2427
|
# # │ 14 │
|
2594
2428
|
# # └───────┘
|
2595
|
-
def
|
2596
|
-
out, is_df = _df.
|
2429
|
+
def map_rows(return_dtype: nil, inference_size: 256, &f)
|
2430
|
+
out, is_df = _df.map_rows(f, return_dtype, inference_size)
|
2597
2431
|
if is_df
|
2598
2432
|
_from_rbdf(out)
|
2599
2433
|
else
|
2600
2434
|
_from_rbdf(Utils.wrap_s(out).to_frame._df)
|
2601
2435
|
end
|
2602
2436
|
end
|
2437
|
+
alias_method :apply, :map_rows
|
2603
2438
|
|
2604
2439
|
# Return a new DataFrame with the column added or replaced.
|
2605
2440
|
#
|
@@ -2621,26 +2456,26 @@ module Polars
|
|
2621
2456
|
# # ┌─────┬─────┬───────────┐
|
2622
2457
|
# # │ a ┆ b ┆ b_squared │
|
2623
2458
|
# # │ --- ┆ --- ┆ --- │
|
2624
|
-
# # │ i64 ┆ i64 ┆
|
2459
|
+
# # │ i64 ┆ i64 ┆ i64 │
|
2625
2460
|
# # ╞═════╪═════╪═══════════╡
|
2626
|
-
# # │ 1 ┆ 2 ┆ 4
|
2627
|
-
# # │ 3 ┆ 4 ┆ 16
|
2628
|
-
# # │ 5 ┆ 6 ┆ 36
|
2461
|
+
# # │ 1 ┆ 2 ┆ 4 │
|
2462
|
+
# # │ 3 ┆ 4 ┆ 16 │
|
2463
|
+
# # │ 5 ┆ 6 ┆ 36 │
|
2629
2464
|
# # └─────┴─────┴───────────┘
|
2630
2465
|
#
|
2631
2466
|
# @example Replaced
|
2632
2467
|
# df.with_column(Polars.col("a") ** 2)
|
2633
2468
|
# # =>
|
2634
2469
|
# # shape: (3, 2)
|
2635
|
-
# #
|
2636
|
-
# # │ a
|
2637
|
-
# # │ ---
|
2638
|
-
# # │
|
2639
|
-
# #
|
2640
|
-
# # │ 1
|
2641
|
-
# # │ 9
|
2642
|
-
# # │ 25
|
2643
|
-
# #
|
2470
|
+
# # ┌─────┬─────┐
|
2471
|
+
# # │ a ┆ b │
|
2472
|
+
# # │ --- ┆ --- │
|
2473
|
+
# # │ i64 ┆ i64 │
|
2474
|
+
# # ╞═════╪═════╡
|
2475
|
+
# # │ 1 ┆ 2 │
|
2476
|
+
# # │ 9 ┆ 4 │
|
2477
|
+
# # │ 25 ┆ 6 │
|
2478
|
+
# # └─────┴─────┘
|
2644
2479
|
def with_column(column)
|
2645
2480
|
lazy
|
2646
2481
|
.with_column(column)
|
@@ -2807,16 +2642,36 @@ module Polars
|
|
2807
2642
|
# # │ 2 ┆ 7.0 │
|
2808
2643
|
# # │ 3 ┆ 8.0 │
|
2809
2644
|
# # └─────┴─────┘
|
2810
|
-
|
2811
|
-
|
2812
|
-
|
2813
|
-
|
2814
|
-
|
2815
|
-
|
2816
|
-
|
2817
|
-
|
2818
|
-
|
2819
|
-
|
2645
|
+
#
|
2646
|
+
# @example Drop multiple columns by passing a list of column names.
|
2647
|
+
# df.drop(["bar", "ham"])
|
2648
|
+
# # =>
|
2649
|
+
# # shape: (3, 1)
|
2650
|
+
# # ┌─────┐
|
2651
|
+
# # │ foo │
|
2652
|
+
# # │ --- │
|
2653
|
+
# # │ i64 │
|
2654
|
+
# # ╞═════╡
|
2655
|
+
# # │ 1 │
|
2656
|
+
# # │ 2 │
|
2657
|
+
# # │ 3 │
|
2658
|
+
# # └─────┘
|
2659
|
+
#
|
2660
|
+
# @example Use positional arguments to drop multiple columns.
|
2661
|
+
# df.drop("foo", "ham")
|
2662
|
+
# # =>
|
2663
|
+
# # shape: (3, 1)
|
2664
|
+
# # ┌─────┐
|
2665
|
+
# # │ bar │
|
2666
|
+
# # │ --- │
|
2667
|
+
# # │ f64 │
|
2668
|
+
# # ╞═════╡
|
2669
|
+
# # │ 6.0 │
|
2670
|
+
# # │ 7.0 │
|
2671
|
+
# # │ 8.0 │
|
2672
|
+
# # └─────┘
|
2673
|
+
def drop(*columns)
|
2674
|
+
lazy.drop(*columns).collect(_eager: true)
|
2820
2675
|
end
|
2821
2676
|
|
2822
2677
|
# Drop in place.
|
@@ -3700,7 +3555,7 @@ module Polars
|
|
3700
3555
|
# # ┌─────────┐
|
3701
3556
|
# # │ literal │
|
3702
3557
|
# # │ --- │
|
3703
|
-
# # │
|
3558
|
+
# # │ i32 │
|
3704
3559
|
# # ╞═════════╡
|
3705
3560
|
# # │ 0 │
|
3706
3561
|
# # │ 0 │
|
@@ -3735,16 +3590,16 @@ module Polars
|
|
3735
3590
|
# df.with_columns((Polars.col("a") ** 2).alias("a^2"))
|
3736
3591
|
# # =>
|
3737
3592
|
# # shape: (4, 4)
|
3738
|
-
# #
|
3739
|
-
# # │ a ┆ b ┆ c ┆ a^2
|
3740
|
-
# # │ --- ┆ --- ┆ --- ┆ ---
|
3741
|
-
# # │ i64 ┆ f64 ┆ bool ┆
|
3742
|
-
# #
|
3743
|
-
# # │ 1 ┆ 0.5 ┆ true ┆ 1
|
3744
|
-
# # │ 2 ┆ 4.0 ┆ true ┆ 4
|
3745
|
-
# # │ 3 ┆ 10.0 ┆ false ┆ 9
|
3746
|
-
# # │ 4 ┆ 13.0 ┆ true ┆ 16
|
3747
|
-
# #
|
3593
|
+
# # ┌─────┬──────┬───────┬─────┐
|
3594
|
+
# # │ a ┆ b ┆ c ┆ a^2 │
|
3595
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
3596
|
+
# # │ i64 ┆ f64 ┆ bool ┆ i64 │
|
3597
|
+
# # ╞═════╪══════╪═══════╪═════╡
|
3598
|
+
# # │ 1 ┆ 0.5 ┆ true ┆ 1 │
|
3599
|
+
# # │ 2 ┆ 4.0 ┆ true ┆ 4 │
|
3600
|
+
# # │ 3 ┆ 10.0 ┆ false ┆ 9 │
|
3601
|
+
# # │ 4 ┆ 13.0 ┆ true ┆ 16 │
|
3602
|
+
# # └─────┴──────┴───────┴─────┘
|
3748
3603
|
#
|
3749
3604
|
# @example Added columns will replace existing columns with the same name.
|
3750
3605
|
# df.with_columns(Polars.col("a").cast(Polars::Float64))
|
@@ -3771,16 +3626,16 @@ module Polars
|
|
3771
3626
|
# )
|
3772
3627
|
# # =>
|
3773
3628
|
# # shape: (4, 6)
|
3774
|
-
# #
|
3775
|
-
# # │ a ┆ b ┆ c ┆ a^2
|
3776
|
-
# # │ --- ┆ --- ┆ --- ┆ ---
|
3777
|
-
# # │ i64 ┆ f64 ┆ bool ┆
|
3778
|
-
# #
|
3779
|
-
# # │ 1 ┆ 0.5 ┆ true ┆ 1
|
3780
|
-
# # │ 2 ┆ 4.0 ┆ true ┆ 4
|
3781
|
-
# # │ 3 ┆ 10.0 ┆ false ┆ 9
|
3782
|
-
# # │ 4 ┆ 13.0 ┆ true ┆ 16
|
3783
|
-
# #
|
3629
|
+
# # ┌─────┬──────┬───────┬─────┬──────┬───────┐
|
3630
|
+
# # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
|
3631
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
3632
|
+
# # │ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │
|
3633
|
+
# # ╞═════╪══════╪═══════╪═════╪══════╪═══════╡
|
3634
|
+
# # │ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │
|
3635
|
+
# # │ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │
|
3636
|
+
# # │ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │
|
3637
|
+
# # │ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │
|
3638
|
+
# # └─────┴──────┴───────┴─────┴──────┴───────┘
|
3784
3639
|
#
|
3785
3640
|
# @example Multiple columns also can be added using positional arguments instead of a list.
|
3786
3641
|
# df.with_columns(
|
@@ -3790,16 +3645,16 @@ module Polars
|
|
3790
3645
|
# )
|
3791
3646
|
# # =>
|
3792
3647
|
# # shape: (4, 6)
|
3793
|
-
# #
|
3794
|
-
# # │ a ┆ b ┆ c ┆ a^2
|
3795
|
-
# # │ --- ┆ --- ┆ --- ┆ ---
|
3796
|
-
# # │ i64 ┆ f64 ┆ bool ┆
|
3797
|
-
# #
|
3798
|
-
# # │ 1 ┆ 0.5 ┆ true ┆ 1
|
3799
|
-
# # │ 2 ┆ 4.0 ┆ true ┆ 4
|
3800
|
-
# # │ 3 ┆ 10.0 ┆ false ┆ 9
|
3801
|
-
# # │ 4 ┆ 13.0 ┆ true ┆ 16
|
3802
|
-
# #
|
3648
|
+
# # ┌─────┬──────┬───────┬─────┬──────┬───────┐
|
3649
|
+
# # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
|
3650
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
3651
|
+
# # │ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │
|
3652
|
+
# # ╞═════╪══════╪═══════╪═════╪══════╪═══════╡
|
3653
|
+
# # │ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │
|
3654
|
+
# # │ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │
|
3655
|
+
# # │ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │
|
3656
|
+
# # │ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │
|
3657
|
+
# # └─────┴──────┴───────┴─────┴──────┴───────┘
|
3803
3658
|
#
|
3804
3659
|
# @example Use keyword arguments to easily name your expression inputs.
|
3805
3660
|
# df.with_columns(
|
@@ -5181,7 +5036,7 @@ module Polars
|
|
5181
5036
|
elsif data[0].is_a?(Hash)
|
5182
5037
|
column_names, dtypes = _unpack_schema(columns)
|
5183
5038
|
schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
|
5184
|
-
rbdf = RbDataFrame.
|
5039
|
+
rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, false, infer_schema_length)
|
5185
5040
|
if column_names
|
5186
5041
|
rbdf = _post_apply_columns(rbdf, column_names)
|
5187
5042
|
end
|
@@ -5215,7 +5070,7 @@ module Polars
|
|
5215
5070
|
if unpack_nested
|
5216
5071
|
raise Todo
|
5217
5072
|
else
|
5218
|
-
rbdf = RbDataFrame.
|
5073
|
+
rbdf = RbDataFrame.from_rows(
|
5219
5074
|
data,
|
5220
5075
|
infer_schema_length,
|
5221
5076
|
local_schema_override.any? ? local_schema_override : nil
|