polars-df 0.9.0-arm64-darwin → 0.11.0-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +23 -0
- data/Cargo.lock +144 -57
- data/LICENSE-THIRD-PARTY.txt +629 -29
- data/README.md +7 -6
- data/lib/polars/3.1/polars.bundle +0 -0
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/3.3/polars.bundle +0 -0
- data/lib/polars/array_expr.rb +6 -2
- data/lib/polars/batched_csv_reader.rb +11 -3
- data/lib/polars/convert.rb +6 -1
- data/lib/polars/data_frame.rb +225 -370
- data/lib/polars/date_time_expr.rb +11 -4
- data/lib/polars/date_time_name_space.rb +14 -4
- data/lib/polars/dynamic_group_by.rb +2 -2
- data/lib/polars/exceptions.rb +4 -0
- data/lib/polars/expr.rb +1171 -54
- data/lib/polars/functions/lazy.rb +3 -3
- data/lib/polars/functions/range/date_range.rb +92 -0
- data/lib/polars/functions/range/datetime_range.rb +149 -0
- data/lib/polars/functions/range/time_range.rb +141 -0
- data/lib/polars/functions/whenthen.rb +74 -5
- data/lib/polars/group_by.rb +88 -23
- data/lib/polars/io/avro.rb +24 -0
- data/lib/polars/{io.rb → io/csv.rb} +307 -489
- data/lib/polars/io/database.rb +73 -0
- data/lib/polars/io/ipc.rb +247 -0
- data/lib/polars/io/json.rb +18 -0
- data/lib/polars/io/ndjson.rb +69 -0
- data/lib/polars/io/parquet.rb +226 -0
- data/lib/polars/lazy_frame.rb +55 -195
- data/lib/polars/lazy_group_by.rb +100 -3
- data/lib/polars/list_expr.rb +6 -2
- data/lib/polars/rolling_group_by.rb +2 -2
- data/lib/polars/series.rb +14 -12
- data/lib/polars/string_expr.rb +38 -36
- data/lib/polars/utils.rb +89 -1
- data/lib/polars/version.rb +1 -1
- data/lib/polars/whenthen.rb +83 -0
- data/lib/polars.rb +10 -3
- metadata +13 -6
- data/lib/polars/when.rb +0 -16
- data/lib/polars/when_then.rb +0 -19
data/lib/polars/data_frame.rb
CHANGED
@@ -46,268 +46,6 @@ module Polars
|
|
46
46
|
df
|
47
47
|
end
|
48
48
|
|
49
|
-
# @private
|
50
|
-
def self._from_hashes(data, infer_schema_length: 100, schema: nil, schema_overrides: nil)
|
51
|
-
rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
|
52
|
-
_from_rbdf(rbdf)
|
53
|
-
end
|
54
|
-
|
55
|
-
# @private
|
56
|
-
def self._from_hash(data, schema: nil, schema_overrides: nil)
|
57
|
-
_from_rbdf(hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides))
|
58
|
-
end
|
59
|
-
|
60
|
-
# def self._from_records
|
61
|
-
# end
|
62
|
-
|
63
|
-
# def self._from_numo
|
64
|
-
# end
|
65
|
-
|
66
|
-
# no self._from_arrow
|
67
|
-
|
68
|
-
# no self._from_pandas
|
69
|
-
|
70
|
-
# @private
|
71
|
-
def self._read_csv(
|
72
|
-
file,
|
73
|
-
has_header: true,
|
74
|
-
columns: nil,
|
75
|
-
sep: str = ",",
|
76
|
-
comment_char: nil,
|
77
|
-
quote_char: '"',
|
78
|
-
skip_rows: 0,
|
79
|
-
dtypes: nil,
|
80
|
-
null_values: nil,
|
81
|
-
ignore_errors: false,
|
82
|
-
parse_dates: false,
|
83
|
-
n_threads: nil,
|
84
|
-
infer_schema_length: 100,
|
85
|
-
batch_size: 8192,
|
86
|
-
n_rows: nil,
|
87
|
-
encoding: "utf8",
|
88
|
-
low_memory: false,
|
89
|
-
rechunk: true,
|
90
|
-
skip_rows_after_header: 0,
|
91
|
-
row_count_name: nil,
|
92
|
-
row_count_offset: 0,
|
93
|
-
sample_size: 1024,
|
94
|
-
eol_char: "\n"
|
95
|
-
)
|
96
|
-
if Utils.pathlike?(file)
|
97
|
-
path = Utils.normalise_filepath(file)
|
98
|
-
else
|
99
|
-
path = nil
|
100
|
-
# if defined?(StringIO) && file.is_a?(StringIO)
|
101
|
-
# file = file.string
|
102
|
-
# end
|
103
|
-
end
|
104
|
-
|
105
|
-
dtype_list = nil
|
106
|
-
dtype_slice = nil
|
107
|
-
if !dtypes.nil?
|
108
|
-
if dtypes.is_a?(Hash)
|
109
|
-
dtype_list = []
|
110
|
-
dtypes.each do|k, v|
|
111
|
-
dtype_list << [k, Utils.rb_type_to_dtype(v)]
|
112
|
-
end
|
113
|
-
elsif dtypes.is_a?(::Array)
|
114
|
-
dtype_slice = dtypes
|
115
|
-
else
|
116
|
-
raise ArgumentError, "dtype arg should be list or dict"
|
117
|
-
end
|
118
|
-
end
|
119
|
-
|
120
|
-
processed_null_values = Utils._process_null_values(null_values)
|
121
|
-
|
122
|
-
if columns.is_a?(::String)
|
123
|
-
columns = [columns]
|
124
|
-
end
|
125
|
-
if file.is_a?(::String) && file.include?("*")
|
126
|
-
dtypes_dict = nil
|
127
|
-
if !dtype_list.nil?
|
128
|
-
dtypes_dict = dtype_list.to_h
|
129
|
-
end
|
130
|
-
if !dtype_slice.nil?
|
131
|
-
raise ArgumentError, "cannot use glob patterns and unnamed dtypes as `dtypes` argument; Use dtypes: Mapping[str, Type[DataType]"
|
132
|
-
end
|
133
|
-
scan = Polars.scan_csv(
|
134
|
-
file,
|
135
|
-
has_header: has_header,
|
136
|
-
sep: sep,
|
137
|
-
comment_char: comment_char,
|
138
|
-
quote_char: quote_char,
|
139
|
-
skip_rows: skip_rows,
|
140
|
-
dtypes: dtypes_dict,
|
141
|
-
null_values: null_values,
|
142
|
-
ignore_errors: ignore_errors,
|
143
|
-
infer_schema_length: infer_schema_length,
|
144
|
-
n_rows: n_rows,
|
145
|
-
low_memory: low_memory,
|
146
|
-
rechunk: rechunk,
|
147
|
-
skip_rows_after_header: skip_rows_after_header,
|
148
|
-
row_count_name: row_count_name,
|
149
|
-
row_count_offset: row_count_offset,
|
150
|
-
eol_char: eol_char
|
151
|
-
)
|
152
|
-
if columns.nil?
|
153
|
-
return _from_rbdf(scan.collect._df)
|
154
|
-
elsif is_str_sequence(columns, allow_str: false)
|
155
|
-
return _from_rbdf(scan.select(columns).collect._df)
|
156
|
-
else
|
157
|
-
raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: List[str]"
|
158
|
-
end
|
159
|
-
end
|
160
|
-
|
161
|
-
projection, columns = Utils.handle_projection_columns(columns)
|
162
|
-
|
163
|
-
_from_rbdf(
|
164
|
-
RbDataFrame.read_csv(
|
165
|
-
file,
|
166
|
-
infer_schema_length,
|
167
|
-
batch_size,
|
168
|
-
has_header,
|
169
|
-
ignore_errors,
|
170
|
-
n_rows,
|
171
|
-
skip_rows,
|
172
|
-
projection,
|
173
|
-
sep,
|
174
|
-
rechunk,
|
175
|
-
columns,
|
176
|
-
encoding,
|
177
|
-
n_threads,
|
178
|
-
path,
|
179
|
-
dtype_list,
|
180
|
-
dtype_slice,
|
181
|
-
low_memory,
|
182
|
-
comment_char,
|
183
|
-
quote_char,
|
184
|
-
processed_null_values,
|
185
|
-
parse_dates,
|
186
|
-
skip_rows_after_header,
|
187
|
-
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
188
|
-
sample_size,
|
189
|
-
eol_char
|
190
|
-
)
|
191
|
-
)
|
192
|
-
end
|
193
|
-
|
194
|
-
# @private
|
195
|
-
def self._read_parquet(
|
196
|
-
source,
|
197
|
-
columns: nil,
|
198
|
-
n_rows: nil,
|
199
|
-
parallel: "auto",
|
200
|
-
row_count_name: nil,
|
201
|
-
row_count_offset: 0,
|
202
|
-
low_memory: false,
|
203
|
-
use_statistics: true,
|
204
|
-
rechunk: true
|
205
|
-
)
|
206
|
-
if Utils.pathlike?(source)
|
207
|
-
source = Utils.normalise_filepath(source)
|
208
|
-
end
|
209
|
-
if columns.is_a?(::String)
|
210
|
-
columns = [columns]
|
211
|
-
end
|
212
|
-
|
213
|
-
if source.is_a?(::String) && source.include?("*") && Utils.local_file?(source)
|
214
|
-
scan =
|
215
|
-
Polars.scan_parquet(
|
216
|
-
source,
|
217
|
-
n_rows: n_rows,
|
218
|
-
rechunk: true,
|
219
|
-
parallel: parallel,
|
220
|
-
row_count_name: row_count_name,
|
221
|
-
row_count_offset: row_count_offset,
|
222
|
-
low_memory: low_memory
|
223
|
-
)
|
224
|
-
|
225
|
-
if columns.nil?
|
226
|
-
return self._from_rbdf(scan.collect._df)
|
227
|
-
elsif Utils.is_str_sequence(columns, allow_str: false)
|
228
|
-
return self._from_rbdf(scan.select(columns).collect._df)
|
229
|
-
else
|
230
|
-
raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: Array[String]"
|
231
|
-
end
|
232
|
-
end
|
233
|
-
|
234
|
-
projection, columns = Utils.handle_projection_columns(columns)
|
235
|
-
_from_rbdf(
|
236
|
-
RbDataFrame.read_parquet(
|
237
|
-
source,
|
238
|
-
columns,
|
239
|
-
projection,
|
240
|
-
n_rows,
|
241
|
-
parallel,
|
242
|
-
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
243
|
-
low_memory,
|
244
|
-
use_statistics,
|
245
|
-
rechunk
|
246
|
-
)
|
247
|
-
)
|
248
|
-
end
|
249
|
-
|
250
|
-
# @private
|
251
|
-
def self._read_avro(file, columns: nil, n_rows: nil)
|
252
|
-
if Utils.pathlike?(file)
|
253
|
-
file = Utils.normalise_filepath(file)
|
254
|
-
end
|
255
|
-
projection, columns = Utils.handle_projection_columns(columns)
|
256
|
-
_from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
|
257
|
-
end
|
258
|
-
|
259
|
-
# @private
|
260
|
-
def self._read_ipc(
|
261
|
-
file,
|
262
|
-
columns: nil,
|
263
|
-
n_rows: nil,
|
264
|
-
row_count_name: nil,
|
265
|
-
row_count_offset: 0,
|
266
|
-
rechunk: true,
|
267
|
-
memory_map: true
|
268
|
-
)
|
269
|
-
if Utils.pathlike?(file)
|
270
|
-
file = Utils.normalise_filepath(file)
|
271
|
-
end
|
272
|
-
if columns.is_a?(::String)
|
273
|
-
columns = [columns]
|
274
|
-
end
|
275
|
-
|
276
|
-
if file.is_a?(::String) && file.include?("*")
|
277
|
-
raise Todo
|
278
|
-
end
|
279
|
-
|
280
|
-
projection, columns = Utils.handle_projection_columns(columns)
|
281
|
-
_from_rbdf(
|
282
|
-
RbDataFrame.read_ipc(
|
283
|
-
file,
|
284
|
-
columns,
|
285
|
-
projection,
|
286
|
-
n_rows,
|
287
|
-
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
288
|
-
memory_map
|
289
|
-
)
|
290
|
-
)
|
291
|
-
end
|
292
|
-
|
293
|
-
# @private
|
294
|
-
def self._read_json(file)
|
295
|
-
if Utils.pathlike?(file)
|
296
|
-
file = Utils.normalise_filepath(file)
|
297
|
-
end
|
298
|
-
|
299
|
-
_from_rbdf(RbDataFrame.read_json(file))
|
300
|
-
end
|
301
|
-
|
302
|
-
# @private
|
303
|
-
def self._read_ndjson(file)
|
304
|
-
if Utils.pathlike?(file)
|
305
|
-
file = Utils.normalise_filepath(file)
|
306
|
-
end
|
307
|
-
|
308
|
-
_from_rbdf(RbDataFrame.read_ndjson(file))
|
309
|
-
end
|
310
|
-
|
311
49
|
# Get the shape of the DataFrame.
|
312
50
|
#
|
313
51
|
# @return [Array]
|
@@ -416,6 +154,13 @@ module Polars
|
|
416
154
|
_df.dtypes
|
417
155
|
end
|
418
156
|
|
157
|
+
# Get flags that are set on the columns of this DataFrame.
|
158
|
+
#
|
159
|
+
# @return [Hash]
|
160
|
+
def flags
|
161
|
+
columns.to_h { |name| [name, self[name].flags] }
|
162
|
+
end
|
163
|
+
|
419
164
|
# Get the schema.
|
420
165
|
#
|
421
166
|
# @return [Hash]
|
@@ -814,8 +559,6 @@ module Polars
|
|
814
559
|
|
815
560
|
# Serialize to JSON representation.
|
816
561
|
#
|
817
|
-
# @return [nil]
|
818
|
-
#
|
819
562
|
# @param file [String]
|
820
563
|
# File path to which the result should be written.
|
821
564
|
# @param pretty [Boolean]
|
@@ -823,17 +566,45 @@ module Polars
|
|
823
566
|
# @param row_oriented [Boolean]
|
824
567
|
# Write to row oriented json. This is slower, but more common.
|
825
568
|
#
|
826
|
-
# @
|
569
|
+
# @return [nil]
|
570
|
+
#
|
571
|
+
# @example
|
572
|
+
# df = Polars::DataFrame.new(
|
573
|
+
# {
|
574
|
+
# "foo" => [1, 2, 3],
|
575
|
+
# "bar" => [6, 7, 8]
|
576
|
+
# }
|
577
|
+
# )
|
578
|
+
# df.write_json
|
579
|
+
# # => "{\"columns\":[{\"name\":\"foo\",\"datatype\":\"Int64\",\"bit_settings\":\"\",\"values\":[1,2,3]},{\"name\":\"bar\",\"datatype\":\"Int64\",\"bit_settings\":\"\",\"values\":[6,7,8]}]}"
|
580
|
+
#
|
581
|
+
# @example
|
582
|
+
# df.write_json(row_oriented: true)
|
583
|
+
# # => "[{\"foo\":1,\"bar\":6},{\"foo\":2,\"bar\":7},{\"foo\":3,\"bar\":8}]"
|
827
584
|
def write_json(
|
828
|
-
file,
|
585
|
+
file = nil,
|
829
586
|
pretty: false,
|
830
587
|
row_oriented: false
|
831
588
|
)
|
832
589
|
if Utils.pathlike?(file)
|
833
|
-
file = Utils.
|
590
|
+
file = Utils.normalize_filepath(file)
|
591
|
+
end
|
592
|
+
to_string_io = !file.nil? && file.is_a?(StringIO)
|
593
|
+
if file.nil? || to_string_io
|
594
|
+
buf = StringIO.new
|
595
|
+
buf.set_encoding(Encoding::BINARY)
|
596
|
+
_df.write_json(buf, pretty, row_oriented)
|
597
|
+
json_bytes = buf.string
|
598
|
+
|
599
|
+
json_str = json_bytes.force_encoding(Encoding::UTF_8)
|
600
|
+
if to_string_io
|
601
|
+
file.write(json_str)
|
602
|
+
else
|
603
|
+
return json_str
|
604
|
+
end
|
605
|
+
else
|
606
|
+
_df.write_json(file, pretty, row_oriented)
|
834
607
|
end
|
835
|
-
|
836
|
-
_df.write_json(file, pretty, row_oriented)
|
837
608
|
nil
|
838
609
|
end
|
839
610
|
|
@@ -843,12 +614,36 @@ module Polars
|
|
843
614
|
# File path to which the result should be written.
|
844
615
|
#
|
845
616
|
# @return [nil]
|
846
|
-
|
617
|
+
#
|
618
|
+
# @example
|
619
|
+
# df = Polars::DataFrame.new(
|
620
|
+
# {
|
621
|
+
# "foo" => [1, 2, 3],
|
622
|
+
# "bar" => [6, 7, 8]
|
623
|
+
# }
|
624
|
+
# )
|
625
|
+
# df.write_ndjson()
|
626
|
+
# # => "{\"foo\":1,\"bar\":6}\n{\"foo\":2,\"bar\":7}\n{\"foo\":3,\"bar\":8}\n"
|
627
|
+
def write_ndjson(file = nil)
|
847
628
|
if Utils.pathlike?(file)
|
848
|
-
file = Utils.
|
629
|
+
file = Utils.normalize_filepath(file)
|
630
|
+
end
|
631
|
+
to_string_io = !file.nil? && file.is_a?(StringIO)
|
632
|
+
if file.nil? || to_string_io
|
633
|
+
buf = StringIO.new
|
634
|
+
buf.set_encoding(Encoding::BINARY)
|
635
|
+
_df.write_ndjson(buf)
|
636
|
+
json_bytes = buf.string
|
637
|
+
|
638
|
+
json_str = json_bytes.force_encoding(Encoding::UTF_8)
|
639
|
+
if to_string_io
|
640
|
+
file.write(json_str)
|
641
|
+
else
|
642
|
+
return json_str
|
643
|
+
end
|
644
|
+
else
|
645
|
+
_df.write_ndjson(file)
|
849
646
|
end
|
850
|
-
|
851
|
-
_df.write_ndjson(file)
|
852
647
|
nil
|
853
648
|
end
|
854
649
|
|
@@ -938,7 +733,7 @@ module Polars
|
|
938
733
|
end
|
939
734
|
|
940
735
|
if Utils.pathlike?(file)
|
941
|
-
file = Utils.
|
736
|
+
file = Utils.normalize_filepath(file)
|
942
737
|
end
|
943
738
|
|
944
739
|
_df.write_csv(
|
@@ -976,7 +771,7 @@ module Polars
|
|
976
771
|
compression = "uncompressed"
|
977
772
|
end
|
978
773
|
if Utils.pathlike?(file)
|
979
|
-
file = Utils.
|
774
|
+
file = Utils.normalize_filepath(file)
|
980
775
|
end
|
981
776
|
|
982
777
|
_df.write_avro(file, compression)
|
@@ -997,7 +792,7 @@ module Polars
|
|
997
792
|
file.set_encoding(Encoding::BINARY)
|
998
793
|
end
|
999
794
|
if Utils.pathlike?(file)
|
1000
|
-
file = Utils.
|
795
|
+
file = Utils.normalize_filepath(file)
|
1001
796
|
end
|
1002
797
|
|
1003
798
|
if compression.nil?
|
@@ -1008,9 +803,50 @@ module Polars
|
|
1008
803
|
return_bytes ? file.string : nil
|
1009
804
|
end
|
1010
805
|
|
806
|
+
# Write to Arrow IPC record batch stream.
|
807
|
+
#
|
808
|
+
# See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html.
|
809
|
+
#
|
810
|
+
# @param file [Object]
|
811
|
+
# Path or writable file-like object to which the IPC record batch data will
|
812
|
+
# be written. If set to `None`, the output is returned as a BytesIO object.
|
813
|
+
# @param compression ['uncompressed', 'lz4', 'zstd']
|
814
|
+
# Compression method. Defaults to "uncompressed".
|
815
|
+
#
|
816
|
+
# @return [Object]
|
817
|
+
#
|
818
|
+
# @example
|
819
|
+
# df = Polars::DataFrame.new(
|
820
|
+
# {
|
821
|
+
# "foo" => [1, 2, 3, 4, 5],
|
822
|
+
# "bar" => [6, 7, 8, 9, 10],
|
823
|
+
# "ham" => ["a", "b", "c", "d", "e"]
|
824
|
+
# }
|
825
|
+
# )
|
826
|
+
# df.write_ipc_stream("new_file.arrow")
|
827
|
+
def write_ipc_stream(
|
828
|
+
file,
|
829
|
+
compression: "uncompressed"
|
830
|
+
)
|
831
|
+
return_bytes = file.nil?
|
832
|
+
if return_bytes
|
833
|
+
file = StringIO.new
|
834
|
+
file.set_encoding(Encoding::BINARY)
|
835
|
+
elsif Utils.pathlike?(file)
|
836
|
+
file = Utils.normalize_filepath(file)
|
837
|
+
end
|
838
|
+
|
839
|
+
if compression.nil?
|
840
|
+
compression = "uncompressed"
|
841
|
+
end
|
842
|
+
|
843
|
+
_df.write_ipc_stream(file, compression)
|
844
|
+
return_bytes ? file.string : nil
|
845
|
+
end
|
846
|
+
|
1011
847
|
# Write to Apache Parquet file.
|
1012
848
|
#
|
1013
|
-
# @param file [String]
|
849
|
+
# @param file [String, Pathname, StringIO]
|
1014
850
|
# File path to which the file should be written.
|
1015
851
|
# @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
|
1016
852
|
# Choose "zstd" for good compression performance.
|
@@ -1027,10 +863,9 @@ module Polars
|
|
1027
863
|
# @param statistics [Boolean]
|
1028
864
|
# Write statistics to the parquet headers. This requires extra compute.
|
1029
865
|
# @param row_group_size [Integer, nil]
|
1030
|
-
# Size of the row groups in number of rows.
|
1031
|
-
#
|
1032
|
-
#
|
1033
|
-
# writing speeds.
|
866
|
+
# Size of the row groups in number of rows. Defaults to 512^2 rows.
|
867
|
+
# @param data_page_size [Integer, nil]
|
868
|
+
# Size of the data page in bytes. Defaults to 1024^2 bytes.
|
1034
869
|
#
|
1035
870
|
# @return [nil]
|
1036
871
|
def write_parquet(
|
@@ -1038,17 +873,18 @@ module Polars
|
|
1038
873
|
compression: "zstd",
|
1039
874
|
compression_level: nil,
|
1040
875
|
statistics: false,
|
1041
|
-
row_group_size: nil
|
876
|
+
row_group_size: nil,
|
877
|
+
data_page_size: nil
|
1042
878
|
)
|
1043
879
|
if compression.nil?
|
1044
880
|
compression = "uncompressed"
|
1045
881
|
end
|
1046
882
|
if Utils.pathlike?(file)
|
1047
|
-
file = Utils.
|
883
|
+
file = Utils.normalize_filepath(file)
|
1048
884
|
end
|
1049
885
|
|
1050
886
|
_df.write_parquet(
|
1051
|
-
file, compression, compression_level, statistics, row_group_size
|
887
|
+
file, compression, compression_level, statistics, row_group_size, data_page_size
|
1052
888
|
)
|
1053
889
|
end
|
1054
890
|
|
@@ -1084,7 +920,7 @@ module Polars
|
|
1084
920
|
# df.estimated_size
|
1085
921
|
# # => 25888898
|
1086
922
|
# df.estimated_size("mb")
|
1087
|
-
# # =>
|
923
|
+
# # => 17.0601749420166
|
1088
924
|
def estimated_size(unit = "b")
|
1089
925
|
sz = _df.estimated_size
|
1090
926
|
Utils.scale_bytes(sz, to: unit)
|
@@ -1720,10 +1556,7 @@ module Polars
|
|
1720
1556
|
# # │ 3 ┆ 8 ┆ c │
|
1721
1557
|
# # └─────┴─────┴─────┘
|
1722
1558
|
def drop_nulls(subset: nil)
|
1723
|
-
|
1724
|
-
subset = [subset]
|
1725
|
-
end
|
1726
|
-
_from_rbdf(_df.drop_nulls(subset))
|
1559
|
+
lazy.drop_nulls(subset: subset).collect(_eager: true)
|
1727
1560
|
end
|
1728
1561
|
|
1729
1562
|
# Offers a structured way to apply a sequence of user-defined functions (UDFs).
|
@@ -1785,16 +1618,16 @@ module Polars
|
|
1785
1618
|
# df.with_row_index
|
1786
1619
|
# # =>
|
1787
1620
|
# # shape: (3, 3)
|
1788
|
-
# #
|
1789
|
-
# # │
|
1790
|
-
# # │ ---
|
1791
|
-
# # │ u32
|
1792
|
-
# #
|
1793
|
-
# # │ 0
|
1794
|
-
# # │ 1
|
1795
|
-
# # │ 2
|
1796
|
-
# #
|
1797
|
-
def with_row_index(name: "
|
1621
|
+
# # ┌───────┬─────┬─────┐
|
1622
|
+
# # │ index ┆ a ┆ b │
|
1623
|
+
# # │ --- ┆ --- ┆ --- │
|
1624
|
+
# # │ u32 ┆ i64 ┆ i64 │
|
1625
|
+
# # ╞═══════╪═════╪═════╡
|
1626
|
+
# # │ 0 ┆ 1 ┆ 2 │
|
1627
|
+
# # │ 1 ┆ 3 ┆ 4 │
|
1628
|
+
# # │ 2 ┆ 5 ┆ 6 │
|
1629
|
+
# # └───────┴─────┴─────┘
|
1630
|
+
def with_row_index(name: "index", offset: 0)
|
1798
1631
|
_from_rbdf(_df.with_row_index(name, offset))
|
1799
1632
|
end
|
1800
1633
|
alias_method :with_row_count, :with_row_index
|
@@ -2083,16 +1916,16 @@ module Polars
|
|
2083
1916
|
# )
|
2084
1917
|
# # =>
|
2085
1918
|
# # shape: (4, 3)
|
2086
|
-
# #
|
2087
|
-
# # │ time ┆ time_count ┆ time_agg_list
|
2088
|
-
# # │ --- ┆ --- ┆ ---
|
2089
|
-
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
|
2090
|
-
# #
|
2091
|
-
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12
|
2092
|
-
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12
|
2093
|
-
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12
|
2094
|
-
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
|
2095
|
-
# #
|
1919
|
+
# # ┌─────────────────────┬────────────┬─────────────────────────────────┐
|
1920
|
+
# # │ time ┆ time_count ┆ time_agg_list │
|
1921
|
+
# # │ --- ┆ --- ┆ --- │
|
1922
|
+
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
|
1923
|
+
# # ╞═════════════════════╪════════════╪═════════════════════════════════╡
|
1924
|
+
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-… │
|
1925
|
+
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-… │
|
1926
|
+
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-… │
|
1927
|
+
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
|
1928
|
+
# # └─────────────────────┴────────────┴─────────────────────────────────┘
|
2096
1929
|
#
|
2097
1930
|
# @example When closed="both" the time values at the window boundaries belong to 2 groups.
|
2098
1931
|
# df.group_by_dynamic("time", every: "1h", closed: "both").agg(
|
@@ -2161,12 +1994,13 @@ module Polars
|
|
2161
1994
|
# closed: "right"
|
2162
1995
|
# ).agg(Polars.col("A").alias("A_agg_list"))
|
2163
1996
|
# # =>
|
2164
|
-
# # shape: (
|
1997
|
+
# # shape: (4, 4)
|
2165
1998
|
# # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
|
2166
1999
|
# # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
|
2167
2000
|
# # │ --- ┆ --- ┆ --- ┆ --- │
|
2168
2001
|
# # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
|
2169
2002
|
# # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
|
2003
|
+
# # │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │
|
2170
2004
|
# # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
|
2171
2005
|
# # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
|
2172
2006
|
# # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
|
@@ -2566,7 +2400,7 @@ module Polars
|
|
2566
2400
|
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})
|
2567
2401
|
#
|
2568
2402
|
# @example Return a DataFrame by mapping each row to a tuple:
|
2569
|
-
# df.
|
2403
|
+
# df.map_rows { |t| [t[0] * 2, t[1] * 3] }
|
2570
2404
|
# # =>
|
2571
2405
|
# # shape: (3, 2)
|
2572
2406
|
# # ┌──────────┬──────────┐
|
@@ -2580,7 +2414,7 @@ module Polars
|
|
2580
2414
|
# # └──────────┴──────────┘
|
2581
2415
|
#
|
2582
2416
|
# @example Return a Series by mapping each row to a scalar:
|
2583
|
-
# df.
|
2417
|
+
# df.map_rows { |t| t[0] * 2 + t[1] }
|
2584
2418
|
# # =>
|
2585
2419
|
# # shape: (3, 1)
|
2586
2420
|
# # ┌───────┐
|
@@ -2592,14 +2426,15 @@ module Polars
|
|
2592
2426
|
# # │ 9 │
|
2593
2427
|
# # │ 14 │
|
2594
2428
|
# # └───────┘
|
2595
|
-
def
|
2596
|
-
out, is_df = _df.
|
2429
|
+
def map_rows(return_dtype: nil, inference_size: 256, &f)
|
2430
|
+
out, is_df = _df.map_rows(f, return_dtype, inference_size)
|
2597
2431
|
if is_df
|
2598
2432
|
_from_rbdf(out)
|
2599
2433
|
else
|
2600
2434
|
_from_rbdf(Utils.wrap_s(out).to_frame._df)
|
2601
2435
|
end
|
2602
2436
|
end
|
2437
|
+
alias_method :apply, :map_rows
|
2603
2438
|
|
2604
2439
|
# Return a new DataFrame with the column added or replaced.
|
2605
2440
|
#
|
@@ -2621,26 +2456,26 @@ module Polars
|
|
2621
2456
|
# # ┌─────┬─────┬───────────┐
|
2622
2457
|
# # │ a ┆ b ┆ b_squared │
|
2623
2458
|
# # │ --- ┆ --- ┆ --- │
|
2624
|
-
# # │ i64 ┆ i64 ┆
|
2459
|
+
# # │ i64 ┆ i64 ┆ i64 │
|
2625
2460
|
# # ╞═════╪═════╪═══════════╡
|
2626
|
-
# # │ 1 ┆ 2 ┆ 4
|
2627
|
-
# # │ 3 ┆ 4 ┆ 16
|
2628
|
-
# # │ 5 ┆ 6 ┆ 36
|
2461
|
+
# # │ 1 ┆ 2 ┆ 4 │
|
2462
|
+
# # │ 3 ┆ 4 ┆ 16 │
|
2463
|
+
# # │ 5 ┆ 6 ┆ 36 │
|
2629
2464
|
# # └─────┴─────┴───────────┘
|
2630
2465
|
#
|
2631
2466
|
# @example Replaced
|
2632
2467
|
# df.with_column(Polars.col("a") ** 2)
|
2633
2468
|
# # =>
|
2634
2469
|
# # shape: (3, 2)
|
2635
|
-
# #
|
2636
|
-
# # │ a
|
2637
|
-
# # │ ---
|
2638
|
-
# # │
|
2639
|
-
# #
|
2640
|
-
# # │ 1
|
2641
|
-
# # │ 9
|
2642
|
-
# # │ 25
|
2643
|
-
# #
|
2470
|
+
# # ┌─────┬─────┐
|
2471
|
+
# # │ a ┆ b │
|
2472
|
+
# # │ --- ┆ --- │
|
2473
|
+
# # │ i64 ┆ i64 │
|
2474
|
+
# # ╞═════╪═════╡
|
2475
|
+
# # │ 1 ┆ 2 │
|
2476
|
+
# # │ 9 ┆ 4 │
|
2477
|
+
# # │ 25 ┆ 6 │
|
2478
|
+
# # └─────┴─────┘
|
2644
2479
|
def with_column(column)
|
2645
2480
|
lazy
|
2646
2481
|
.with_column(column)
|
@@ -2807,16 +2642,36 @@ module Polars
|
|
2807
2642
|
# # │ 2 ┆ 7.0 │
|
2808
2643
|
# # │ 3 ┆ 8.0 │
|
2809
2644
|
# # └─────┴─────┘
|
2810
|
-
|
2811
|
-
|
2812
|
-
|
2813
|
-
|
2814
|
-
|
2815
|
-
|
2816
|
-
|
2817
|
-
|
2818
|
-
|
2819
|
-
|
2645
|
+
#
|
2646
|
+
# @example Drop multiple columns by passing a list of column names.
|
2647
|
+
# df.drop(["bar", "ham"])
|
2648
|
+
# # =>
|
2649
|
+
# # shape: (3, 1)
|
2650
|
+
# # ┌─────┐
|
2651
|
+
# # │ foo │
|
2652
|
+
# # │ --- │
|
2653
|
+
# # │ i64 │
|
2654
|
+
# # ╞═════╡
|
2655
|
+
# # │ 1 │
|
2656
|
+
# # │ 2 │
|
2657
|
+
# # │ 3 │
|
2658
|
+
# # └─────┘
|
2659
|
+
#
|
2660
|
+
# @example Use positional arguments to drop multiple columns.
|
2661
|
+
# df.drop("foo", "ham")
|
2662
|
+
# # =>
|
2663
|
+
# # shape: (3, 1)
|
2664
|
+
# # ┌─────┐
|
2665
|
+
# # │ bar │
|
2666
|
+
# # │ --- │
|
2667
|
+
# # │ f64 │
|
2668
|
+
# # ╞═════╡
|
2669
|
+
# # │ 6.0 │
|
2670
|
+
# # │ 7.0 │
|
2671
|
+
# # │ 8.0 │
|
2672
|
+
# # └─────┘
|
2673
|
+
def drop(*columns)
|
2674
|
+
lazy.drop(*columns).collect(_eager: true)
|
2820
2675
|
end
|
2821
2676
|
|
2822
2677
|
# Drop in place.
|
@@ -3700,7 +3555,7 @@ module Polars
|
|
3700
3555
|
# # ┌─────────┐
|
3701
3556
|
# # │ literal │
|
3702
3557
|
# # │ --- │
|
3703
|
-
# # │
|
3558
|
+
# # │ i32 │
|
3704
3559
|
# # ╞═════════╡
|
3705
3560
|
# # │ 0 │
|
3706
3561
|
# # │ 0 │
|
@@ -3735,16 +3590,16 @@ module Polars
|
|
3735
3590
|
# df.with_columns((Polars.col("a") ** 2).alias("a^2"))
|
3736
3591
|
# # =>
|
3737
3592
|
# # shape: (4, 4)
|
3738
|
-
# #
|
3739
|
-
# # │ a ┆ b ┆ c ┆ a^2
|
3740
|
-
# # │ --- ┆ --- ┆ --- ┆ ---
|
3741
|
-
# # │ i64 ┆ f64 ┆ bool ┆
|
3742
|
-
# #
|
3743
|
-
# # │ 1 ┆ 0.5 ┆ true ┆ 1
|
3744
|
-
# # │ 2 ┆ 4.0 ┆ true ┆ 4
|
3745
|
-
# # │ 3 ┆ 10.0 ┆ false ┆ 9
|
3746
|
-
# # │ 4 ┆ 13.0 ┆ true ┆ 16
|
3747
|
-
# #
|
3593
|
+
# # ┌─────┬──────┬───────┬─────┐
|
3594
|
+
# # │ a ┆ b ┆ c ┆ a^2 │
|
3595
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
3596
|
+
# # │ i64 ┆ f64 ┆ bool ┆ i64 │
|
3597
|
+
# # ╞═════╪══════╪═══════╪═════╡
|
3598
|
+
# # │ 1 ┆ 0.5 ┆ true ┆ 1 │
|
3599
|
+
# # │ 2 ┆ 4.0 ┆ true ┆ 4 │
|
3600
|
+
# # │ 3 ┆ 10.0 ┆ false ┆ 9 │
|
3601
|
+
# # │ 4 ┆ 13.0 ┆ true ┆ 16 │
|
3602
|
+
# # └─────┴──────┴───────┴─────┘
|
3748
3603
|
#
|
3749
3604
|
# @example Added columns will replace existing columns with the same name.
|
3750
3605
|
# df.with_columns(Polars.col("a").cast(Polars::Float64))
|
@@ -3771,16 +3626,16 @@ module Polars
|
|
3771
3626
|
# )
|
3772
3627
|
# # =>
|
3773
3628
|
# # shape: (4, 6)
|
3774
|
-
# #
|
3775
|
-
# # │ a ┆ b ┆ c ┆ a^2
|
3776
|
-
# # │ --- ┆ --- ┆ --- ┆ ---
|
3777
|
-
# # │ i64 ┆ f64 ┆ bool ┆
|
3778
|
-
# #
|
3779
|
-
# # │ 1 ┆ 0.5 ┆ true ┆ 1
|
3780
|
-
# # │ 2 ┆ 4.0 ┆ true ┆ 4
|
3781
|
-
# # │ 3 ┆ 10.0 ┆ false ┆ 9
|
3782
|
-
# # │ 4 ┆ 13.0 ┆ true ┆ 16
|
3783
|
-
# #
|
3629
|
+
# # ┌─────┬──────┬───────┬─────┬──────┬───────┐
|
3630
|
+
# # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
|
3631
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
3632
|
+
# # │ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │
|
3633
|
+
# # ╞═════╪══════╪═══════╪═════╪══════╪═══════╡
|
3634
|
+
# # │ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │
|
3635
|
+
# # │ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │
|
3636
|
+
# # │ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │
|
3637
|
+
# # │ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │
|
3638
|
+
# # └─────┴──────┴───────┴─────┴──────┴───────┘
|
3784
3639
|
#
|
3785
3640
|
# @example Multiple columns also can be added using positional arguments instead of a list.
|
3786
3641
|
# df.with_columns(
|
@@ -3790,16 +3645,16 @@ module Polars
|
|
3790
3645
|
# )
|
3791
3646
|
# # =>
|
3792
3647
|
# # shape: (4, 6)
|
3793
|
-
# #
|
3794
|
-
# # │ a ┆ b ┆ c ┆ a^2
|
3795
|
-
# # │ --- ┆ --- ┆ --- ┆ ---
|
3796
|
-
# # │ i64 ┆ f64 ┆ bool ┆
|
3797
|
-
# #
|
3798
|
-
# # │ 1 ┆ 0.5 ┆ true ┆ 1
|
3799
|
-
# # │ 2 ┆ 4.0 ┆ true ┆ 4
|
3800
|
-
# # │ 3 ┆ 10.0 ┆ false ┆ 9
|
3801
|
-
# # │ 4 ┆ 13.0 ┆ true ┆ 16
|
3802
|
-
# #
|
3648
|
+
# # ┌─────┬──────┬───────┬─────┬──────┬───────┐
|
3649
|
+
# # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
|
3650
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
3651
|
+
# # │ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │
|
3652
|
+
# # ╞═════╪══════╪═══════╪═════╪══════╪═══════╡
|
3653
|
+
# # │ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │
|
3654
|
+
# # │ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │
|
3655
|
+
# # │ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │
|
3656
|
+
# # │ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │
|
3657
|
+
# # └─────┴──────┴───────┴─────┴──────┴───────┘
|
3803
3658
|
#
|
3804
3659
|
# @example Use keyword arguments to easily name your expression inputs.
|
3805
3660
|
# df.with_columns(
|
@@ -5181,7 +5036,7 @@ module Polars
|
|
5181
5036
|
elsif data[0].is_a?(Hash)
|
5182
5037
|
column_names, dtypes = _unpack_schema(columns)
|
5183
5038
|
schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
|
5184
|
-
rbdf = RbDataFrame.
|
5039
|
+
rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, false, infer_schema_length)
|
5185
5040
|
if column_names
|
5186
5041
|
rbdf = _post_apply_columns(rbdf, column_names)
|
5187
5042
|
end
|
@@ -5215,7 +5070,7 @@ module Polars
|
|
5215
5070
|
if unpack_nested
|
5216
5071
|
raise Todo
|
5217
5072
|
else
|
5218
|
-
rbdf = RbDataFrame.
|
5073
|
+
rbdf = RbDataFrame.from_rows(
|
5219
5074
|
data,
|
5220
5075
|
infer_schema_length,
|
5221
5076
|
local_schema_override.any? ? local_schema_override : nil
|