polars-df 0.9.0-x86_64-darwin → 0.11.0-x86_64-darwin

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +23 -0
  3. data/Cargo.lock +144 -57
  4. data/LICENSE-THIRD-PARTY.txt +629 -29
  5. data/README.md +7 -6
  6. data/lib/polars/3.1/polars.bundle +0 -0
  7. data/lib/polars/3.2/polars.bundle +0 -0
  8. data/lib/polars/3.3/polars.bundle +0 -0
  9. data/lib/polars/array_expr.rb +6 -2
  10. data/lib/polars/batched_csv_reader.rb +11 -3
  11. data/lib/polars/convert.rb +6 -1
  12. data/lib/polars/data_frame.rb +225 -370
  13. data/lib/polars/date_time_expr.rb +11 -4
  14. data/lib/polars/date_time_name_space.rb +14 -4
  15. data/lib/polars/dynamic_group_by.rb +2 -2
  16. data/lib/polars/exceptions.rb +4 -0
  17. data/lib/polars/expr.rb +1171 -54
  18. data/lib/polars/functions/lazy.rb +3 -3
  19. data/lib/polars/functions/range/date_range.rb +92 -0
  20. data/lib/polars/functions/range/datetime_range.rb +149 -0
  21. data/lib/polars/functions/range/time_range.rb +141 -0
  22. data/lib/polars/functions/whenthen.rb +74 -5
  23. data/lib/polars/group_by.rb +88 -23
  24. data/lib/polars/io/avro.rb +24 -0
  25. data/lib/polars/{io.rb → io/csv.rb} +307 -489
  26. data/lib/polars/io/database.rb +73 -0
  27. data/lib/polars/io/ipc.rb +247 -0
  28. data/lib/polars/io/json.rb +18 -0
  29. data/lib/polars/io/ndjson.rb +69 -0
  30. data/lib/polars/io/parquet.rb +226 -0
  31. data/lib/polars/lazy_frame.rb +55 -195
  32. data/lib/polars/lazy_group_by.rb +100 -3
  33. data/lib/polars/list_expr.rb +6 -2
  34. data/lib/polars/rolling_group_by.rb +2 -2
  35. data/lib/polars/series.rb +14 -12
  36. data/lib/polars/string_expr.rb +38 -36
  37. data/lib/polars/utils.rb +89 -1
  38. data/lib/polars/version.rb +1 -1
  39. data/lib/polars/whenthen.rb +83 -0
  40. data/lib/polars.rb +10 -3
  41. metadata +13 -6
  42. data/lib/polars/when.rb +0 -16
  43. data/lib/polars/when_then.rb +0 -19
@@ -46,268 +46,6 @@ module Polars
46
46
  df
47
47
  end
48
48
 
49
- # @private
50
- def self._from_hashes(data, infer_schema_length: 100, schema: nil, schema_overrides: nil)
51
- rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
52
- _from_rbdf(rbdf)
53
- end
54
-
55
- # @private
56
- def self._from_hash(data, schema: nil, schema_overrides: nil)
57
- _from_rbdf(hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides))
58
- end
59
-
60
- # def self._from_records
61
- # end
62
-
63
- # def self._from_numo
64
- # end
65
-
66
- # no self._from_arrow
67
-
68
- # no self._from_pandas
69
-
70
- # @private
71
- def self._read_csv(
72
- file,
73
- has_header: true,
74
- columns: nil,
75
- sep: str = ",",
76
- comment_char: nil,
77
- quote_char: '"',
78
- skip_rows: 0,
79
- dtypes: nil,
80
- null_values: nil,
81
- ignore_errors: false,
82
- parse_dates: false,
83
- n_threads: nil,
84
- infer_schema_length: 100,
85
- batch_size: 8192,
86
- n_rows: nil,
87
- encoding: "utf8",
88
- low_memory: false,
89
- rechunk: true,
90
- skip_rows_after_header: 0,
91
- row_count_name: nil,
92
- row_count_offset: 0,
93
- sample_size: 1024,
94
- eol_char: "\n"
95
- )
96
- if Utils.pathlike?(file)
97
- path = Utils.normalise_filepath(file)
98
- else
99
- path = nil
100
- # if defined?(StringIO) && file.is_a?(StringIO)
101
- # file = file.string
102
- # end
103
- end
104
-
105
- dtype_list = nil
106
- dtype_slice = nil
107
- if !dtypes.nil?
108
- if dtypes.is_a?(Hash)
109
- dtype_list = []
110
- dtypes.each do|k, v|
111
- dtype_list << [k, Utils.rb_type_to_dtype(v)]
112
- end
113
- elsif dtypes.is_a?(::Array)
114
- dtype_slice = dtypes
115
- else
116
- raise ArgumentError, "dtype arg should be list or dict"
117
- end
118
- end
119
-
120
- processed_null_values = Utils._process_null_values(null_values)
121
-
122
- if columns.is_a?(::String)
123
- columns = [columns]
124
- end
125
- if file.is_a?(::String) && file.include?("*")
126
- dtypes_dict = nil
127
- if !dtype_list.nil?
128
- dtypes_dict = dtype_list.to_h
129
- end
130
- if !dtype_slice.nil?
131
- raise ArgumentError, "cannot use glob patterns and unnamed dtypes as `dtypes` argument; Use dtypes: Mapping[str, Type[DataType]"
132
- end
133
- scan = Polars.scan_csv(
134
- file,
135
- has_header: has_header,
136
- sep: sep,
137
- comment_char: comment_char,
138
- quote_char: quote_char,
139
- skip_rows: skip_rows,
140
- dtypes: dtypes_dict,
141
- null_values: null_values,
142
- ignore_errors: ignore_errors,
143
- infer_schema_length: infer_schema_length,
144
- n_rows: n_rows,
145
- low_memory: low_memory,
146
- rechunk: rechunk,
147
- skip_rows_after_header: skip_rows_after_header,
148
- row_count_name: row_count_name,
149
- row_count_offset: row_count_offset,
150
- eol_char: eol_char
151
- )
152
- if columns.nil?
153
- return _from_rbdf(scan.collect._df)
154
- elsif is_str_sequence(columns, allow_str: false)
155
- return _from_rbdf(scan.select(columns).collect._df)
156
- else
157
- raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: List[str]"
158
- end
159
- end
160
-
161
- projection, columns = Utils.handle_projection_columns(columns)
162
-
163
- _from_rbdf(
164
- RbDataFrame.read_csv(
165
- file,
166
- infer_schema_length,
167
- batch_size,
168
- has_header,
169
- ignore_errors,
170
- n_rows,
171
- skip_rows,
172
- projection,
173
- sep,
174
- rechunk,
175
- columns,
176
- encoding,
177
- n_threads,
178
- path,
179
- dtype_list,
180
- dtype_slice,
181
- low_memory,
182
- comment_char,
183
- quote_char,
184
- processed_null_values,
185
- parse_dates,
186
- skip_rows_after_header,
187
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
188
- sample_size,
189
- eol_char
190
- )
191
- )
192
- end
193
-
194
- # @private
195
- def self._read_parquet(
196
- source,
197
- columns: nil,
198
- n_rows: nil,
199
- parallel: "auto",
200
- row_count_name: nil,
201
- row_count_offset: 0,
202
- low_memory: false,
203
- use_statistics: true,
204
- rechunk: true
205
- )
206
- if Utils.pathlike?(source)
207
- source = Utils.normalise_filepath(source)
208
- end
209
- if columns.is_a?(::String)
210
- columns = [columns]
211
- end
212
-
213
- if source.is_a?(::String) && source.include?("*") && Utils.local_file?(source)
214
- scan =
215
- Polars.scan_parquet(
216
- source,
217
- n_rows: n_rows,
218
- rechunk: true,
219
- parallel: parallel,
220
- row_count_name: row_count_name,
221
- row_count_offset: row_count_offset,
222
- low_memory: low_memory
223
- )
224
-
225
- if columns.nil?
226
- return self._from_rbdf(scan.collect._df)
227
- elsif Utils.is_str_sequence(columns, allow_str: false)
228
- return self._from_rbdf(scan.select(columns).collect._df)
229
- else
230
- raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: Array[String]"
231
- end
232
- end
233
-
234
- projection, columns = Utils.handle_projection_columns(columns)
235
- _from_rbdf(
236
- RbDataFrame.read_parquet(
237
- source,
238
- columns,
239
- projection,
240
- n_rows,
241
- parallel,
242
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
243
- low_memory,
244
- use_statistics,
245
- rechunk
246
- )
247
- )
248
- end
249
-
250
- # @private
251
- def self._read_avro(file, columns: nil, n_rows: nil)
252
- if Utils.pathlike?(file)
253
- file = Utils.normalise_filepath(file)
254
- end
255
- projection, columns = Utils.handle_projection_columns(columns)
256
- _from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
257
- end
258
-
259
- # @private
260
- def self._read_ipc(
261
- file,
262
- columns: nil,
263
- n_rows: nil,
264
- row_count_name: nil,
265
- row_count_offset: 0,
266
- rechunk: true,
267
- memory_map: true
268
- )
269
- if Utils.pathlike?(file)
270
- file = Utils.normalise_filepath(file)
271
- end
272
- if columns.is_a?(::String)
273
- columns = [columns]
274
- end
275
-
276
- if file.is_a?(::String) && file.include?("*")
277
- raise Todo
278
- end
279
-
280
- projection, columns = Utils.handle_projection_columns(columns)
281
- _from_rbdf(
282
- RbDataFrame.read_ipc(
283
- file,
284
- columns,
285
- projection,
286
- n_rows,
287
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
288
- memory_map
289
- )
290
- )
291
- end
292
-
293
- # @private
294
- def self._read_json(file)
295
- if Utils.pathlike?(file)
296
- file = Utils.normalise_filepath(file)
297
- end
298
-
299
- _from_rbdf(RbDataFrame.read_json(file))
300
- end
301
-
302
- # @private
303
- def self._read_ndjson(file)
304
- if Utils.pathlike?(file)
305
- file = Utils.normalise_filepath(file)
306
- end
307
-
308
- _from_rbdf(RbDataFrame.read_ndjson(file))
309
- end
310
-
311
49
  # Get the shape of the DataFrame.
312
50
  #
313
51
  # @return [Array]
@@ -416,6 +154,13 @@ module Polars
416
154
  _df.dtypes
417
155
  end
418
156
 
157
+ # Get flags that are set on the columns of this DataFrame.
158
+ #
159
+ # @return [Hash]
160
+ def flags
161
+ columns.to_h { |name| [name, self[name].flags] }
162
+ end
163
+
419
164
  # Get the schema.
420
165
  #
421
166
  # @return [Hash]
@@ -814,8 +559,6 @@ module Polars
814
559
 
815
560
  # Serialize to JSON representation.
816
561
  #
817
- # @return [nil]
818
- #
819
562
  # @param file [String]
820
563
  # File path to which the result should be written.
821
564
  # @param pretty [Boolean]
@@ -823,17 +566,45 @@ module Polars
823
566
  # @param row_oriented [Boolean]
824
567
  # Write to row oriented json. This is slower, but more common.
825
568
  #
826
- # @see #write_ndjson
569
+ # @return [String, nil]
570
+ #
571
+ # @example
572
+ # df = Polars::DataFrame.new(
573
+ # {
574
+ # "foo" => [1, 2, 3],
575
+ # "bar" => [6, 7, 8]
576
+ # }
577
+ # )
578
+ # df.write_json
579
+ # # => "{\"columns\":[{\"name\":\"foo\",\"datatype\":\"Int64\",\"bit_settings\":\"\",\"values\":[1,2,3]},{\"name\":\"bar\",\"datatype\":\"Int64\",\"bit_settings\":\"\",\"values\":[6,7,8]}]}"
580
+ #
581
+ # @example
582
+ # df.write_json(row_oriented: true)
583
+ # # => "[{\"foo\":1,\"bar\":6},{\"foo\":2,\"bar\":7},{\"foo\":3,\"bar\":8}]"
827
584
  def write_json(
828
- file,
585
+ file = nil,
829
586
  pretty: false,
830
587
  row_oriented: false
831
588
  )
832
589
  if Utils.pathlike?(file)
833
- file = Utils.normalise_filepath(file)
590
+ file = Utils.normalize_filepath(file)
591
+ end
592
+ to_string_io = !file.nil? && file.is_a?(StringIO)
593
+ if file.nil? || to_string_io
594
+ buf = StringIO.new
595
+ buf.set_encoding(Encoding::BINARY)
596
+ _df.write_json(buf, pretty, row_oriented)
597
+ json_bytes = buf.string
598
+
599
+ json_str = json_bytes.force_encoding(Encoding::UTF_8)
600
+ if to_string_io
601
+ file.write(json_str)
602
+ else
603
+ return json_str
604
+ end
605
+ else
606
+ _df.write_json(file, pretty, row_oriented)
834
607
  end
835
-
836
- _df.write_json(file, pretty, row_oriented)
837
608
  nil
838
609
  end
839
610
 
@@ -843,12 +614,36 @@ module Polars
843
614
  # File path to which the result should be written.
844
615
  #
845
616
  # @return [String, nil]
846
- def write_ndjson(file)
617
+ #
618
+ # @example
619
+ # df = Polars::DataFrame.new(
620
+ # {
621
+ # "foo" => [1, 2, 3],
622
+ # "bar" => [6, 7, 8]
623
+ # }
624
+ # )
625
+ # df.write_ndjson()
626
+ # # => "{\"foo\":1,\"bar\":6}\n{\"foo\":2,\"bar\":7}\n{\"foo\":3,\"bar\":8}\n"
627
+ def write_ndjson(file = nil)
847
628
  if Utils.pathlike?(file)
848
- file = Utils.normalise_filepath(file)
629
+ file = Utils.normalize_filepath(file)
630
+ end
631
+ to_string_io = !file.nil? && file.is_a?(StringIO)
632
+ if file.nil? || to_string_io
633
+ buf = StringIO.new
634
+ buf.set_encoding(Encoding::BINARY)
635
+ _df.write_ndjson(buf)
636
+ json_bytes = buf.string
637
+
638
+ json_str = json_bytes.force_encoding(Encoding::UTF_8)
639
+ if to_string_io
640
+ file.write(json_str)
641
+ else
642
+ return json_str
643
+ end
644
+ else
645
+ _df.write_ndjson(file)
849
646
  end
850
-
851
- _df.write_ndjson(file)
852
647
  nil
853
648
  end
854
649
 
@@ -938,7 +733,7 @@ module Polars
938
733
  end
939
734
 
940
735
  if Utils.pathlike?(file)
941
- file = Utils.normalise_filepath(file)
736
+ file = Utils.normalize_filepath(file)
942
737
  end
943
738
 
944
739
  _df.write_csv(
@@ -976,7 +771,7 @@ module Polars
976
771
  compression = "uncompressed"
977
772
  end
978
773
  if Utils.pathlike?(file)
979
- file = Utils.normalise_filepath(file)
774
+ file = Utils.normalize_filepath(file)
980
775
  end
981
776
 
982
777
  _df.write_avro(file, compression)
@@ -997,7 +792,7 @@ module Polars
997
792
  file.set_encoding(Encoding::BINARY)
998
793
  end
999
794
  if Utils.pathlike?(file)
1000
- file = Utils.normalise_filepath(file)
795
+ file = Utils.normalize_filepath(file)
1001
796
  end
1002
797
 
1003
798
  if compression.nil?
@@ -1008,9 +803,50 @@ module Polars
1008
803
  return_bytes ? file.string : nil
1009
804
  end
1010
805
 
806
+ # Write to Arrow IPC record batch stream.
807
+ #
808
+ # See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html.
809
+ #
810
+ # @param file [Object]
811
+ # Path or writable file-like object to which the IPC record batch data will
812
+ # be written. If set to `nil`, the output is returned as a binary String.
813
+ # @param compression ['uncompressed', 'lz4', 'zstd']
814
+ # Compression method. Defaults to "uncompressed".
815
+ #
816
+ # @return [String, nil]
817
+ #
818
+ # @example
819
+ # df = Polars::DataFrame.new(
820
+ # {
821
+ # "foo" => [1, 2, 3, 4, 5],
822
+ # "bar" => [6, 7, 8, 9, 10],
823
+ # "ham" => ["a", "b", "c", "d", "e"]
824
+ # }
825
+ # )
826
+ # df.write_ipc_stream("new_file.arrow")
827
+ def write_ipc_stream(
828
+ file,
829
+ compression: "uncompressed"
830
+ )
831
+ return_bytes = file.nil?
832
+ if return_bytes
833
+ file = StringIO.new
834
+ file.set_encoding(Encoding::BINARY)
835
+ elsif Utils.pathlike?(file)
836
+ file = Utils.normalize_filepath(file)
837
+ end
838
+
839
+ if compression.nil?
840
+ compression = "uncompressed"
841
+ end
842
+
843
+ _df.write_ipc_stream(file, compression)
844
+ return_bytes ? file.string : nil
845
+ end
846
+
1011
847
  # Write to Apache Parquet file.
1012
848
  #
1013
- # @param file [String]
849
+ # @param file [String, Pathname, StringIO]
1014
850
  # File path to which the file should be written.
1015
851
  # @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
1016
852
  # Choose "zstd" for good compression performance.
@@ -1027,10 +863,9 @@ module Polars
1027
863
  # @param statistics [Boolean]
1028
864
  # Write statistics to the parquet headers. This requires extra compute.
1029
865
  # @param row_group_size [Integer, nil]
1030
- # Size of the row groups in number of rows.
1031
- # If `nil` (default), the chunks of the DataFrame are
1032
- # used. Writing in smaller chunks may reduce memory pressure and improve
1033
- # writing speeds.
866
+ # Size of the row groups in number of rows. Defaults to 512^2 rows.
867
+ # @param data_page_size [Integer, nil]
868
+ # Size of the data page in bytes. Defaults to 1024^2 bytes.
1034
869
  #
1035
870
  # @return [nil]
1036
871
  def write_parquet(
@@ -1038,17 +873,18 @@ module Polars
1038
873
  compression: "zstd",
1039
874
  compression_level: nil,
1040
875
  statistics: false,
1041
- row_group_size: nil
876
+ row_group_size: nil,
877
+ data_page_size: nil
1042
878
  )
1043
879
  if compression.nil?
1044
880
  compression = "uncompressed"
1045
881
  end
1046
882
  if Utils.pathlike?(file)
1047
- file = Utils.normalise_filepath(file)
883
+ file = Utils.normalize_filepath(file)
1048
884
  end
1049
885
 
1050
886
  _df.write_parquet(
1051
- file, compression, compression_level, statistics, row_group_size
887
+ file, compression, compression_level, statistics, row_group_size, data_page_size
1052
888
  )
1053
889
  end
1054
890
 
@@ -1084,7 +920,7 @@ module Polars
1084
920
  # df.estimated_size
1085
921
  # # => 25888898
1086
922
  # df.estimated_size("mb")
1087
- # # => 26.702880859375
923
+ # # => 17.0601749420166
1088
924
  def estimated_size(unit = "b")
1089
925
  sz = _df.estimated_size
1090
926
  Utils.scale_bytes(sz, to: unit)
@@ -1720,10 +1556,7 @@ module Polars
1720
1556
  # # │ 3 ┆ 8 ┆ c │
1721
1557
  # # └─────┴─────┴─────┘
1722
1558
  def drop_nulls(subset: nil)
1723
- if subset.is_a?(::String)
1724
- subset = [subset]
1725
- end
1726
- _from_rbdf(_df.drop_nulls(subset))
1559
+ lazy.drop_nulls(subset: subset).collect(_eager: true)
1727
1560
  end
1728
1561
 
1729
1562
  # Offers a structured way to apply a sequence of user-defined functions (UDFs).
@@ -1785,16 +1618,16 @@ module Polars
1785
1618
  # df.with_row_index
1786
1619
  # # =>
1787
1620
  # # shape: (3, 3)
1788
- # # ┌────────┬─────┬─────┐
1789
- # # │ row_nr ┆ a ┆ b │
1790
- # # │ --- ┆ --- ┆ --- │
1791
- # # │ u32 ┆ i64 ┆ i64 │
1792
- # # ╞════════╪═════╪═════╡
1793
- # # │ 0 ┆ 1 ┆ 2 │
1794
- # # │ 1 ┆ 3 ┆ 4 │
1795
- # # │ 2 ┆ 5 ┆ 6 │
1796
- # # └────────┴─────┴─────┘
1797
- def with_row_index(name: "row_nr", offset: 0)
1621
+ # # ┌───────┬─────┬─────┐
1622
+ # # │ index ┆ a ┆ b │
1623
+ # # │ --- ┆ --- ┆ --- │
1624
+ # # │ u32 ┆ i64 ┆ i64 │
1625
+ # # ╞═══════╪═════╪═════╡
1626
+ # # │ 0 ┆ 1 ┆ 2 │
1627
+ # # │ 1 ┆ 3 ┆ 4 │
1628
+ # # │ 2 ┆ 5 ┆ 6 │
1629
+ # # └───────┴─────┴─────┘
1630
+ def with_row_index(name: "index", offset: 0)
1798
1631
  _from_rbdf(_df.with_row_index(name, offset))
1799
1632
  end
1800
1633
  alias_method :with_row_count, :with_row_index
@@ -2083,16 +1916,16 @@ module Polars
2083
1916
  # )
2084
1917
  # # =>
2085
1918
  # # shape: (4, 3)
2086
- # # ┌─────────────────────┬────────────┬───────────────────────────────────┐
2087
- # # │ time ┆ time_count ┆ time_agg_list
2088
- # # │ --- ┆ --- ┆ ---
2089
- # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
2090
- # # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
2091
- # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16…
2092
- # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16…
2093
- # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16…
2094
- # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
2095
- # # └─────────────────────┴────────────┴───────────────────────────────────┘
1919
+ # # ┌─────────────────────┬────────────┬─────────────────────────────────┐
1920
+ # # │ time ┆ time_count ┆ time_agg_list
1921
+ # # │ --- ┆ --- ┆ ---
1922
+ # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
1923
+ # # ╞═════════════════════╪════════════╪═════════════════════════════════╡
1924
+ # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-…
1925
+ # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-…
1926
+ # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-…
1927
+ # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
1928
+ # # └─────────────────────┴────────────┴─────────────────────────────────┘
2096
1929
  #
2097
1930
  # @example When closed="both" the time values at the window boundaries belong to 2 groups.
2098
1931
  # df.group_by_dynamic("time", every: "1h", closed: "both").agg(
@@ -2161,12 +1994,13 @@ module Polars
2161
1994
  # closed: "right"
2162
1995
  # ).agg(Polars.col("A").alias("A_agg_list"))
2163
1996
  # # =>
2164
- # # shape: (3, 4)
1997
+ # # shape: (4, 4)
2165
1998
  # # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
2166
1999
  # # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
2167
2000
  # # │ --- ┆ --- ┆ --- ┆ --- │
2168
2001
  # # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
2169
2002
  # # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
2003
+ # # │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │
2170
2004
  # # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
2171
2005
  # # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
2172
2006
  # # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
@@ -2566,7 +2400,7 @@ module Polars
2566
2400
  # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})
2567
2401
  #
2568
2402
  # @example Return a DataFrame by mapping each row to a tuple:
2569
- # df.apply { |t| [t[0] * 2, t[1] * 3] }
2403
+ # df.map_rows { |t| [t[0] * 2, t[1] * 3] }
2570
2404
  # # =>
2571
2405
  # # shape: (3, 2)
2572
2406
  # # ┌──────────┬──────────┐
@@ -2580,7 +2414,7 @@ module Polars
2580
2414
  # # └──────────┴──────────┘
2581
2415
  #
2582
2416
  # @example Return a Series by mapping each row to a scalar:
2583
- # df.apply { |t| t[0] * 2 + t[1] }
2417
+ # df.map_rows { |t| t[0] * 2 + t[1] }
2584
2418
  # # =>
2585
2419
  # # shape: (3, 1)
2586
2420
  # # ┌───────┐
@@ -2592,14 +2426,15 @@ module Polars
2592
2426
  # # │ 9 │
2593
2427
  # # │ 14 │
2594
2428
  # # └───────┘
2595
- def apply(return_dtype: nil, inference_size: 256, &f)
2596
- out, is_df = _df.apply(f, return_dtype, inference_size)
2429
+ def map_rows(return_dtype: nil, inference_size: 256, &f)
2430
+ out, is_df = _df.map_rows(f, return_dtype, inference_size)
2597
2431
  if is_df
2598
2432
  _from_rbdf(out)
2599
2433
  else
2600
2434
  _from_rbdf(Utils.wrap_s(out).to_frame._df)
2601
2435
  end
2602
2436
  end
2437
+ alias_method :apply, :map_rows
2603
2438
 
2604
2439
  # Return a new DataFrame with the column added or replaced.
2605
2440
  #
@@ -2621,26 +2456,26 @@ module Polars
2621
2456
  # # ┌─────┬─────┬───────────┐
2622
2457
  # # │ a ┆ b ┆ b_squared │
2623
2458
  # # │ --- ┆ --- ┆ --- │
2624
- # # │ i64 ┆ i64 ┆ f64
2459
+ # # │ i64 ┆ i64 ┆ i64
2625
2460
  # # ╞═════╪═════╪═══════════╡
2626
- # # │ 1 ┆ 2 ┆ 4.0
2627
- # # │ 3 ┆ 4 ┆ 16.0
2628
- # # │ 5 ┆ 6 ┆ 36.0
2461
+ # # │ 1 ┆ 2 ┆ 4
2462
+ # # │ 3 ┆ 4 ┆ 16
2463
+ # # │ 5 ┆ 6 ┆ 36
2629
2464
  # # └─────┴─────┴───────────┘
2630
2465
  #
2631
2466
  # @example Replaced
2632
2467
  # df.with_column(Polars.col("a") ** 2)
2633
2468
  # # =>
2634
2469
  # # shape: (3, 2)
2635
- # # ┌──────┬─────┐
2636
- # # │ a ┆ b │
2637
- # # │ --- ┆ --- │
2638
- # # │ f64 ┆ i64 │
2639
- # # ╞══════╪═════╡
2640
- # # │ 1.0 ┆ 2 │
2641
- # # │ 9.0 ┆ 4 │
2642
- # # │ 25.0 ┆ 6 │
2643
- # # └──────┴─────┘
2470
+ # # ┌─────┬─────┐
2471
+ # # │ a ┆ b │
2472
+ # # │ --- ┆ --- │
2473
+ # # │ i64 ┆ i64 │
2474
+ # # ╞═════╪═════╡
2475
+ # # │ 1 ┆ 2 │
2476
+ # # │ 9 ┆ 4 │
2477
+ # # │ 25 ┆ 6 │
2478
+ # # └─────┴─────┘
2644
2479
  def with_column(column)
2645
2480
  lazy
2646
2481
  .with_column(column)
@@ -2807,16 +2642,36 @@ module Polars
2807
2642
  # # │ 2 ┆ 7.0 │
2808
2643
  # # │ 3 ┆ 8.0 │
2809
2644
  # # └─────┴─────┘
2810
- def drop(columns)
2811
- if columns.is_a?(::Array)
2812
- df = clone
2813
- columns.each do |n|
2814
- df._df.drop_in_place(n)
2815
- end
2816
- df
2817
- else
2818
- _from_rbdf(_df.drop(columns))
2819
- end
2645
+ #
2646
+ # @example Drop multiple columns by passing a list of column names.
2647
+ # df.drop(["bar", "ham"])
2648
+ # # =>
2649
+ # # shape: (3, 1)
2650
+ # # ┌─────┐
2651
+ # # │ foo │
2652
+ # # │ --- │
2653
+ # # │ i64 │
2654
+ # # ╞═════╡
2655
+ # # │ 1 │
2656
+ # # │ 2 │
2657
+ # # │ 3 │
2658
+ # # └─────┘
2659
+ #
2660
+ # @example Use positional arguments to drop multiple columns.
2661
+ # df.drop("foo", "ham")
2662
+ # # =>
2663
+ # # shape: (3, 1)
2664
+ # # ┌─────┐
2665
+ # # │ bar │
2666
+ # # │ --- │
2667
+ # # │ f64 │
2668
+ # # ╞═════╡
2669
+ # # │ 6.0 │
2670
+ # # │ 7.0 │
2671
+ # # │ 8.0 │
2672
+ # # └─────┘
2673
+ def drop(*columns)
2674
+ lazy.drop(*columns).collect(_eager: true)
2820
2675
  end
2821
2676
 
2822
2677
  # Drop in place.
@@ -3700,7 +3555,7 @@ module Polars
3700
3555
  # # ┌─────────┐
3701
3556
  # # │ literal │
3702
3557
  # # │ --- │
3703
- # # │ i64
3558
+ # # │ i32
3704
3559
  # # ╞═════════╡
3705
3560
  # # │ 0 │
3706
3561
  # # │ 0 │
@@ -3735,16 +3590,16 @@ module Polars
3735
3590
  # df.with_columns((Polars.col("a") ** 2).alias("a^2"))
3736
3591
  # # =>
3737
3592
  # # shape: (4, 4)
3738
- # # ┌─────┬──────┬───────┬──────┐
3739
- # # │ a ┆ b ┆ c ┆ a^2
3740
- # # │ --- ┆ --- ┆ --- ┆ ---
3741
- # # │ i64 ┆ f64 ┆ bool ┆ f64
3742
- # # ╞═════╪══════╪═══════╪══════╡
3743
- # # │ 1 ┆ 0.5 ┆ true ┆ 1.0
3744
- # # │ 2 ┆ 4.0 ┆ true ┆ 4.0
3745
- # # │ 3 ┆ 10.0 ┆ false ┆ 9.0
3746
- # # │ 4 ┆ 13.0 ┆ true ┆ 16.0
3747
- # # └─────┴──────┴───────┴──────┘
3593
+ # # ┌─────┬──────┬───────┬─────┐
3594
+ # # │ a ┆ b ┆ c ┆ a^2
3595
+ # # │ --- ┆ --- ┆ --- ┆ ---
3596
+ # # │ i64 ┆ f64 ┆ bool ┆ i64
3597
+ # # ╞═════╪══════╪═══════╪═════╡
3598
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1
3599
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4
3600
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9
3601
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16
3602
+ # # └─────┴──────┴───────┴─────┘
3748
3603
  #
3749
3604
  # @example Added columns will replace existing columns with the same name.
3750
3605
  # df.with_columns(Polars.col("a").cast(Polars::Float64))
@@ -3771,16 +3626,16 @@ module Polars
3771
3626
  # )
3772
3627
  # # =>
3773
3628
  # # shape: (4, 6)
3774
- # # ┌─────┬──────┬───────┬──────┬──────┬───────┐
3775
- # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
3776
- # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3777
- # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
3778
- # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
3779
- # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
3780
- # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
3781
- # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
3782
- # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
3783
- # # └─────┴──────┴───────┴──────┴──────┴───────┘
3629
+ # # ┌─────┬──────┬───────┬─────┬──────┬───────┐
3630
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
3631
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3632
+ # # │ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │
3633
+ # # ╞═════╪══════╪═══════╪═════╪══════╪═══════╡
3634
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │
3635
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │
3636
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │
3637
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │
3638
+ # # └─────┴──────┴───────┴─────┴──────┴───────┘
3784
3639
  #
3785
3640
  # @example Multiple columns also can be added using positional arguments instead of a list.
3786
3641
  # df.with_columns(
@@ -3790,16 +3645,16 @@ module Polars
3790
3645
  # )
3791
3646
  # # =>
3792
3647
  # # shape: (4, 6)
3793
- # # ┌─────┬──────┬───────┬──────┬──────┬───────┐
3794
- # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
3795
- # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3796
- # # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
3797
- # # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
3798
- # # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
3799
- # # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
3800
- # # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
3801
- # # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
3802
- # # └─────┴──────┴───────┴──────┴──────┴───────┘
3648
+ # # ┌─────┬──────┬───────┬─────┬──────┬───────┐
3649
+ # # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
3650
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
3651
+ # # │ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │
3652
+ # # ╞═════╪══════╪═══════╪═════╪══════╪═══════╡
3653
+ # # │ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │
3654
+ # # │ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │
3655
+ # # │ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │
3656
+ # # │ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │
3657
+ # # └─────┴──────┴───────┴─────┴──────┴───────┘
3803
3658
  #
3804
3659
  # @example Use keyword arguments to easily name your expression inputs.
3805
3660
  # df.with_columns(
@@ -5181,7 +5036,7 @@ module Polars
5181
5036
  elsif data[0].is_a?(Hash)
5182
5037
  column_names, dtypes = _unpack_schema(columns)
5183
5038
  schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
5184
- rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
5039
+ rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, false, infer_schema_length)
5185
5040
  if column_names
5186
5041
  rbdf = _post_apply_columns(rbdf, column_names)
5187
5042
  end
@@ -5215,7 +5070,7 @@ module Polars
5215
5070
  if unpack_nested
5216
5071
  raise Todo
5217
5072
  else
5218
- rbdf = RbDataFrame.read_rows(
5073
+ rbdf = RbDataFrame.from_rows(
5219
5074
  data,
5220
5075
  infer_schema_length,
5221
5076
  local_schema_override.any? ? local_schema_override : nil