polars-df 0.10.0-arm64-darwin → 0.11.0-arm64-darwin

This diff shows the changes between publicly available package versions released to one of the supported registries. The information it contains is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
@@ -46,271 +46,6 @@ module Polars
  df
  end
 
- # @private
- def self._from_hashes(data, infer_schema_length: 100, schema: nil, schema_overrides: nil)
- rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
- _from_rbdf(rbdf)
- end
-
- # @private
- def self._from_hash(data, schema: nil, schema_overrides: nil)
- _from_rbdf(hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides))
- end
-
- # def self._from_records
- # end
-
- # def self._from_numo
- # end
-
- # no self._from_arrow
-
- # no self._from_pandas
-
- # @private
- def self._read_csv(
- file,
- has_header: true,
- columns: nil,
- sep: str = ",",
- comment_char: nil,
- quote_char: '"',
- skip_rows: 0,
- dtypes: nil,
- null_values: nil,
- ignore_errors: false,
- parse_dates: false,
- n_threads: nil,
- infer_schema_length: 100,
- batch_size: 8192,
- n_rows: nil,
- encoding: "utf8",
- low_memory: false,
- rechunk: true,
- skip_rows_after_header: 0,
- row_count_name: nil,
- row_count_offset: 0,
- sample_size: 1024,
- eol_char: "\n",
- truncate_ragged_lines: false
- )
- if Utils.pathlike?(file)
- path = Utils.normalise_filepath(file)
- else
- path = nil
- # if defined?(StringIO) && file.is_a?(StringIO)
- # file = file.string
- # end
- end
-
- dtype_list = nil
- dtype_slice = nil
- if !dtypes.nil?
- if dtypes.is_a?(Hash)
- dtype_list = []
- dtypes.each do|k, v|
- dtype_list << [k, Utils.rb_type_to_dtype(v)]
- end
- elsif dtypes.is_a?(::Array)
- dtype_slice = dtypes
- else
- raise ArgumentError, "dtype arg should be list or dict"
- end
- end
-
- processed_null_values = Utils._process_null_values(null_values)
-
- if columns.is_a?(::String)
- columns = [columns]
- end
- if file.is_a?(::String) && file.include?("*")
- dtypes_dict = nil
- if !dtype_list.nil?
- dtypes_dict = dtype_list.to_h
- end
- if !dtype_slice.nil?
- raise ArgumentError, "cannot use glob patterns and unnamed dtypes as `dtypes` argument; Use dtypes: Mapping[str, Type[DataType]"
- end
- scan = Polars.scan_csv(
- file,
- has_header: has_header,
- sep: sep,
- comment_char: comment_char,
- quote_char: quote_char,
- skip_rows: skip_rows,
- dtypes: dtypes_dict,
- null_values: null_values,
- ignore_errors: ignore_errors,
- infer_schema_length: infer_schema_length,
- n_rows: n_rows,
- low_memory: low_memory,
- rechunk: rechunk,
- skip_rows_after_header: skip_rows_after_header,
- row_count_name: row_count_name,
- row_count_offset: row_count_offset,
- eol_char: eol_char,
- truncate_ragged_lines: truncate_ragged_lines
- )
- if columns.nil?
- return _from_rbdf(scan.collect._df)
- elsif is_str_sequence(columns, allow_str: false)
- return _from_rbdf(scan.select(columns).collect._df)
- else
- raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: List[str]"
- end
- end
-
- projection, columns = Utils.handle_projection_columns(columns)
-
- _from_rbdf(
- RbDataFrame.read_csv(
- file,
- infer_schema_length,
- batch_size,
- has_header,
- ignore_errors,
- n_rows,
- skip_rows,
- projection,
- sep,
- rechunk,
- columns,
- encoding,
- n_threads,
- path,
- dtype_list,
- dtype_slice,
- low_memory,
- comment_char,
- quote_char,
- processed_null_values,
- parse_dates,
- skip_rows_after_header,
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
- sample_size,
- eol_char,
- truncate_ragged_lines
- )
- )
- end
-
- # @private
- def self._read_parquet(
- source,
- columns: nil,
- n_rows: nil,
- parallel: "auto",
- row_count_name: nil,
- row_count_offset: 0,
- low_memory: false,
- use_statistics: true,
- rechunk: true
- )
- if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
- end
- if columns.is_a?(::String)
- columns = [columns]
- end
-
- if source.is_a?(::String) && source.include?("*") && Utils.local_file?(source)
- scan =
- Polars.scan_parquet(
- source,
- n_rows: n_rows,
- rechunk: true,
- parallel: parallel,
- row_count_name: row_count_name,
- row_count_offset: row_count_offset,
- low_memory: low_memory
- )
-
- if columns.nil?
- return self._from_rbdf(scan.collect._df)
- elsif Utils.is_str_sequence(columns, allow_str: false)
- return self._from_rbdf(scan.select(columns).collect._df)
- else
- raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: Array[String]"
- end
- end
-
- projection, columns = Utils.handle_projection_columns(columns)
- _from_rbdf(
- RbDataFrame.read_parquet(
- source,
- columns,
- projection,
- n_rows,
- parallel,
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
- low_memory,
- use_statistics,
- rechunk
- )
- )
- end
-
- # @private
- def self._read_avro(file, columns: nil, n_rows: nil)
- if Utils.pathlike?(file)
- file = Utils.normalise_filepath(file)
- end
- projection, columns = Utils.handle_projection_columns(columns)
- _from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
- end
-
- # @private
- def self._read_ipc(
- file,
- columns: nil,
- n_rows: nil,
- row_count_name: nil,
- row_count_offset: 0,
- rechunk: true,
- memory_map: true
- )
- if Utils.pathlike?(file)
- file = Utils.normalise_filepath(file)
- end
- if columns.is_a?(::String)
- columns = [columns]
- end
-
- if file.is_a?(::String) && file.include?("*")
- raise Todo
- end
-
- projection, columns = Utils.handle_projection_columns(columns)
- _from_rbdf(
- RbDataFrame.read_ipc(
- file,
- columns,
- projection,
- n_rows,
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
- memory_map
- )
- )
- end
-
- # @private
- def self._read_json(file)
- if Utils.pathlike?(file)
- file = Utils.normalise_filepath(file)
- end
-
- _from_rbdf(RbDataFrame.read_json(file))
- end
-
- # @private
- def self._read_ndjson(file)
- if Utils.pathlike?(file)
- file = Utils.normalise_filepath(file)
- end
-
- _from_rbdf(RbDataFrame.read_ndjson(file))
- end
-
  # Get the shape of the DataFrame.
  #
  # @return [Array]
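
Note: everything removed in this hunk was marked @private. For orientation, a brief sketch of the public top-level readers that are assumed (they are not part of this diff) to remain the supported way to load files into a DataFrame; the file names are placeholders:

require "polars-df"

df_csv     = Polars.read_csv("data.csv")
df_parquet = Polars.read_parquet("data.parquet")
df_ndjson  = Polars.read_ndjson("data.ndjson")
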
@@ -419,6 +154,13 @@ module Polars
  _df.dtypes
  end
 
+ # Get flags that are set on the columns of this DataFrame.
+ #
+ # @return [Hash]
+ def flags
+ columns.to_h { |name| [name, self[name].flags] }
+ end
+
  # Get the schema.
  #
  # @return [Hash]
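
A minimal usage sketch of the new DataFrame#flags accessor. The exact keys of each per-column hash are an assumption based on Series#flags in upstream Polars (sortedness flags); the output is illustrative only:

df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [3, 2, 1]})
df.flags
# => {"a" => {"SORTED_ASC" => false, "SORTED_DESC" => false},
#     "b" => {"SORTED_ASC" => false, "SORTED_DESC" => false}}
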
@@ -845,7 +587,7 @@ module Polars
  row_oriented: false
  )
  if Utils.pathlike?(file)
- file = Utils.normalise_filepath(file)
+ file = Utils.normalize_filepath(file)
  end
  to_string_io = !file.nil? && file.is_a?(StringIO)
  if file.nil? || to_string_io
@@ -884,7 +626,7 @@ module Polars
  # # => "{\"foo\":1,\"bar\":6}\n{\"foo\":2,\"bar\":7}\n{\"foo\":3,\"bar\":8}\n"
  def write_ndjson(file = nil)
  if Utils.pathlike?(file)
- file = Utils.normalise_filepath(file)
+ file = Utils.normalize_filepath(file)
  end
  to_string_io = !file.nil? && file.is_a?(StringIO)
  if file.nil? || to_string_io
@@ -991,7 +733,7 @@ module Polars
  end
 
  if Utils.pathlike?(file)
- file = Utils.normalise_filepath(file)
+ file = Utils.normalize_filepath(file)
  end
 
  _df.write_csv(
@@ -1029,7 +771,7 @@ module Polars
  compression = "uncompressed"
  end
  if Utils.pathlike?(file)
- file = Utils.normalise_filepath(file)
+ file = Utils.normalize_filepath(file)
  end
 
  _df.write_avro(file, compression)
@@ -1050,7 +792,7 @@ module Polars
  file.set_encoding(Encoding::BINARY)
  end
  if Utils.pathlike?(file)
- file = Utils.normalise_filepath(file)
+ file = Utils.normalize_filepath(file)
  end
 
  if compression.nil?
@@ -1061,6 +803,47 @@ module Polars
  return_bytes ? file.string : nil
  end
 
+ # Write to Arrow IPC record batch stream.
+ #
+ # See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html.
+ #
+ # @param file [Object]
+ # Path or writable file-like object to which the IPC record batch data will
+ # be written. If set to `None`, the output is returned as a BytesIO object.
+ # @param compression ['uncompressed', 'lz4', 'zstd']
+ # Compression method. Defaults to "uncompressed".
+ #
+ # @return [Object]
+ #
+ # @example
+ # df = Polars::DataFrame.new(
+ # {
+ # "foo" => [1, 2, 3, 4, 5],
+ # "bar" => [6, 7, 8, 9, 10],
+ # "ham" => ["a", "b", "c", "d", "e"]
+ # }
+ # )
+ # df.write_ipc_stream("new_file.arrow")
+ def write_ipc_stream(
+ file,
+ compression: "uncompressed"
+ )
+ return_bytes = file.nil?
+ if return_bytes
+ file = StringIO.new
+ file.set_encoding(Encoding::BINARY)
+ elsif Utils.pathlike?(file)
+ file = Utils.normalize_filepath(file)
+ end
+
+ if compression.nil?
+ compression = "uncompressed"
+ end
+
+ _df.write_ipc_stream(file, compression)
+ return_bytes ? file.string : nil
+ end
+
  # Write to Apache Parquet file.
  #
  # @param file [String, Pathname, StringIO]
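
As the implementation above shows, passing nil makes write_ipc_stream buffer into a binary StringIO and return the encoded string; the file name and compression choice below are arbitrary:

df = Polars::DataFrame.new({"foo" => [1, 2, 3]})
bytes = df.write_ipc_stream(nil)                           # Arrow IPC stream as a binary String
df.write_ipc_stream("frame.arrows", compression: "zstd")   # or write straight to a path
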
@@ -1097,7 +880,7 @@ module Polars
  compression = "uncompressed"
  end
  if Utils.pathlike?(file)
- file = Utils.normalise_filepath(file)
+ file = Utils.normalize_filepath(file)
  end
 
  _df.write_parquet(
@@ -1773,10 +1556,7 @@ module Polars
  # # │ 3   ┆ 8   ┆ c   │
  # # └─────┴─────┴─────┘
  def drop_nulls(subset: nil)
- if subset.is_a?(::String)
- subset = [subset]
- end
- _from_rbdf(_df.drop_nulls(subset))
+ lazy.drop_nulls(subset: subset).collect(_eager: true)
  end
 
  # Offers a structured way to apply a sequence of user-defined functions (UDFs).
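
drop_nulls now routes through the lazy engine; the caller-facing behaviour is assumed unchanged, e.g.:

df = Polars::DataFrame.new({"a" => [1, nil, 3], "b" => ["x", "y", nil]})
df.drop_nulls               # drop rows containing any null value
df.drop_nulls(subset: "a")  # a single column name is assumed to still be accepted
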
@@ -1838,16 +1618,16 @@ module Polars
  # df.with_row_index
  # # =>
  # # shape: (3, 3)
- # # ┌────────┬─────┬─────┐
- # # │ row_nr ┆ a   ┆ b   │
- # # │ ---    ┆ --- ┆ --- │
- # # │ u32    ┆ i64 ┆ i64 │
- # # ╞════════╪═════╪═════╡
- # # │ 0      ┆ 1   ┆ 2   │
- # # │ 1      ┆ 3   ┆ 4   │
- # # │ 2      ┆ 5   ┆ 6   │
- # # └────────┴─────┴─────┘
- def with_row_index(name: "row_nr", offset: 0)
+ # # ┌───────┬─────┬─────┐
+ # # │ index ┆ a   ┆ b   │
+ # # │ ---   ┆ --- ┆ --- │
+ # # │ u32   ┆ i64 ┆ i64 │
+ # # ╞═══════╪═════╪═════╡
+ # # │ 0     ┆ 1   ┆ 2   │
+ # # │ 1     ┆ 3   ┆ 4   │
+ # # │ 2     ┆ 5   ┆ 6   │
+ # # └───────┴─────┴─────┘
+ def with_row_index(name: "index", offset: 0)
  _from_rbdf(_df.with_row_index(name, offset))
  end
  alias_method :with_row_count, :with_row_index
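
The default index column name changes from "row_nr" to "index"; code that relied on the old name can pass it explicitly (a small sketch):

df = Polars::DataFrame.new({"a" => [1, 3, 5], "b" => [2, 4, 6]})
df.with_row_index                  # adds a u32 "index" column
df.with_row_index(name: "row_nr")  # keep the pre-0.11 column name
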
@@ -2136,16 +1916,16 @@ module Polars
  # )
  # # =>
  # # shape: (4, 3)
- # # ┌─────────────────────┬────────────┬───────────────────────────────────┐
- # # │ time                ┆ time_count ┆ time_agg_list                     │
- # # │ ---                 ┆ ---        ┆ ---                               │
- # # │ datetime[μs]        ┆ u32        ┆ list[datetime[μs]]                │
- # # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
- # # │ 2021-12-16 00:00:00 ┆ 2          ┆ [2021-12-16 00:00:00, 2021-12-16… │
- # # │ 2021-12-16 01:00:00 ┆ 2          ┆ [2021-12-16 01:00:00, 2021-12-16… │
- # # │ 2021-12-16 02:00:00 ┆ 2          ┆ [2021-12-16 02:00:00, 2021-12-16… │
- # # │ 2021-12-16 03:00:00 ┆ 1          ┆ [2021-12-16 03:00:00]             │
- # # └─────────────────────┴────────────┴───────────────────────────────────┘
+ # # ┌─────────────────────┬────────────┬─────────────────────────────────┐
+ # # │ time                ┆ time_count ┆ time_agg_list                   │
+ # # │ ---                 ┆ ---        ┆ ---                             │
+ # # │ datetime[μs]        ┆ u32        ┆ list[datetime[μs]]              │
+ # # ╞═════════════════════╪════════════╪═════════════════════════════════╡
+ # # │ 2021-12-16 00:00:00 ┆ 2          ┆ [2021-12-16 00:00:00, 2021-12-… │
+ # # │ 2021-12-16 01:00:00 ┆ 2          ┆ [2021-12-16 01:00:00, 2021-12-… │
+ # # │ 2021-12-16 02:00:00 ┆ 2          ┆ [2021-12-16 02:00:00, 2021-12-… │
+ # # │ 2021-12-16 03:00:00 ┆ 1          ┆ [2021-12-16 03:00:00]           │
+ # # └─────────────────────┴────────────┴─────────────────────────────────┘
  #
  # @example When closed="both" the time values at the window boundaries belong to 2 groups.
  # df.group_by_dynamic("time", every: "1h", closed: "both").agg(
@@ -2620,7 +2400,7 @@ module Polars
  # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})
  #
  # @example Return a DataFrame by mapping each row to a tuple:
- # df.apply { |t| [t[0] * 2, t[1] * 3] }
+ # df.map_rows { |t| [t[0] * 2, t[1] * 3] }
  # # =>
  # # shape: (3, 2)
  # # ┌──────────┬──────────┐
@@ -2634,7 +2414,7 @@ module Polars
  # # └──────────┴──────────┘
  #
  # @example Return a Series by mapping each row to a scalar:
- # df.apply { |t| t[0] * 2 + t[1] }
+ # df.map_rows { |t| t[0] * 2 + t[1] }
  # # =>
  # # shape: (3, 1)
  # # ┌───────┐
@@ -2646,14 +2426,15 @@ module Polars
  # # │ 9     │
  # # │ 14    │
  # # └───────┘
- def apply(return_dtype: nil, inference_size: 256, &f)
- out, is_df = _df.apply(f, return_dtype, inference_size)
+ def map_rows(return_dtype: nil, inference_size: 256, &f)
+ out, is_df = _df.map_rows(f, return_dtype, inference_size)
  if is_df
  _from_rbdf(out)
  else
  _from_rbdf(Utils.wrap_s(out).to_frame._df)
  end
  end
+ alias_method :apply, :map_rows
 
  # Return a new DataFrame with the column added or replaced.
  #
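
apply is retained as an alias for the renamed map_rows, so both spellings below are expected to behave identically:

df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})
df.map_rows { |t| [t[0] * 2, t[1] * 3] }  # new name
df.apply { |t| t[0] * 2 + t[1] }          # still works via alias_method
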
@@ -3774,7 +3555,7 @@ module Polars
  # # ┌─────────┐
  # # │ literal │
  # # │ ---     │
- # # │ i64     │
+ # # │ i32     │
  # # ╞═════════╡
  # # │ 0       │
  # # │ 0       │
@@ -5255,7 +5036,7 @@ module Polars
  elsif data[0].is_a?(Hash)
  column_names, dtypes = _unpack_schema(columns)
  schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
- rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
+ rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, false, infer_schema_length)
  if column_names
  rbdf = _post_apply_columns(rbdf, column_names)
  end
@@ -5289,7 +5070,7 @@ module Polars
  if unpack_nested
  raise Todo
  else
- rbdf = RbDataFrame.read_rows(
+ rbdf = RbDataFrame.from_rows(
  data,
  infer_schema_length,
  local_schema_override.any? ? local_schema_override : nil
@@ -215,6 +215,7 @@ module Polars
  offset = "0ns"
  end
 
+ every = Utils.parse_as_expression(every, str_as_lit: true)
  Utils.wrap_expr(
  _rbexpr.dt_round(
  Utils._timedelta_to_pl_duration(every),
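
For orientation, a typical call that reaches this dt.round code path; the column name and interval are placeholders:

df = Polars::DataFrame.new({"ts" => [Time.utc(2024, 1, 1, 0, 37)]})
df.select(Polars.col("ts").dt.round("1h"))
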
@@ -66,6 +66,8 @@ module Polars
  if !out.nil?
  if s.dtype == Date
  return Utils._to_ruby_date(out.to_i)
+ elsif [Datetime, Duration, Time].include?(s.dtype)
+ return out
  else
  return Utils._to_ruby_datetime(out.to_i, s.time_unit)
  end
@@ -93,10 +95,12 @@ module Polars
  # # => 2001-01-02 00:00:00 UTC
  def mean
  s = Utils.wrap_s(_s)
- out = s.mean.to_i
+ out = s.mean
  if !out.nil?
  if s.dtype == Date
  return Utils._to_ruby_date(out.to_i)
+ elsif [Datetime, Duration, Time].include?(s.dtype)
+ return out
  else
  return Utils._to_ruby_datetime(out.to_i, s.time_unit)
  end
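
With the added branch, mean and median on Datetime, Duration and Time series return the computed value directly instead of truncating it through to_i first; a rough sketch (the .dt accessor usage and output are illustrative, mirroring the example in the doc comment above):

s = Polars::Series.new([Time.utc(2001, 1, 1), Time.utc(2001, 1, 3)])
s.dt.mean  # => 2001-01-02 00:00:00 UTC
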
@@ -32,7 +32,7 @@ module Polars
  @start_by = start_by
  end
 
- def agg(aggs)
+ def agg(*aggs, **named_aggs)
  @df.lazy
  .group_by_dynamic(
  @time_column,
@@ -45,7 +45,7 @@ module Polars
  by: @by,
  start_by: @start_by
  )
- .agg(aggs)
+ .agg(*aggs, **named_aggs)
  .collect(no_optimization: true, string_cache: false)
  end
  end
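
agg on the dynamic group-by helper now forwards both positional and keyword aggregations to the lazy API; keyword arguments are assumed to name the resulting columns, mirroring LazyGroupBy#agg (df as in the group_by_dynamic example earlier in this diff):

df.group_by_dynamic("time", every: "1h", closed: "right").agg(
  Polars.col("time").count.alias("time_count"),  # positional, as before
  time_max: Polars.col("time").max               # named aggregation (assumed)
)
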
@@ -3,6 +3,10 @@ module Polars
  # Base class for all Polars errors.
  class Error < StandardError; end
 
+ # @private
+ # Exception raised when an operation is not allowed (or possible) against a given object or data structure.
+ class InvalidOperationError < Error; end
+
  # @private
  # Exception raised when an unsupported testing assert is made.
  class InvalidAssert < Error; end
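
The new class inherits from Polars::Error, so existing blanket rescues keep working while more specific handling becomes possible; the operation inside the block is a placeholder:

begin
  # ... an operation the engine cannot perform on the given data ...
rescue Polars::InvalidOperationError => e
  warn "invalid operation: #{e.message}"
rescue Polars::Error => e
  warn "other Polars error: #{e.message}"
end
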