polars-df 0.10.0-x86_64-linux → 0.11.0-x86_64-linux

Sign up to get free protection for your applications and to get access to all the features.
@@ -46,271 +46,6 @@ module Polars
46
46
  df
47
47
  end
48
48
 
49
- # @private
50
- def self._from_hashes(data, infer_schema_length: 100, schema: nil, schema_overrides: nil)
51
- rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
52
- _from_rbdf(rbdf)
53
- end
54
-
55
- # @private
56
- def self._from_hash(data, schema: nil, schema_overrides: nil)
57
- _from_rbdf(hash_to_rbdf(data, schema: schema, schema_overrides: schema_overrides))
58
- end
59
-
60
- # def self._from_records
61
- # end
62
-
63
- # def self._from_numo
64
- # end
65
-
66
- # no self._from_arrow
67
-
68
- # no self._from_pandas
69
-
70
- # @private
71
- def self._read_csv(
72
- file,
73
- has_header: true,
74
- columns: nil,
75
- sep: str = ",",
76
- comment_char: nil,
77
- quote_char: '"',
78
- skip_rows: 0,
79
- dtypes: nil,
80
- null_values: nil,
81
- ignore_errors: false,
82
- parse_dates: false,
83
- n_threads: nil,
84
- infer_schema_length: 100,
85
- batch_size: 8192,
86
- n_rows: nil,
87
- encoding: "utf8",
88
- low_memory: false,
89
- rechunk: true,
90
- skip_rows_after_header: 0,
91
- row_count_name: nil,
92
- row_count_offset: 0,
93
- sample_size: 1024,
94
- eol_char: "\n",
95
- truncate_ragged_lines: false
96
- )
97
- if Utils.pathlike?(file)
98
- path = Utils.normalise_filepath(file)
99
- else
100
- path = nil
101
- # if defined?(StringIO) && file.is_a?(StringIO)
102
- # file = file.string
103
- # end
104
- end
105
-
106
- dtype_list = nil
107
- dtype_slice = nil
108
- if !dtypes.nil?
109
- if dtypes.is_a?(Hash)
110
- dtype_list = []
111
- dtypes.each do|k, v|
112
- dtype_list << [k, Utils.rb_type_to_dtype(v)]
113
- end
114
- elsif dtypes.is_a?(::Array)
115
- dtype_slice = dtypes
116
- else
117
- raise ArgumentError, "dtype arg should be list or dict"
118
- end
119
- end
120
-
121
- processed_null_values = Utils._process_null_values(null_values)
122
-
123
- if columns.is_a?(::String)
124
- columns = [columns]
125
- end
126
- if file.is_a?(::String) && file.include?("*")
127
- dtypes_dict = nil
128
- if !dtype_list.nil?
129
- dtypes_dict = dtype_list.to_h
130
- end
131
- if !dtype_slice.nil?
132
- raise ArgumentError, "cannot use glob patterns and unnamed dtypes as `dtypes` argument; Use dtypes: Mapping[str, Type[DataType]"
133
- end
134
- scan = Polars.scan_csv(
135
- file,
136
- has_header: has_header,
137
- sep: sep,
138
- comment_char: comment_char,
139
- quote_char: quote_char,
140
- skip_rows: skip_rows,
141
- dtypes: dtypes_dict,
142
- null_values: null_values,
143
- ignore_errors: ignore_errors,
144
- infer_schema_length: infer_schema_length,
145
- n_rows: n_rows,
146
- low_memory: low_memory,
147
- rechunk: rechunk,
148
- skip_rows_after_header: skip_rows_after_header,
149
- row_count_name: row_count_name,
150
- row_count_offset: row_count_offset,
151
- eol_char: eol_char,
152
- truncate_ragged_lines: truncate_ragged_lines
153
- )
154
- if columns.nil?
155
- return _from_rbdf(scan.collect._df)
156
- elsif is_str_sequence(columns, allow_str: false)
157
- return _from_rbdf(scan.select(columns).collect._df)
158
- else
159
- raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: List[str]"
160
- end
161
- end
162
-
163
- projection, columns = Utils.handle_projection_columns(columns)
164
-
165
- _from_rbdf(
166
- RbDataFrame.read_csv(
167
- file,
168
- infer_schema_length,
169
- batch_size,
170
- has_header,
171
- ignore_errors,
172
- n_rows,
173
- skip_rows,
174
- projection,
175
- sep,
176
- rechunk,
177
- columns,
178
- encoding,
179
- n_threads,
180
- path,
181
- dtype_list,
182
- dtype_slice,
183
- low_memory,
184
- comment_char,
185
- quote_char,
186
- processed_null_values,
187
- parse_dates,
188
- skip_rows_after_header,
189
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
190
- sample_size,
191
- eol_char,
192
- truncate_ragged_lines
193
- )
194
- )
195
- end
196
-
197
- # @private
198
- def self._read_parquet(
199
- source,
200
- columns: nil,
201
- n_rows: nil,
202
- parallel: "auto",
203
- row_count_name: nil,
204
- row_count_offset: 0,
205
- low_memory: false,
206
- use_statistics: true,
207
- rechunk: true
208
- )
209
- if Utils.pathlike?(source)
210
- source = Utils.normalise_filepath(source)
211
- end
212
- if columns.is_a?(::String)
213
- columns = [columns]
214
- end
215
-
216
- if source.is_a?(::String) && source.include?("*") && Utils.local_file?(source)
217
- scan =
218
- Polars.scan_parquet(
219
- source,
220
- n_rows: n_rows,
221
- rechunk: true,
222
- parallel: parallel,
223
- row_count_name: row_count_name,
224
- row_count_offset: row_count_offset,
225
- low_memory: low_memory
226
- )
227
-
228
- if columns.nil?
229
- return self._from_rbdf(scan.collect._df)
230
- elsif Utils.is_str_sequence(columns, allow_str: false)
231
- return self._from_rbdf(scan.select(columns).collect._df)
232
- else
233
- raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: Array[String]"
234
- end
235
- end
236
-
237
- projection, columns = Utils.handle_projection_columns(columns)
238
- _from_rbdf(
239
- RbDataFrame.read_parquet(
240
- source,
241
- columns,
242
- projection,
243
- n_rows,
244
- parallel,
245
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
246
- low_memory,
247
- use_statistics,
248
- rechunk
249
- )
250
- )
251
- end
252
-
253
- # @private
254
- def self._read_avro(file, columns: nil, n_rows: nil)
255
- if Utils.pathlike?(file)
256
- file = Utils.normalise_filepath(file)
257
- end
258
- projection, columns = Utils.handle_projection_columns(columns)
259
- _from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
260
- end
261
-
262
- # @private
263
- def self._read_ipc(
264
- file,
265
- columns: nil,
266
- n_rows: nil,
267
- row_count_name: nil,
268
- row_count_offset: 0,
269
- rechunk: true,
270
- memory_map: true
271
- )
272
- if Utils.pathlike?(file)
273
- file = Utils.normalise_filepath(file)
274
- end
275
- if columns.is_a?(::String)
276
- columns = [columns]
277
- end
278
-
279
- if file.is_a?(::String) && file.include?("*")
280
- raise Todo
281
- end
282
-
283
- projection, columns = Utils.handle_projection_columns(columns)
284
- _from_rbdf(
285
- RbDataFrame.read_ipc(
286
- file,
287
- columns,
288
- projection,
289
- n_rows,
290
- Utils._prepare_row_count_args(row_count_name, row_count_offset),
291
- memory_map
292
- )
293
- )
294
- end
295
-
296
- # @private
297
- def self._read_json(file)
298
- if Utils.pathlike?(file)
299
- file = Utils.normalise_filepath(file)
300
- end
301
-
302
- _from_rbdf(RbDataFrame.read_json(file))
303
- end
304
-
305
- # @private
306
- def self._read_ndjson(file)
307
- if Utils.pathlike?(file)
308
- file = Utils.normalise_filepath(file)
309
- end
310
-
311
- _from_rbdf(RbDataFrame.read_ndjson(file))
312
- end
313
-
314
49
  # Get the shape of the DataFrame.
315
50
  #
316
51
  # @return [Array]
@@ -419,6 +154,13 @@ module Polars
419
154
  _df.dtypes
420
155
  end
421
156
 
157
+ # Get flags that are set on the columns of this DataFrame.
158
+ #
159
+ # @return [Hash]
160
+ def flags
161
+ columns.to_h { |name| [name, self[name].flags] }
162
+ end
163
+
422
164
  # Get the schema.
423
165
  #
424
166
  # @return [Hash]
@@ -845,7 +587,7 @@ module Polars
845
587
  row_oriented: false
846
588
  )
847
589
  if Utils.pathlike?(file)
848
- file = Utils.normalise_filepath(file)
590
+ file = Utils.normalize_filepath(file)
849
591
  end
850
592
  to_string_io = !file.nil? && file.is_a?(StringIO)
851
593
  if file.nil? || to_string_io
@@ -884,7 +626,7 @@ module Polars
884
626
  # # => "{\"foo\":1,\"bar\":6}\n{\"foo\":2,\"bar\":7}\n{\"foo\":3,\"bar\":8}\n"
885
627
  def write_ndjson(file = nil)
886
628
  if Utils.pathlike?(file)
887
- file = Utils.normalise_filepath(file)
629
+ file = Utils.normalize_filepath(file)
888
630
  end
889
631
  to_string_io = !file.nil? && file.is_a?(StringIO)
890
632
  if file.nil? || to_string_io
@@ -991,7 +733,7 @@ module Polars
991
733
  end
992
734
 
993
735
  if Utils.pathlike?(file)
994
- file = Utils.normalise_filepath(file)
736
+ file = Utils.normalize_filepath(file)
995
737
  end
996
738
 
997
739
  _df.write_csv(
@@ -1029,7 +771,7 @@ module Polars
1029
771
  compression = "uncompressed"
1030
772
  end
1031
773
  if Utils.pathlike?(file)
1032
- file = Utils.normalise_filepath(file)
774
+ file = Utils.normalize_filepath(file)
1033
775
  end
1034
776
 
1035
777
  _df.write_avro(file, compression)
@@ -1050,7 +792,7 @@ module Polars
1050
792
  file.set_encoding(Encoding::BINARY)
1051
793
  end
1052
794
  if Utils.pathlike?(file)
1053
- file = Utils.normalise_filepath(file)
795
+ file = Utils.normalize_filepath(file)
1054
796
  end
1055
797
 
1056
798
  if compression.nil?
@@ -1061,6 +803,47 @@ module Polars
1061
803
  return_bytes ? file.string : nil
1062
804
  end
1063
805
 
806
+ # Write to Arrow IPC record batch stream.
807
+ #
808
+ # See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html.
809
+ #
810
+ # @param file [Object]
811
+ # Path or writable file-like object to which the IPC record batch data will
812
+ # be written. If set to `nil`, the output is returned as a binary-encoded String.
813
+ # @param compression ['uncompressed', 'lz4', 'zstd']
814
+ # Compression method. Defaults to "uncompressed".
815
+ #
816
+ # @return [String, nil]
817
+ #
818
+ # @example
819
+ # df = Polars::DataFrame.new(
820
+ # {
821
+ # "foo" => [1, 2, 3, 4, 5],
822
+ # "bar" => [6, 7, 8, 9, 10],
823
+ # "ham" => ["a", "b", "c", "d", "e"]
824
+ # }
825
+ # )
826
+ # df.write_ipc_stream("new_file.arrow")
827
+ def write_ipc_stream(
828
+ file,
829
+ compression: "uncompressed"
830
+ )
831
+ return_bytes = file.nil?
832
+ if return_bytes
833
+ file = StringIO.new
834
+ file.set_encoding(Encoding::BINARY)
835
+ elsif Utils.pathlike?(file)
836
+ file = Utils.normalize_filepath(file)
837
+ end
838
+
839
+ if compression.nil?
840
+ compression = "uncompressed"
841
+ end
842
+
843
+ _df.write_ipc_stream(file, compression)
844
+ return_bytes ? file.string : nil
845
+ end
846
+
1064
847
  # Write to Apache Parquet file.
1065
848
  #
1066
849
  # @param file [String, Pathname, StringIO]
@@ -1097,7 +880,7 @@ module Polars
1097
880
  compression = "uncompressed"
1098
881
  end
1099
882
  if Utils.pathlike?(file)
1100
- file = Utils.normalise_filepath(file)
883
+ file = Utils.normalize_filepath(file)
1101
884
  end
1102
885
 
1103
886
  _df.write_parquet(
@@ -1773,10 +1556,7 @@ module Polars
1773
1556
  # # │ 3 ┆ 8 ┆ c │
1774
1557
  # # └─────┴─────┴─────┘
1775
1558
  def drop_nulls(subset: nil)
1776
- if subset.is_a?(::String)
1777
- subset = [subset]
1778
- end
1779
- _from_rbdf(_df.drop_nulls(subset))
1559
+ lazy.drop_nulls(subset: subset).collect(_eager: true)
1780
1560
  end
1781
1561
 
1782
1562
  # Offers a structured way to apply a sequence of user-defined functions (UDFs).
@@ -1838,16 +1618,16 @@ module Polars
1838
1618
  # df.with_row_index
1839
1619
  # # =>
1840
1620
  # # shape: (3, 3)
1841
- # # ┌────────┬─────┬─────┐
1842
- # # │ row_nr ┆ a ┆ b │
1843
- # # │ --- ┆ --- ┆ --- │
1844
- # # │ u32 ┆ i64 ┆ i64 │
1845
- # # ╞════════╪═════╪═════╡
1846
- # # │ 0 ┆ 1 ┆ 2 │
1847
- # # │ 1 ┆ 3 ┆ 4 │
1848
- # # │ 2 ┆ 5 ┆ 6 │
1849
- # # └────────┴─────┴─────┘
1850
- def with_row_index(name: "row_nr", offset: 0)
1621
+ # # ┌───────┬─────┬─────┐
1622
+ # # │ index ┆ a ┆ b │
1623
+ # # │ --- ┆ --- ┆ --- │
1624
+ # # │ u32 ┆ i64 ┆ i64 │
1625
+ # # ╞═══════╪═════╪═════╡
1626
+ # # │ 0 ┆ 1 ┆ 2 │
1627
+ # # │ 1 ┆ 3 ┆ 4 │
1628
+ # # │ 2 ┆ 5 ┆ 6 │
1629
+ # # └───────┴─────┴─────┘
1630
+ def with_row_index(name: "index", offset: 0)
1851
1631
  _from_rbdf(_df.with_row_index(name, offset))
1852
1632
  end
1853
1633
  alias_method :with_row_count, :with_row_index
@@ -2136,16 +1916,16 @@ module Polars
2136
1916
  # )
2137
1917
  # # =>
2138
1918
  # # shape: (4, 3)
2139
- # # ┌─────────────────────┬────────────┬───────────────────────────────────┐
2140
- # # │ time ┆ time_count ┆ time_agg_list
2141
- # # │ --- ┆ --- ┆ ---
2142
- # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
2143
- # # ╞═════════════════════╪════════════╪═══════════════════════════════════╡
2144
- # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16…
2145
- # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16…
2146
- # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16…
2147
- # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
2148
- # # └─────────────────────┴────────────┴───────────────────────────────────┘
1919
+ # # ┌─────────────────────┬────────────┬─────────────────────────────────┐
1920
+ # # │ time ┆ time_count ┆ time_agg_list
1921
+ # # │ --- ┆ --- ┆ ---
1922
+ # # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
1923
+ # # ╞═════════════════════╪════════════╪═════════════════════════════════╡
1924
+ # # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-…
1925
+ # # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-…
1926
+ # # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-…
1927
+ # # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
1928
+ # # └─────────────────────┴────────────┴─────────────────────────────────┘
2149
1929
  #
2150
1930
  # @example When closed="both" the time values at the window boundaries belong to 2 groups.
2151
1931
  # df.group_by_dynamic("time", every: "1h", closed: "both").agg(
@@ -2620,7 +2400,7 @@ module Polars
2620
2400
  # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})
2621
2401
  #
2622
2402
  # @example Return a DataFrame by mapping each row to a tuple:
2623
- # df.apply { |t| [t[0] * 2, t[1] * 3] }
2403
+ # df.map_rows { |t| [t[0] * 2, t[1] * 3] }
2624
2404
  # # =>
2625
2405
  # # shape: (3, 2)
2626
2406
  # # ┌──────────┬──────────┐
@@ -2634,7 +2414,7 @@ module Polars
2634
2414
  # # └──────────┴──────────┘
2635
2415
  #
2636
2416
  # @example Return a Series by mapping each row to a scalar:
2637
- # df.apply { |t| t[0] * 2 + t[1] }
2417
+ # df.map_rows { |t| t[0] * 2 + t[1] }
2638
2418
  # # =>
2639
2419
  # # shape: (3, 1)
2640
2420
  # # ┌───────┐
@@ -2646,14 +2426,15 @@ module Polars
2646
2426
  # # │ 9 │
2647
2427
  # # │ 14 │
2648
2428
  # # └───────┘
2649
- def apply(return_dtype: nil, inference_size: 256, &f)
2650
- out, is_df = _df.apply(f, return_dtype, inference_size)
2429
+ def map_rows(return_dtype: nil, inference_size: 256, &f)
2430
+ out, is_df = _df.map_rows(f, return_dtype, inference_size)
2651
2431
  if is_df
2652
2432
  _from_rbdf(out)
2653
2433
  else
2654
2434
  _from_rbdf(Utils.wrap_s(out).to_frame._df)
2655
2435
  end
2656
2436
  end
2437
+ alias_method :apply, :map_rows
2657
2438
 
2658
2439
  # Return a new DataFrame with the column added or replaced.
2659
2440
  #
@@ -3774,7 +3555,7 @@ module Polars
3774
3555
  # # ┌─────────┐
3775
3556
  # # │ literal │
3776
3557
  # # │ --- │
3777
- # # │ i64
3558
+ # # │ i32
3778
3559
  # # ╞═════════╡
3779
3560
  # # │ 0 │
3780
3561
  # # │ 0 │
@@ -5255,7 +5036,7 @@ module Polars
5255
5036
  elsif data[0].is_a?(Hash)
5256
5037
  column_names, dtypes = _unpack_schema(columns)
5257
5038
  schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
5258
- rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema, schema_overrides)
5039
+ rbdf = RbDataFrame.from_hashes(data, schema, schema_overrides, false, infer_schema_length)
5259
5040
  if column_names
5260
5041
  rbdf = _post_apply_columns(rbdf, column_names)
5261
5042
  end
@@ -5289,7 +5070,7 @@ module Polars
5289
5070
  if unpack_nested
5290
5071
  raise Todo
5291
5072
  else
5292
- rbdf = RbDataFrame.read_rows(
5073
+ rbdf = RbDataFrame.from_rows(
5293
5074
  data,
5294
5075
  infer_schema_length,
5295
5076
  local_schema_override.any? ? local_schema_override : nil
@@ -215,6 +215,7 @@ module Polars
215
215
  offset = "0ns"
216
216
  end
217
217
 
218
+ every = Utils.parse_as_expression(every, str_as_lit: true)
218
219
  Utils.wrap_expr(
219
220
  _rbexpr.dt_round(
220
221
  Utils._timedelta_to_pl_duration(every),
@@ -66,6 +66,8 @@ module Polars
66
66
  if !out.nil?
67
67
  if s.dtype == Date
68
68
  return Utils._to_ruby_date(out.to_i)
69
+ elsif [Datetime, Duration, Time].include?(s.dtype)
70
+ return out
69
71
  else
70
72
  return Utils._to_ruby_datetime(out.to_i, s.time_unit)
71
73
  end
@@ -93,10 +95,12 @@ module Polars
93
95
  # # => 2001-01-02 00:00:00 UTC
94
96
  def mean
95
97
  s = Utils.wrap_s(_s)
96
- out = s.mean.to_i
98
+ out = s.mean
97
99
  if !out.nil?
98
100
  if s.dtype == Date
99
101
  return Utils._to_ruby_date(out.to_i)
102
+ elsif [Datetime, Duration, Time].include?(s.dtype)
103
+ return out
100
104
  else
101
105
  return Utils._to_ruby_datetime(out.to_i, s.time_unit)
102
106
  end
@@ -32,7 +32,7 @@ module Polars
32
32
  @start_by = start_by
33
33
  end
34
34
 
35
- def agg(aggs)
35
+ def agg(*aggs, **named_aggs)
36
36
  @df.lazy
37
37
  .group_by_dynamic(
38
38
  @time_column,
@@ -45,7 +45,7 @@ module Polars
45
45
  by: @by,
46
46
  start_by: @start_by
47
47
  )
48
- .agg(aggs)
48
+ .agg(*aggs, **named_aggs)
49
49
  .collect(no_optimization: true, string_cache: false)
50
50
  end
51
51
  end
@@ -3,6 +3,10 @@ module Polars
3
3
  # Base class for all Polars errors.
4
4
  class Error < StandardError; end
5
5
 
6
+ # @private
7
+ # Exception raised when an operation is not allowed (or possible) against a given object or data structure.
8
+ class InvalidOperationError < Error; end
9
+
6
10
  # @private
7
11
  # Exception raised when an unsupported testing assert is made.
8
12
  class InvalidAssert < Error; end