polars-df 0.1.3 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/Cargo.lock +142 -11
  4. data/Cargo.toml +5 -0
  5. data/ext/polars/Cargo.toml +17 -1
  6. data/ext/polars/src/apply/dataframe.rs +292 -0
  7. data/ext/polars/src/apply/mod.rs +254 -0
  8. data/ext/polars/src/apply/series.rs +1173 -0
  9. data/ext/polars/src/conversion.rs +180 -5
  10. data/ext/polars/src/dataframe.rs +146 -1
  11. data/ext/polars/src/error.rs +12 -0
  12. data/ext/polars/src/lazy/apply.rs +34 -2
  13. data/ext/polars/src/lazy/dataframe.rs +74 -3
  14. data/ext/polars/src/lazy/dsl.rs +136 -0
  15. data/ext/polars/src/lib.rs +199 -1
  16. data/ext/polars/src/list_construction.rs +100 -0
  17. data/ext/polars/src/series.rs +331 -0
  18. data/ext/polars/src/utils.rs +25 -0
  19. data/lib/polars/cat_name_space.rb +54 -0
  20. data/lib/polars/convert.rb +100 -0
  21. data/lib/polars/data_frame.rb +1558 -60
  22. data/lib/polars/date_time_expr.rb +2 -2
  23. data/lib/polars/date_time_name_space.rb +1484 -0
  24. data/lib/polars/dynamic_group_by.rb +49 -0
  25. data/lib/polars/expr.rb +4072 -107
  26. data/lib/polars/expr_dispatch.rb +8 -0
  27. data/lib/polars/functions.rb +192 -3
  28. data/lib/polars/group_by.rb +44 -3
  29. data/lib/polars/io.rb +20 -4
  30. data/lib/polars/lazy_frame.rb +800 -26
  31. data/lib/polars/lazy_functions.rb +687 -43
  32. data/lib/polars/lazy_group_by.rb +1 -0
  33. data/lib/polars/list_expr.rb +502 -5
  34. data/lib/polars/list_name_space.rb +346 -0
  35. data/lib/polars/rolling_group_by.rb +35 -0
  36. data/lib/polars/series.rb +934 -62
  37. data/lib/polars/string_expr.rb +189 -13
  38. data/lib/polars/string_name_space.rb +690 -0
  39. data/lib/polars/struct_name_space.rb +64 -0
  40. data/lib/polars/utils.rb +44 -0
  41. data/lib/polars/version.rb +1 -1
  42. data/lib/polars.rb +14 -1
  43. metadata +15 -3
@@ -3,11 +3,19 @@ module Polars
3
3
  module ExprDispatch
4
4
  private
5
5
 
6
+ def self.included(base)
7
+ base.attr_accessor :_s
8
+ base.singleton_class.attr_accessor :_accessor
9
+ end
10
+
6
11
  def method_missing(method, ...)
7
12
  return super unless self.class.method_defined?(method)
8
13
 
14
+ namespace = self.class._accessor
15
+
9
16
  s = Utils.wrap_s(_s)
10
17
  expr = Utils.col(s.name)
18
+ expr = expr.send(namespace) if namespace
11
19
  s.to_frame.select(expr.send(method, ...)).to_series
12
20
  end
13
21
  end
@@ -199,12 +199,201 @@ module Polars
199
199
  dt_range
200
200
  end
201
201
 
202
- # def cut
203
- # end
202
+ # Bin values into discrete values.
203
+ #
204
+ # @param s [Series]
205
+ # Series to bin.
206
+ # @param bins [Array]
207
+ # Bins to create.
208
+ # @param labels [Array]
209
+ # Labels to assign to the bins. If given, the length of labels must be
210
+ # `bins.length + 1`.
211
+ # @param break_point_label [String]
212
+ # Name given to the breakpoint column.
213
+ # @param category_label [String]
214
+ # Name given to the category column.
215
+ #
216
+ # @return [DataFrame]
217
+ #
218
+ # @note
219
+ # This functionality is experimental and may change without it being considered a
220
+ # breaking change.
221
+ #
222
+ # @example
223
+ # a = Polars::Series.new("a", 13.times.map { |i| (-30 + i * 5) / 10.0 })
224
+ # Polars.cut(a, [-1, 1])
225
+ # # =>
226
+ # # shape: (12, 3)
227
+ # # ┌──────┬─────────────┬──────────────┐
228
+ # # │ a ┆ break_point ┆ category │
229
+ # # │ --- ┆ --- ┆ --- │
230
+ # # │ f64 ┆ f64 ┆ cat │
231
+ # # ╞══════╪═════════════╪══════════════╡
232
+ # # │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │
233
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
234
+ # # │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │
235
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
236
+ # # │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │
237
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
238
+ # # │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │
239
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
240
+ # # │ ... ┆ ... ┆ ... │
241
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
242
+ # # │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │
243
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
244
+ # # │ 1.5 ┆ inf ┆ (1.0, inf] │
245
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
246
+ # # │ 2.0 ┆ inf ┆ (1.0, inf] │
247
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
248
+ # # │ 2.5 ┆ inf ┆ (1.0, inf] │
249
+ # # └──────┴─────────────┴──────────────┘
250
+ # def cut(
251
+ # s,
252
+ # bins,
253
+ # labels: nil,
254
+ # break_point_label: "break_point",
255
+ # category_label: "category"
256
+ # )
257
+ # var_nm = s.name
204
258
 
205
- # def align_frames
259
+ # cuts_df = DataFrame.new(
260
+ # [
261
+ # Series.new(
262
+ # break_point_label, bins, dtype: :f64
263
+ # ).extend_constant(Float::INFINITY, 1)
264
+ # ]
265
+ # )
266
+
267
+ # if labels
268
+ # if labels.length != bins.length + 1
269
+ # raise ArgumentError, "expected more labels"
270
+ # end
271
+ # cuts_df = cuts_df.with_column(Series.new(category_label, labels))
272
+ # else
273
+ # cuts_df = cuts_df.with_column(
274
+ # Polars.format(
275
+ # "({}, {}]",
276
+ # Polars.col(break_point_label).shift_and_fill(1, -Float::INFINITY),
277
+ # Polars.col(break_point_label)
278
+ # ).alias(category_label)
279
+ # )
280
+ # end
281
+
282
+ # cuts_df = cuts_df.with_column(Polars.col(category_label).cast(:cat))
283
+
284
+ # s.cast(:f64)
285
+ # .sort
286
+ # .to_frame
287
+ # .join_asof(
288
+ # cuts_df,
289
+ # left_on: var_nm,
290
+ # right_on: break_point_label,
291
+ # strategy: "forward"
292
+ # )
206
293
  # end
207
294
 
295
+ # Align a sequence of frames using the unique values from one or more columns as a key.
296
+ #
297
+ # Frames that do not contain the given key values have rows injected (with nulls
298
+ # filling the non-key columns), and each resulting frame is sorted by the key.
299
+ #
300
+ # The original column order of input frames is not changed unless `select` is
301
+ # specified (in which case the final column order is determined from that).
302
+ #
303
+ # Note that this does not result in a joined frame - you receive the same number
304
+ # of frames back that you passed in, but each is now aligned by key and has
305
+ # the same number of rows.
306
+ #
307
+ # @param frames [Array]
308
+ # Sequence of DataFrames or LazyFrames.
309
+ # @param on [Object]
310
+ # One or more columns whose unique values will be used to align the frames.
311
+ # @param select [Object]
312
+ # Optional post-alignment column select to constrain and/or order
313
+ # the columns returned from the newly aligned frames.
314
+ # @param reverse [Object]
315
+ # Sort the alignment column values in descending order; can be a single
316
+ # boolean or a list of booleans associated with each column in `on`.
317
+ #
318
+ # @return [Object]
319
+ #
320
+ # @example
321
+ # df1 = Polars::DataFrame.new(
322
+ # {
323
+ # "dt" => [Date.new(2022, 9, 1), Date.new(2022, 9, 2), Date.new(2022, 9, 3)],
324
+ # "x" => [3.5, 4.0, 1.0],
325
+ # "y" => [10.0, 2.5, 1.5]
326
+ # }
327
+ # )
328
+ # df2 = Polars::DataFrame.new(
329
+ # {
330
+ # "dt" => [Date.new(2022, 9, 2), Date.new(2022, 9, 3), Date.new(2022, 9, 1)],
331
+ # "x" => [8.0, 1.0, 3.5],
332
+ # "y" => [1.5, 12.0, 5.0]
333
+ # }
334
+ # )
335
+ # df3 = Polars::DataFrame.new(
336
+ # {
337
+ # "dt" => [Date.new(2022, 9, 3), Date.new(2022, 9, 2)],
338
+ # "x" => [2.0, 5.0],
339
+ # "y" => [2.5, 2.0]
340
+ # }
341
+ # )
342
+ # af1, af2, af3 = Polars.align_frames(
343
+ # df1, df2, df3, on: "dt", select: ["x", "y"]
344
+ # )
345
+ # (af1 * af2 * af3).fill_null(0).select(Polars.sum(Polars.col("*")).alias("dot"))
346
+ # # =>
347
+ # # shape: (3, 1)
348
+ # # ┌───────┐
349
+ # # │ dot │
350
+ # # │ --- │
351
+ # # │ f64 │
352
+ # # ╞═══════╡
353
+ # # │ 0.0 │
354
+ # # ├╌╌╌╌╌╌╌┤
355
+ # # │ 167.5 │
356
+ # # ├╌╌╌╌╌╌╌┤
357
+ # # │ 47.0 │
358
+ # # └───────┘
359
+ def align_frames(
360
+ *frames,
361
+ on:,
362
+ select: nil,
363
+ reverse: false
364
+ )
365
+ if frames.empty?
366
+ return []
367
+ elsif frames.map(&:class).uniq.length != 1
368
+ raise TypeError, "Input frames must be of a consistent type (all LazyFrame or all DataFrame)"
369
+ end
370
+
371
+ # establish the superset of all "on" column values, sort, and cache
372
+ eager = frames[0].is_a?(DataFrame)
373
+ alignment_frame = (
374
+ concat(frames.map { |df| df.lazy.select(on) })
375
+ .unique(maintain_order: false)
376
+ .sort(on, reverse: reverse)
377
+ )
378
+ alignment_frame = (
379
+ eager ? alignment_frame.collect.lazy : alignment_frame.cache
380
+ )
381
+ # finally, align all frames
382
+ aligned_frames =
383
+ frames.map do |df|
384
+ alignment_frame.join(
385
+ df.lazy,
386
+ on: alignment_frame.columns,
387
+ how: "left"
388
+ ).select(df.columns)
389
+ end
390
+ if !select.nil?
391
+ aligned_frames = aligned_frames.map { |df| df.select(select) }
392
+ end
393
+
394
+ eager ? aligned_frames.map(&:collect) : aligned_frames
395
+ end
396
+
208
397
  # Return a new Series of given length and type, filled with ones.
209
398
  #
210
399
  # @param n [Integer]
@@ -1,4 +1,5 @@
1
1
  module Polars
2
+ # Starts a new GroupBy operation.
2
3
  class GroupBy
3
4
  # @private
4
5
  attr_accessor :_df, :_dataframe_class, :by, :maintain_order
@@ -11,7 +12,48 @@ module Polars
11
12
  self.maintain_order = maintain_order
12
13
  end
13
14
 
14
- # def apply
15
+ # Apply a custom/user-defined function (UDF) over the groups as a sub-DataFrame.
16
+ #
17
+ # Implementing logic using a Ruby function is almost always _significantly_
18
+ # slower and more memory intensive than implementing the same logic using
19
+ # the native expression API because:
20
+ #
21
+ # - The native expression engine runs in Rust; UDFs run in Ruby.
22
+ # - Use of Ruby UDFs forces the DataFrame to be materialized in memory.
23
+ # - Polars-native expressions can be parallelised (UDFs cannot).
24
+ # - Polars-native expressions can be logically optimised (UDFs cannot).
25
+ #
26
+ # Wherever possible you should strongly prefer the native expression API
27
+ # to achieve the best performance.
28
+ #
29
+ # @return [DataFrame]
30
+ #
31
+ # @example
32
+ # df = Polars::DataFrame.new(
33
+ # {
34
+ # "id" => [0, 1, 2, 3, 4],
35
+ # "color" => ["red", "green", "green", "red", "red"],
36
+ # "shape" => ["square", "triangle", "square", "triangle", "square"]
37
+ # }
38
+ # )
39
+ # df.groupby("color").apply { |group_df| group_df.sample(2) }
40
+ # # =>
41
+ # # shape: (4, 3)
42
+ # # ┌─────┬───────┬──────────┐
43
+ # # │ id ┆ color ┆ shape │
44
+ # # │ --- ┆ --- ┆ --- │
45
+ # # │ i64 ┆ str ┆ str │
46
+ # # ╞═════╪═══════╪══════════╡
47
+ # # │ 1 ┆ green ┆ triangle │
48
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
49
+ # # │ 2 ┆ green ┆ square │
50
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
51
+ # # │ 4 ┆ red ┆ square │
52
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
53
+ # # │ 3 ┆ red ┆ triangle │
54
+ # # └─────┴───────┴──────────┘
55
+ # def apply(&f)
56
+ # _dataframe_class._from_rbdf(_df.groupby_apply(by, f))
15
57
  # end
16
58
 
17
59
  # Use multiple aggregations on columns.
@@ -181,8 +223,7 @@ module Polars
181
223
  _dataframe_class._from_rbdf(df._df)
182
224
  end
183
225
 
184
- # def pivot
185
- # end
226
+ # pivot is deprecated
186
227
 
187
228
  # Aggregate the first values in the group.
188
229
  #
data/lib/polars/io.rb CHANGED
@@ -59,7 +59,7 @@ module Polars
59
59
  # Lossy means that invalid utf8 values are replaced with `�`
60
60
  # characters. When using other encodings than `utf8` or
61
61
  # `utf8-lossy`, the input is first decoded in memory with
62
- # python.
62
+ # Ruby.
63
63
  # @param low_memory [Boolean]
64
64
  # Reduce memory usage at expense of performance.
65
65
  # @param rechunk [Boolean]
@@ -183,7 +183,7 @@ module Polars
183
183
  # @param has_header [Boolean]
184
184
  # Indicate if the first row of dataset is a header or not.
185
185
  # If set to false, column names will be autogenerated in the
186
- # following format: ``column_x``, with ``x`` being an
186
+ # following format: `column_x`, with `x` being an
187
187
  # enumeration over every column in the dataset starting at 1.
188
188
  # @param sep [String]
189
189
  # Single byte character to use as delimiter in the file.
@@ -451,8 +451,24 @@ module Polars
451
451
  )
452
452
  end
453
453
 
454
- # def read_avro
455
- # end
454
+ # Read into a DataFrame from Apache Avro format.
455
+ #
456
+ # @param file [Object]
457
+ # Path to a file or a file-like object.
458
+ # @param columns [Object]
459
+ # Columns to select. Accepts a list of column indices (starting at zero) or a list
460
+ # of column names.
461
+ # @param n_rows [Integer]
462
+ # Stop reading from Apache Avro file after reading `n_rows`.
463
+ #
464
+ # @return [DataFrame]
465
+ def read_avro(file, columns: nil, n_rows: nil)
466
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
467
+ file = Utils.format_path(file)
468
+ end
469
+
470
+ DataFrame._read_avro(file, n_rows: n_rows, columns: columns)
471
+ end
456
472
 
457
473
  # Read into a DataFrame from Arrow IPC (Feather v2) file.
458
474
  #