polars-df 0.2.0-arm64-darwin

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +33 -0
  4. data/Cargo.lock +2230 -0
  5. data/Cargo.toml +10 -0
  6. data/LICENSE-THIRD-PARTY.txt +38856 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +91 -0
  9. data/lib/polars/3.0/polars.bundle +0 -0
  10. data/lib/polars/3.1/polars.bundle +0 -0
  11. data/lib/polars/3.2/polars.bundle +0 -0
  12. data/lib/polars/batched_csv_reader.rb +96 -0
  13. data/lib/polars/cat_expr.rb +52 -0
  14. data/lib/polars/cat_name_space.rb +54 -0
  15. data/lib/polars/convert.rb +100 -0
  16. data/lib/polars/data_frame.rb +4833 -0
  17. data/lib/polars/data_types.rb +122 -0
  18. data/lib/polars/date_time_expr.rb +1418 -0
  19. data/lib/polars/date_time_name_space.rb +1484 -0
  20. data/lib/polars/dynamic_group_by.rb +52 -0
  21. data/lib/polars/exceptions.rb +20 -0
  22. data/lib/polars/expr.rb +5307 -0
  23. data/lib/polars/expr_dispatch.rb +22 -0
  24. data/lib/polars/functions.rb +453 -0
  25. data/lib/polars/group_by.rb +558 -0
  26. data/lib/polars/io.rb +814 -0
  27. data/lib/polars/lazy_frame.rb +2442 -0
  28. data/lib/polars/lazy_functions.rb +1195 -0
  29. data/lib/polars/lazy_group_by.rb +93 -0
  30. data/lib/polars/list_expr.rb +610 -0
  31. data/lib/polars/list_name_space.rb +346 -0
  32. data/lib/polars/meta_expr.rb +54 -0
  33. data/lib/polars/rolling_group_by.rb +35 -0
  34. data/lib/polars/series.rb +3730 -0
  35. data/lib/polars/slice.rb +104 -0
  36. data/lib/polars/string_expr.rb +972 -0
  37. data/lib/polars/string_name_space.rb +690 -0
  38. data/lib/polars/struct_expr.rb +100 -0
  39. data/lib/polars/struct_name_space.rb +64 -0
  40. data/lib/polars/utils.rb +192 -0
  41. data/lib/polars/version.rb +4 -0
  42. data/lib/polars/when.rb +16 -0
  43. data/lib/polars/when_then.rb +19 -0
  44. data/lib/polars-df.rb +1 -0
  45. data/lib/polars.rb +50 -0
  46. metadata +89 -0
@@ -0,0 +1,22 @@
1
module Polars
  # @private
  #
  # Mixin that answers method calls by running the same-named expression
  # method against the wrapped series held in `_s`.
  module ExprDispatch
    private

    # Hook run on `include`: gives the host class an instance-level `_s`
    # accessor and a class-level `_accessor` accessor.
    def self.included(base)
      base.attr_accessor(:_s)
      base.singleton_class.attr_accessor(:_accessor)
    end

    # Only methods the host class itself declares are dispatched; anything
    # else falls through to the default `method_missing` behaviour.
    # NOTE(review): presumably the host declares stubs that call `super`,
    # which is how a declared method ends up here - confirm against callers.
    def method_missing(method, ...)
      unless self.class.method_defined?(method)
        return super
      end

      accessor = self.class._accessor
      series = Utils.wrap_s(_s)

      # Start from a column expression named after the series, optionally
      # stepping into the namespace configured on the class.
      column = Utils.col(series.name)
      column = column.send(accessor) if accessor

      frame = series.to_frame
      frame.select(column.send(method, ...)).to_series
    end
  end
end
@@ -0,0 +1,453 @@
1
+ module Polars
2
+ module Functions
3
+ # Convert categorical variables into dummy/indicator variables.
4
+ #
5
+ # @param df [DataFrame]
6
+ # DataFrame to convert.
7
+ # @param columns [Array, nil]
8
+ # A subset of columns to convert to dummy variables. `nil` means
9
+ # "all columns".
10
+ #
11
+ # @return [DataFrame]
12
def get_dummies(df, columns: nil)
  # Module-level convenience: the frame itself performs the conversion,
  # this method only forwards the column subset.
  dummy_options = {columns: columns}
  df.to_dummies(**dummy_options)
end
15
+
16
+ # Aggregate multiple DataFrames/Series to a single DataFrame/Series.
17
+ #
18
+ # @param items [Object]
19
+ # DataFrames/Series/LazyFrames to concatenate.
20
+ # @param rechunk [Boolean]
21
+ # Make sure that all data is in contiguous memory.
22
+ # @param how ["vertical", "diagonal", "horizontal"]
23
+ # Lazy only supports the 'vertical' strategy.
24
+ #
25
+ # - Vertical: applies multiple `vstack` operations.
26
+ # - Diagonal: finds a union between the column schemas and fills missing column values with null.
27
+ # - Horizontal: stacks Series horizontally and fills with nulls if the lengths don't match.
28
+ # @param parallel [Boolean]
29
+ # Only relevant for LazyFrames. This determines if the concatenated
30
+ # lazy computations may be executed in parallel.
31
+ #
32
+ # @return [Object]
33
+ #
34
+ # @example
35
+ # df1 = Polars::DataFrame.new({"a" => [1], "b" => [3]})
36
+ # df2 = Polars::DataFrame.new({"a" => [2], "b" => [4]})
37
+ # Polars.concat([df1, df2])
38
+ # # =>
39
+ # # shape: (2, 2)
40
+ # # ┌─────┬─────┐
41
+ # # │ a ┆ b │
42
+ # # │ --- ┆ --- │
43
+ # # │ i64 ┆ i64 │
44
+ # # ╞═════╪═════╡
45
+ # # │ 1 ┆ 3 │
46
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
47
+ # # │ 2 ┆ 4 │
48
+ # # └─────┴─────┘
49
def concat(items, rechunk: true, how: "vertical", parallel: true)
  raise ArgumentError, "cannot concat empty list" if items.empty?

  # The type of the first element selects the concatenation routine;
  # the remaining elements are not type-checked here.
  first = items[0]
  case first
  when DataFrame
    out =
      case how
      when "vertical" then Utils.wrap_df(_concat_df(items))
      when "diagonal" then Utils.wrap_df(_diag_concat_df(items))
      when "horizontal" then Utils.wrap_df(_hor_concat_df(items))
      else
        # Single braces: Ruby string literals need no `{{ }}` escaping, so
        # the doubled form would be printed verbatim in the message.
        raise ArgumentError, "how must be one of {'vertical', 'diagonal', 'horizontal'}, got #{how}"
      end
  when LazyFrame
    unless how == "vertical"
      raise ArgumentError, "Lazy only allows 'vertical' concat strategy."
    end

    # TODO
    # `rechunk` is handed to the lazy concat itself, so return right away
    # instead of falling through to the eager rechunk below.
    return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel))
  when Series
    # TODO
    # `how` is not consulted for Series input.
    out = Utils.wrap_s(_concat_series(items))
  when Expr
    # Fold the remaining expressions onto the first one by one.
    out = items[1..-1].reduce(first) { |acc, expr| acc.append(expr) }
  else
    raise ArgumentError, "did not expect type: #{first.class.name} in 'Polars.concat'."
  end

  rechunk ? out.rechunk : out
end
90
+
91
+ # Create a range of type `Datetime` (or `Date`).
92
+ #
93
+ # @param low [Object]
94
+ # Lower bound of the date range.
95
+ # @param high [Object]
96
+ # Upper bound of the date range.
97
+ # @param interval [Object]
98
+ # Interval periods. It can be a polars duration string, such as `3d12h4m25s`
99
+ # representing 3 days, 12 hours, 4 minutes, and 25 seconds.
100
+ # @param lazy [Boolean]
101
+ # Return an expression.
102
+ # @param closed ["both", "left", "right", "none"]
103
+ # Define whether the temporal window interval is closed or not.
104
+ # @param name [String]
105
+ # Name of the output Series.
106
+ # @param time_unit [nil, "ns", "us", "ms"]
107
+ # Set the time unit.
108
+ # @param time_zone [String]
109
+ # Optional timezone
110
+ #
111
+ # @return [Object]
112
+ #
113
+ # @note
114
+ # If both `low` and `high` are passed as date types (not datetime), and the
115
+ # interval granularity is no finer than 1d, the returned range is also of
116
+ # type date. All other permutations return a datetime Series.
117
+ #
118
+ # @example Using polars duration string to specify the interval
119
+ # Polars.date_range(Date.new(2022, 1, 1), Date.new(2022, 3, 1), "1mo", name: "drange")
120
+ # # =>
121
+ # # shape: (3,)
122
+ # # Series: 'drange' [date]
123
+ # # [
124
+ # # 2022-01-01
125
+ # # 2022-02-01
126
+ # # 2022-03-01
127
+ # # ]
128
+ #
129
+ # @example Using a compound duration string with an explicit time unit
130
+ # Polars.date_range(
131
+ # DateTime.new(1985, 1, 1),
132
+ # DateTime.new(1985, 1, 10),
133
+ # "1d12h",
134
+ # time_unit: "ms"
135
+ # )
136
+ # # =>
137
+ # # shape: (7,)
138
+ # # Series: '' [datetime[ms]]
139
+ # # [
140
+ # # 1985-01-01 00:00:00
141
+ # # 1985-01-02 12:00:00
142
+ # # 1985-01-04 00:00:00
143
+ # # 1985-01-05 12:00:00
144
+ # # 1985-01-07 00:00:00
145
+ # # 1985-01-08 12:00:00
146
+ # # 1985-01-10 00:00:00
147
+ # # ]
148
def date_range(
  low,
  high,
  interval,
  lazy: false,
  closed: "both",
  name: nil,
  time_unit: nil,
  time_zone: nil
)
  # ActiveSupport durations are recognised but not supported yet.
  raise Todo if defined?(ActiveSupport::Duration) && interval.is_a?(ActiveSupport::Duration)

  # Work with a whitespace-free duration string from here on.
  interval = interval.to_s.delete(" ")

  # Lazy path: requested explicitly, or implied by an expression bound.
  if lazy || low.is_a?(Expr) || high.is_a?(Expr)
    low_expr = Utils.expr_to_lit_or_expr(low, str_to_lit: true)
    high_expr = Utils.expr_to_lit_or_expr(high, str_to_lit: true)
    lazy_range = _rb_date_range_lazy(low_expr, high_expr, interval, closed, name, time_zone)
    return Utils.wrap_expr(lazy_range)
  end

  low, low_is_date = _ensure_datetime(low)
  high, high_is_date = _ensure_datetime(high)

  # An explicit time unit wins; otherwise use "ns" only when the interval
  # string mentions nanoseconds, and "us" in every other case.
  tu = time_unit
  if tu.nil?
    tu = interval.include?("ns") ? "ns" : "us"
  end

  start = Utils._datetime_to_pl_timestamp(low, tu)
  stop = Utils._datetime_to_pl_timestamp(high, tu)
  name = "" if name.nil?

  dt_range = Utils.wrap_s(
    _rb_date_range(start, stop, interval, closed, name, tu, time_zone)
  )

  # Cast back to a date Series only when both bounds were date-typed and
  # the interval does not end in an hour/minute/second style unit.
  dates_only =
    low_is_date &&
    high_is_date &&
    %w[h m s].none? { |unit| _interval_granularity(interval).end_with?(unit) }

  dates_only ? dt_range.cast(Date) : dt_range
end
201
+
202
+ # Bin values into discrete values.
203
+ #
204
+ # @param s [Series]
205
+ # Series to bin.
206
+ # @param bins [Array]
207
+ # Bins to create.
208
+ # @param labels [Array]
209
+ # Labels to assign to the bins. If given the length of labels must be
210
+ # len(bins) + 1.
211
+ # @param break_point_label [String]
212
+ # Name given to the breakpoint column.
213
+ # @param category_label [String]
214
+ # Name given to the category column.
215
+ #
216
+ # @return [DataFrame]
217
+ #
218
+ # @note
219
+ # This functionality is experimental and may change without it being considered a
220
+ # breaking change.
221
+ #
222
+ # @example
223
+ # a = Polars::Series.new("a", 13.times.map { |i| (-30 + i * 5) / 10.0 })
224
+ # Polars.cut(a, [-1, 1])
225
+ # # =>
226
+ # # shape: (12, 3)
227
+ # # ┌──────┬─────────────┬──────────────┐
228
+ # # │ a ┆ break_point ┆ category │
229
+ # # │ --- ┆ --- ┆ --- │
230
+ # # │ f64 ┆ f64 ┆ cat │
231
+ # # ╞══════╪═════════════╪══════════════╡
232
+ # # │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │
233
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
234
+ # # │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │
235
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
236
+ # # │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │
237
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
238
+ # # │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │
239
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
240
+ # # │ ... ┆ ... ┆ ... │
241
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
242
+ # # │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │
243
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
244
+ # # │ 1.5 ┆ inf ┆ (1.0, inf] │
245
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
246
+ # # │ 2.0 ┆ inf ┆ (1.0, inf] │
247
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
248
+ # # │ 2.5 ┆ inf ┆ (1.0, inf] │
249
+ # # └──────┴─────────────┴──────────────┘
250
+ # def cut(
251
+ # s,
252
+ # bins,
253
+ # labels: nil,
254
+ # break_point_label: "break_point",
255
+ # category_label: "category"
256
+ # )
257
+ # var_nm = s.name
258
+
259
+ # cuts_df = DataFrame.new(
260
+ # [
261
+ # Series.new(
262
+ # break_point_label, bins, dtype: :f64
263
+ # ).extend_constant(Float::INFINITY, 1)
264
+ # ]
265
+ # )
266
+
267
+ # if labels
268
+ # if labels.length != bins.length + 1
269
+ # raise ArgumentError, "expected more labels"
270
+ # end
271
+ # cuts_df = cuts_df.with_column(Series.new(category_label, labels))
272
+ # else
273
+ # cuts_df = cuts_df.with_column(
274
+ # Polars.format(
275
+ # "({}, {}]",
276
+ # Polars.col(break_point_label).shift_and_fill(1, -Float::INFINITY),
277
+ # Polars.col(break_point_label)
278
+ # ).alias(category_label)
279
+ # )
280
+ # end
281
+
282
+ # cuts_df = cuts_df.with_column(Polars.col(category_label).cast(:cat))
283
+
284
+ # s.cast(:f64)
285
+ # .sort
286
+ # .to_frame
287
+ # .join_asof(
288
+ # cuts_df,
289
+ # left_on: var_nm,
290
+ # right_on: break_point_label,
291
+ # strategy: "forward"
292
+ # )
293
+ # end
294
+
295
+ # Align a sequence of frames using the unique values from one or more columns as a key.
296
+ #
297
+ # Frames that do not contain the given key values have rows injected (with nulls
298
+ # filling the non-key columns), and each resulting frame is sorted by the key.
299
+ #
300
+ # The original column order of input frames is not changed unless ``select`` is
301
+ # specified (in which case the final column order is determined from that).
302
+ #
303
+ # Note that this does not result in a joined frame - you receive the same number
304
+ # of frames back that you passed in, but each is now aligned by key and has
305
+ # the same number of rows.
306
+ #
307
+ # @param frames [Array]
308
+ # Sequence of DataFrames or LazyFrames.
309
+ # @param on [Object]
310
+ # One or more columns whose unique values will be used to align the frames.
311
+ # @param select [Object]
312
+ # Optional post-alignment column select to constrain and/or order
313
+ # the columns returned from the newly aligned frames.
314
+ # @param reverse [Object]
315
+ # Sort the alignment column values in descending order; can be a single
316
+ # boolean or a list of booleans associated with each column in `on`.
317
+ #
318
+ # @return [Object]
319
+ #
320
+ # @example
321
+ # df1 = Polars::DataFrame.new(
322
+ # {
323
+ # "dt" => [Date.new(2022, 9, 1), Date.new(2022, 9, 2), Date.new(2022, 9, 3)],
324
+ # "x" => [3.5, 4.0, 1.0],
325
+ # "y" => [10.0, 2.5, 1.5]
326
+ # }
327
+ # )
328
+ # df2 = Polars::DataFrame.new(
329
+ # {
330
+ # "dt" => [Date.new(2022, 9, 2), Date.new(2022, 9, 3), Date.new(2022, 9, 1)],
331
+ # "x" => [8.0, 1.0, 3.5],
332
+ # "y" => [1.5, 12.0, 5.0]
333
+ # }
334
+ # )
335
+ # df3 = Polars::DataFrame.new(
336
+ # {
337
+ # "dt" => [Date.new(2022, 9, 3), Date.new(2022, 9, 2)],
338
+ # "x" => [2.0, 5.0],
339
+ # "y" => [2.5, 2.0]
340
+ # }
341
+ # )
342
+ # af1, af2, af3 = Polars.align_frames(
343
+ # df1, df2, df3, on: "dt", select: ["x", "y"]
344
+ # )
345
+ # (af1 * af2 * af3).fill_null(0).select(Polars.sum(Polars.col("*")).alias("dot"))
346
+ # # =>
347
+ # # shape: (3, 1)
348
+ # # ┌───────┐
349
+ # # │ dot │
350
+ # # │ --- │
351
+ # # │ f64 │
352
+ # # ╞═══════╡
353
+ # # │ 0.0 │
354
+ # # ├╌╌╌╌╌╌╌┤
355
+ # # │ 167.5 │
356
+ # # ├╌╌╌╌╌╌╌┤
357
+ # # │ 47.0 │
358
+ # # └───────┘
359
def align_frames(
  *frames,
  on:,
  select: nil,
  reverse: false
)
  return [] if frames.empty?

  if frames.map(&:class).uniq.size > 1
    raise TypeError, "Input frames must be of a consistent type (all LazyFrame or all DataFrame)"
  end

  # Remember whether the caller passed eager frames so results can be
  # collected again at the end.
  eager = frames.first.is_a?(DataFrame)

  # Build the superset of all "on" values across the inputs, de-duplicated
  # and sorted.
  key_frames = frames.map { |frame| frame.lazy.select(on) }
  alignment_frame =
    concat(key_frames)
      .unique(maintain_order: false)
      .sort(on, reverse: reverse)

  # Eager input: materialise the key frame once and go lazy again.
  # Lazy input: mark it for caching instead.
  alignment_frame =
    if eager
      alignment_frame.collect.lazy
    else
      alignment_frame.cache
    end

  # Left-join every input onto the key frame, then restore each input's
  # own column order.
  aligned_frames = frames.map do |frame|
    joined = alignment_frame.join(
      frame.lazy,
      on: alignment_frame.columns,
      how: "left"
    )
    joined.select(frame.columns)
  end

  unless select.nil?
    aligned_frames = aligned_frames.map { |frame| frame.select(select) }
  end

  return aligned_frames unless eager

  aligned_frames.map(&:collect)
end
396
+
397
+ # Return a new Series of given length and type, filled with ones.
398
+ #
399
+ # @param n [Integer]
400
+ # Number of elements in the `Series`
401
+ # @param dtype [Symbol]
402
+ # DataType of the elements, defaults to `:f64`
403
+ #
404
+ # @return [Series]
405
+ #
406
+ # @note
407
+ # In the lazy API you should probably not use this, but use `lit(1)`
408
+ # instead.
409
def ones(n, dtype: nil)
  # Start from a single-element Series holding 1.0, cast it only when a
  # dtype was requested, then repeat element 0 `n` times.
  unit = Series.new([1.0])
  typed = dtype ? unit.cast(dtype) : unit
  typed.new_from_index(0, n)
end
416
+
417
+ # Return a new Series of given length and type, filled with zeros.
418
+ #
419
+ # @param n [Integer]
420
+ # Number of elements in the `Series`
421
+ # @param dtype [Symbol]
422
+ # DataType of the elements, defaults to `:f64`
423
+ #
424
+ # @return [Series]
425
+ #
426
+ # @note
427
+ # In the lazy API you should probably not use this, but use `lit(0)`
428
+ # instead.
429
def zeros(n, dtype: nil)
  # Start from a single-element Series holding 0.0, cast it only when a
  # dtype was requested, then repeat element 0 `n` times.
  unit = Series.new([0.0])
  typed = dtype ? unit.cast(dtype) : unit
  typed.new_from_index(0, n)
end
436
+
437
+ private
438
+
439
def _ensure_datetime(value)
  # A DateTime is passed through untouched and reported as "not a date".
  return [value, false] if value.is_a?(::DateTime)

  # Anything else is rebuilt from its year/month/day only (so it lands on
  # midnight) and is flagged as date-typed.
  [::DateTime.new(value.year, value.month, value.day), true]
end
447
+
448
# Extract the trailing unit of a duration string, e.g. "1d12h" => "h",
# "1mo" => "mo", "3d" => "d".
#
# Only the last two characters are inspected, and any leading digits among
# them are dropped. Intervals shorter than two characters are used whole.
#
# @param interval [String]
#   Whitespace-free duration string.
#
# @return [String]
def _interval_granularity(interval)
  # `[-2..-1]` is nil for strings shorter than two characters.
  tail = interval[-2..-1] || interval
  tail.sub(/\A\d+/, "")
end
452
+ end
453
+ end