polars-df 0.2.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +33 -0
  4. data/Cargo.lock +2230 -0
  5. data/Cargo.toml +10 -0
  6. data/LICENSE-THIRD-PARTY.txt +38828 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +91 -0
  9. data/lib/polars/3.0/polars.so +0 -0
  10. data/lib/polars/3.1/polars.so +0 -0
  11. data/lib/polars/3.2/polars.so +0 -0
  12. data/lib/polars/batched_csv_reader.rb +96 -0
  13. data/lib/polars/cat_expr.rb +52 -0
  14. data/lib/polars/cat_name_space.rb +54 -0
  15. data/lib/polars/convert.rb +100 -0
  16. data/lib/polars/data_frame.rb +4833 -0
  17. data/lib/polars/data_types.rb +122 -0
  18. data/lib/polars/date_time_expr.rb +1418 -0
  19. data/lib/polars/date_time_name_space.rb +1484 -0
  20. data/lib/polars/dynamic_group_by.rb +52 -0
  21. data/lib/polars/exceptions.rb +20 -0
  22. data/lib/polars/expr.rb +5307 -0
  23. data/lib/polars/expr_dispatch.rb +22 -0
  24. data/lib/polars/functions.rb +453 -0
  25. data/lib/polars/group_by.rb +558 -0
  26. data/lib/polars/io.rb +814 -0
  27. data/lib/polars/lazy_frame.rb +2442 -0
  28. data/lib/polars/lazy_functions.rb +1195 -0
  29. data/lib/polars/lazy_group_by.rb +93 -0
  30. data/lib/polars/list_expr.rb +610 -0
  31. data/lib/polars/list_name_space.rb +346 -0
  32. data/lib/polars/meta_expr.rb +54 -0
  33. data/lib/polars/rolling_group_by.rb +35 -0
  34. data/lib/polars/series.rb +3730 -0
  35. data/lib/polars/slice.rb +104 -0
  36. data/lib/polars/string_expr.rb +972 -0
  37. data/lib/polars/string_name_space.rb +690 -0
  38. data/lib/polars/struct_expr.rb +100 -0
  39. data/lib/polars/struct_name_space.rb +64 -0
  40. data/lib/polars/utils.rb +192 -0
  41. data/lib/polars/version.rb +4 -0
  42. data/lib/polars/when.rb +16 -0
  43. data/lib/polars/when_then.rb +19 -0
  44. data/lib/polars-df.rb +1 -0
  45. data/lib/polars.rb +50 -0
  46. metadata +89 -0
@@ -0,0 +1,22 @@
1
module Polars
  # @private
  #
  # Mixin that evaluates a method call on a wrapped series by building the
  # equivalent column expression and running it against a one-column frame.
  module ExprDispatch
    # Include hook: gives the including class an `_s` instance accessor and a
    # class-level `_accessor` accessor (the expression namespace to enter, if
    # any).
    #
    # Defined above the `private` marker on purpose: a bare `private` has no
    # effect on `def self.` methods, so placing the hook below it only
    # suggested a visibility it never had.
    def self.included(base)
      base.attr_accessor :_s
      base.singleton_class.attr_accessor :_accessor
    end

    private

    # Handles only method names that the including class itself defines;
    # anything else is passed to the default `method_missing`.
    # NOTE(review): presumably reached through stub methods that just call
    # `super` - confirm against the including classes.
    def method_missing(method, ...)
      return super unless self.class.method_defined?(method)

      namespace = self.class._accessor

      s = Utils.wrap_s(_s)
      # Start from a column expression named after the series, optionally
      # stepping into the configured namespace before calling the method.
      expr = Utils.col(s.name)
      expr = expr.send(namespace) if namespace
      # Evaluate the expression on a frame built from the series and return
      # the resulting column as a series.
      s.to_frame.select(expr.send(method, ...)).to_series
    end
  end
end
@@ -0,0 +1,453 @@
1
+ module Polars
2
+ module Functions
3
+ # Convert categorical variables into dummy/indicator variables.
4
+ #
5
+ # @param df [DataFrame]
6
+ # DataFrame to convert.
7
+ # @param columns [Array, nil]
8
+ # A subset of columns to convert to dummy variables. `nil` means
9
+ # "all columns".
10
+ #
11
+ # @return [DataFrame]
12
def get_dummies(df, columns: nil)
  # Thin module-level entry point: the frame itself does the conversion,
  # receiving the column subset (or nil) unchanged.
  dummy_options = {columns: columns}
  df.to_dummies(**dummy_options)
end
15
+
16
+ # Aggregate multiple DataFrames/Series to a single DataFrame/Series.
17
+ #
18
+ # @param items [Object]
19
+ # DataFrames/Series/LazyFrames to concatenate.
20
+ # @param rechunk [Boolean]
21
+ # Make sure that all data is in contiguous memory.
22
+ # @param how ["vertical", "diagonal", "horizontal"]
23
+ # Lazy only supports the 'vertical' strategy.
24
+ #
25
+ # - Vertical: applies multiple `vstack` operations.
26
+ # - Diagonal: finds a union between the column schemas and fills missing column values with null.
27
+ # - Horizontal: stacks Series horizontally and fills with nulls if the lengths don't match.
28
+ # @param parallel [Boolean]
29
+ # Only relevant for LazyFrames. This determines if the concatenated
30
+ # lazy computations may be executed in parallel.
31
+ #
32
+ # @return [Object]
33
+ #
34
+ # @example
35
+ # df1 = Polars::DataFrame.new({"a" => [1], "b" => [3]})
36
+ # df2 = Polars::DataFrame.new({"a" => [2], "b" => [4]})
37
+ # Polars.concat([df1, df2])
38
+ # # =>
39
+ # # shape: (2, 2)
40
+ # # ┌─────┬─────┐
41
+ # # │ a ┆ b │
42
+ # # │ --- ┆ --- │
43
+ # # │ i64 ┆ i64 │
44
+ # # ╞═════╪═════╡
45
+ # # │ 1 ┆ 3 │
46
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
47
+ # # │ 2 ┆ 4 │
48
+ # # └─────┴─────┘
49
def concat(items, rechunk: true, how: "vertical", parallel: true)
  # Raises ArgumentError for an empty list, an unknown `how`, a non-vertical
  # strategy on lazy frames, or an unsupported element type.
  raise ArgumentError, "cannot concat empty list" if items.empty?

  # The type of the first element decides which concatenation path is used.
  first = items[0]
  out =
    case first
    when DataFrame
      if how == "vertical"
        Utils.wrap_df(_concat_df(items))
      elsif how == "diagonal"
        Utils.wrap_df(_diag_concat_df(items))
      elsif how == "horizontal"
        Utils.wrap_df(_hor_concat_df(items))
      else
        # Single braces: the doubled braces were a Python f-string escape
        # carried over by mistake and showed up literally in the message.
        raise ArgumentError, "how must be one of {'vertical', 'diagonal', 'horizontal'}, got #{how}"
      end
    when LazyFrame
      unless how == "vertical"
        raise ArgumentError, "Lazy only allows 'vertical' concat strategy."
      end

      # TODO
      # The lazy path receives `rechunk` directly, so it returns here and
      # skips the eager rechunk step at the bottom.
      return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel))
    when Series
      # TODO
      Utils.wrap_s(_concat_series(items))
    when Expr
      # Fold the remaining expressions onto the first one via `append`.
      items[1..-1].reduce(first) { |acc, e| acc.append(e) }
    else
      raise ArgumentError, "did not expect type: #{first.class.name} in 'Polars.concat'."
    end

  rechunk ? out.rechunk : out
end
90
+
91
+ # Create a range of type `Datetime` (or `Date`).
92
+ #
93
+ # @param low [Object]
94
+ # Lower bound of the date range.
95
+ # @param high [Object]
96
+ # Upper bound of the date range.
97
+ # @param interval [Object]
98
+ # Interval periods. It can be a polars duration string, such as `3d12h4m25s`
99
+ # representing 3 days, 12 hours, 4 minutes, and 25 seconds.
100
+ # @param lazy [Boolean]
101
+ # Return an expression.
102
+ # @param closed ["both", "left", "right", "none"]
103
+ # Define whether the temporal window interval is closed or not.
104
+ # @param name [String]
105
+ # Name of the output Series.
106
+ # @param time_unit [nil, "ns", "us", "ms"]
107
+ # Set the time unit.
108
+ # @param time_zone [String]
109
+ # Optional timezone
110
+ #
111
+ # @return [Object]
112
+ #
113
+ # @note
114
+ # If both `low` and `high` are passed as date types (not datetime), and the
115
+ # interval granularity is no finer than 1d, the returned range is also of
116
+ # type date. All other permutations return a datetime Series.
117
+ #
118
+ # @example Using polars duration string to specify the interval
119
+ # Polars.date_range(Date.new(2022, 1, 1), Date.new(2022, 3, 1), "1mo", name: "drange")
120
+ # # =>
121
+ # # shape: (3,)
122
+ # # Series: 'drange' [date]
123
+ # # [
124
+ # # 2022-01-01
125
+ # # 2022-02-01
126
+ # # 2022-03-01
127
+ # # ]
128
+ #
129
+ # @example Using a compound duration string to specify the interval:
130
+ # Polars.date_range(
131
+ # DateTime.new(1985, 1, 1),
132
+ # DateTime.new(1985, 1, 10),
133
+ # "1d12h",
134
+ # time_unit: "ms"
135
+ # )
136
+ # # =>
137
+ # # shape: (7,)
138
+ # # Series: '' [datetime[ms]]
139
+ # # [
140
+ # # 1985-01-01 00:00:00
141
+ # # 1985-01-02 12:00:00
142
+ # # 1985-01-04 00:00:00
143
+ # # 1985-01-05 12:00:00
144
+ # # 1985-01-07 00:00:00
145
+ # # 1985-01-08 12:00:00
146
+ # # 1985-01-10 00:00:00
147
+ # # ]
148
def date_range(
  low,
  high,
  interval,
  lazy: false,
  closed: "both",
  name: nil,
  time_unit: nil,
  time_zone: nil
)
  # ActiveSupport durations are recognised but not handled yet.
  raise Todo if defined?(ActiveSupport::Duration) && interval.is_a?(ActiveSupport::Duration)

  # Work with the interval as a duration string without embedded spaces.
  interval = interval.to_s.delete(" ")

  # Lazy mode (explicit, or implied by an expression bound) returns an
  # expression instead of a Series.
  if lazy || low.is_a?(Expr) || high.is_a?(Expr)
    low_expr = Utils.expr_to_lit_or_expr(low, str_to_lit: true)
    high_expr = Utils.expr_to_lit_or_expr(high, str_to_lit: true)
    return Utils.wrap_expr(
      _rb_date_range_lazy(low_expr, high_expr, interval, closed, name, time_zone)
    )
  end

  low, low_is_date = _ensure_datetime(low)
  high, high_is_date = _ensure_datetime(high)

  # An explicit time unit wins; otherwise use "ns" only when the interval
  # string mentions nanoseconds, and "us" in every other case.
  tu =
    if time_unit.nil?
      interval.include?("ns") ? "ns" : "us"
    else
      time_unit
    end

  first_ts = Utils._datetime_to_pl_timestamp(low, tu)
  last_ts = Utils._datetime_to_pl_timestamp(high, tu)
  name = "" if name.nil?

  dt_range = Utils.wrap_s(
    _rb_date_range(first_ts, last_ts, interval, closed, name, tu, time_zone)
  )

  # Only cast to Date when both bounds were flagged as dates and the interval
  # does not end in an "h", "m" or "s" suffix.
  date_only = low_is_date && high_is_date && !_interval_granularity(interval).end_with?("h", "m", "s")
  date_only ? dt_range.cast(Date) : dt_range
end
201
+
202
+ # Bin values into discrete values.
203
+ #
204
+ # @param s [Series]
205
+ # Series to bin.
206
+ # @param bins [Array]
207
+ # Bins to create.
208
+ # @param labels [Array]
209
+ # Labels to assign to the bins. If given the length of labels must be
210
+ # `bins.length + 1`.
211
+ # @param break_point_label [String]
212
+ # Name given to the breakpoint column.
213
+ # @param category_label [String]
214
+ # Name given to the category column.
215
+ #
216
+ # @return [DataFrame]
217
+ #
218
+ # @note
219
+ # This functionality is experimental and may change without it being considered a
220
+ # breaking change.
221
+ #
222
+ # @example
223
+ # a = Polars::Series.new("a", 13.times.map { |i| (-30 + i * 5) / 10.0 })
224
+ # Polars.cut(a, [-1, 1])
225
+ # # =>
226
+ # # shape: (12, 3)
227
+ # # ┌──────┬─────────────┬──────────────┐
228
+ # # │ a ┆ break_point ┆ category │
229
+ # # │ --- ┆ --- ┆ --- │
230
+ # # │ f64 ┆ f64 ┆ cat │
231
+ # # ╞══════╪═════════════╪══════════════╡
232
+ # # │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │
233
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
234
+ # # │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │
235
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
236
+ # # │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │
237
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
238
+ # # │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │
239
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
240
+ # # │ ... ┆ ... ┆ ... │
241
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
242
+ # # │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │
243
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
244
+ # # │ 1.5 ┆ inf ┆ (1.0, inf] │
245
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
246
+ # # │ 2.0 ┆ inf ┆ (1.0, inf] │
247
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
248
+ # # │ 2.5 ┆ inf ┆ (1.0, inf] │
249
+ # # └──────┴─────────────┴──────────────┘
250
+ # def cut(
251
+ # s,
252
+ # bins,
253
+ # labels: nil,
254
+ # break_point_label: "break_point",
255
+ # category_label: "category"
256
+ # )
257
+ # var_nm = s.name
258
+
259
+ # cuts_df = DataFrame.new(
260
+ # [
261
+ # Series.new(
262
+ # break_point_label, bins, dtype: :f64
263
+ # ).extend_constant(Float::INFINITY, 1)
264
+ # ]
265
+ # )
266
+
267
+ # if labels
268
+ # if labels.length != bins.length + 1
269
+ # raise ArgumentError, "expected more labels"
270
+ # end
271
+ # cuts_df = cuts_df.with_column(Series.new(category_label, labels))
272
+ # else
273
+ # cuts_df = cuts_df.with_column(
274
+ # Polars.format(
275
+ # "({}, {}]",
276
+ # Polars.col(break_point_label).shift_and_fill(1, -Float::INFINITY),
277
+ # Polars.col(break_point_label)
278
+ # ).alias(category_label)
279
+ # )
280
+ # end
281
+
282
+ # cuts_df = cuts_df.with_column(Polars.col(category_label).cast(:cat))
283
+
284
+ # s.cast(:f64)
285
+ # .sort
286
+ # .to_frame
287
+ # .join_asof(
288
+ # cuts_df,
289
+ # left_on: var_nm,
290
+ # right_on: break_point_label,
291
+ # strategy: "forward"
292
+ # )
293
+ # end
294
+
295
+ # Align a sequence of frames using the unique values from one or more columns as a key.
296
+ #
297
+ # Frames that do not contain the given key values have rows injected (with nulls
298
+ # filling the non-key columns), and each resulting frame is sorted by the key.
299
+ #
300
+ # The original column order of input frames is not changed unless `select` is
301
+ # specified (in which case the final column order is determined from that).
302
+ #
303
+ # Note that this does not result in a joined frame - you receive the same number
304
+ # of frames back that you passed in, but each is now aligned by key and has
305
+ # the same number of rows.
306
+ #
307
+ # @param frames [Array]
308
+ # Sequence of DataFrames or LazyFrames.
309
+ # @param on [Object]
310
+ # One or more columns whose unique values will be used to align the frames.
311
+ # @param select [Object]
312
+ # Optional post-alignment column select to constrain and/or order
313
+ # the columns returned from the newly aligned frames.
314
+ # @param reverse [Object]
315
+ # Sort the alignment column values in descending order; can be a single
316
+ # boolean or a list of booleans associated with each column in `on`.
317
+ #
318
+ # @return [Object]
319
+ #
320
+ # @example
321
+ # df1 = Polars::DataFrame.new(
322
+ # {
323
+ # "dt" => [Date.new(2022, 9, 1), Date.new(2022, 9, 2), Date.new(2022, 9, 3)],
324
+ # "x" => [3.5, 4.0, 1.0],
325
+ # "y" => [10.0, 2.5, 1.5]
326
+ # }
327
+ # )
328
+ # df2 = Polars::DataFrame.new(
329
+ # {
330
+ # "dt" => [Date.new(2022, 9, 2), Date.new(2022, 9, 3), Date.new(2022, 9, 1)],
331
+ # "x" => [8.0, 1.0, 3.5],
332
+ # "y" => [1.5, 12.0, 5.0]
333
+ # }
334
+ # )
335
+ # df3 = Polars::DataFrame.new(
336
+ # {
337
+ # "dt" => [Date.new(2022, 9, 3), Date.new(2022, 9, 2)],
338
+ # "x" => [2.0, 5.0],
339
+ # "y" => [2.5, 2.0]
340
+ # }
341
+ # )
342
+ # af1, af2, af3 = Polars.align_frames(
343
+ # df1, df2, df3, on: "dt", select: ["x", "y"]
344
+ # )
345
+ # (af1 * af2 * af3).fill_null(0).select(Polars.sum(Polars.col("*")).alias("dot"))
346
+ # # =>
347
+ # # shape: (3, 1)
348
+ # # ┌───────┐
349
+ # # │ dot │
350
+ # # │ --- │
351
+ # # │ f64 │
352
+ # # ╞═══════╡
353
+ # # │ 0.0 │
354
+ # # ├╌╌╌╌╌╌╌┤
355
+ # # │ 167.5 │
356
+ # # ├╌╌╌╌╌╌╌┤
357
+ # # │ 47.0 │
358
+ # # └───────┘
359
def align_frames(
  *frames,
  on:,
  select: nil,
  reverse: false
)
  # Nothing to align.
  return [] if frames.empty?

  # Mixing frame classes is rejected up front.
  unless frames.map(&:class).uniq.length == 1
    raise TypeError, "Input frames must be of a consistent type (all LazyFrame or all DataFrame)"
  end

  eager = frames[0].is_a?(DataFrame)

  # Collect the key columns of every frame, de-duplicate and sort them; this
  # is the frame all inputs are joined against.
  key_frames = frames.map { |frame| frame.lazy.select(on) }
  alignment_frame = concat(key_frames).unique(maintain_order: false).sort(on, reverse: reverse)
  # Eager input: collect once and re-wrap lazily; lazy input: cache instead.
  alignment_frame =
    if eager
      alignment_frame.collect.lazy
    else
      alignment_frame.cache
    end

  # Left-join each input onto the alignment frame, then restore that input's
  # own column order.
  aligned_frames = frames.map do |frame|
    joined = alignment_frame.join(
      frame.lazy,
      on: alignment_frame.columns,
      how: "left"
    )
    joined.select(frame.columns)
  end

  # Optional post-alignment column selection.
  unless select.nil?
    aligned_frames = aligned_frames.map { |frame| frame.select(select) }
  end

  return aligned_frames unless eager

  aligned_frames.map(&:collect)
end
396
+
397
+ # Return a new Series of given length and type, filled with ones.
398
+ #
399
+ # @param n [Integer]
400
+ # Number of elements in the `Series`
401
+ # @param dtype [Symbol]
402
+ # DataType of the elements, defaults to `:f64`
403
+ #
404
+ # @return [Series]
405
+ #
406
+ # @note
407
+ # In the lazy API you should probably not use this, but use `lit(1)`
408
+ # instead.
409
def ones(n, dtype: nil)
  # Build a single-element float Series holding 1.0, cast it only when a
  # dtype was requested, then repeat element 0 `n` times.
  unit = Series.new([1.0])
  unit = unit.cast(dtype) if dtype
  unit.new_from_index(0, n)
end
416
+
417
+ # Return a new Series of given length and type, filled with zeros.
418
+ #
419
+ # @param n [Integer]
420
+ # Number of elements in the `Series`
421
+ # @param dtype [Symbol]
422
+ # DataType of the elements, defaults to `:f64`
423
+ #
424
+ # @return [Series]
425
+ #
426
+ # @note
427
+ # In the lazy API you should probably not use this, but use `lit(0)`
428
+ # instead.
429
def zeros(n, dtype: nil)
  # Build a single-element float Series holding 0.0, cast it only when a
  # dtype was requested, then repeat element 0 `n` times.
  unit = Series.new([0.0])
  unit = unit.cast(dtype) if dtype
  unit.new_from_index(0, n)
end
436
+
437
+ private
438
+
439
# Normalise a range bound to a `::DateTime`.
#
# Returns a two-element array `[datetime, is_date_type]`, where
# `is_date_type` is true only when the input carried no time-of-day and was
# rebuilt from its year/month/day.
def _ensure_datetime(value)
  # Already the target type: pass through untouched.
  return [value, false] if value.is_a?(::DateTime)

  # A Time has a time-of-day component. Rebuilding it from year/month/day
  # would silently truncate it to midnight and mislabel it as a date, so
  # convert it losslessly instead.
  return [value.to_datetime, false] if value.is_a?(::Time)

  # Anything else is treated as a calendar date at midnight.
  [::DateTime.new(value.year, value.month, value.day), true]
end
447
+
448
+ # TODO
449
def _interval_granularity(interval)
  # Placeholder (see the TODO on this method): the interval string is
  # currently handed back unchanged, so callers test its suffix directly.
  interval
end
452
+ end
453
+ end