polars-df 0.7.0-arm64-darwin → 0.9.0-arm64-darwin

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +41 -0
  3. data/Cargo.lock +353 -237
  4. data/Cargo.toml +0 -3
  5. data/LICENSE-THIRD-PARTY.txt +4014 -3495
  6. data/LICENSE.txt +1 -1
  7. data/README.md +2 -2
  8. data/lib/polars/3.1/polars.bundle +0 -0
  9. data/lib/polars/3.2/polars.bundle +0 -0
  10. data/lib/polars/{3.0 → 3.3}/polars.bundle +0 -0
  11. data/lib/polars/array_expr.rb +449 -0
  12. data/lib/polars/array_name_space.rb +346 -0
  13. data/lib/polars/cat_expr.rb +24 -0
  14. data/lib/polars/cat_name_space.rb +75 -0
  15. data/lib/polars/config.rb +2 -2
  16. data/lib/polars/data_frame.rb +248 -108
  17. data/lib/polars/data_types.rb +195 -29
  18. data/lib/polars/date_time_expr.rb +41 -24
  19. data/lib/polars/date_time_name_space.rb +12 -12
  20. data/lib/polars/exceptions.rb +12 -1
  21. data/lib/polars/expr.rb +1080 -195
  22. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  23. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  24. data/lib/polars/functions/as_datatype.rb +248 -0
  25. data/lib/polars/functions/col.rb +47 -0
  26. data/lib/polars/functions/eager.rb +182 -0
  27. data/lib/polars/functions/lazy.rb +1280 -0
  28. data/lib/polars/functions/len.rb +49 -0
  29. data/lib/polars/functions/lit.rb +35 -0
  30. data/lib/polars/functions/random.rb +16 -0
  31. data/lib/polars/functions/range/date_range.rb +103 -0
  32. data/lib/polars/functions/range/int_range.rb +51 -0
  33. data/lib/polars/functions/repeat.rb +144 -0
  34. data/lib/polars/functions/whenthen.rb +27 -0
  35. data/lib/polars/functions.rb +29 -416
  36. data/lib/polars/group_by.rb +3 -3
  37. data/lib/polars/io.rb +21 -28
  38. data/lib/polars/lazy_frame.rb +390 -76
  39. data/lib/polars/list_expr.rb +152 -6
  40. data/lib/polars/list_name_space.rb +102 -0
  41. data/lib/polars/meta_expr.rb +175 -7
  42. data/lib/polars/series.rb +557 -59
  43. data/lib/polars/sql_context.rb +1 -1
  44. data/lib/polars/string_cache.rb +75 -0
  45. data/lib/polars/string_expr.rb +412 -96
  46. data/lib/polars/string_name_space.rb +4 -4
  47. data/lib/polars/struct_expr.rb +1 -1
  48. data/lib/polars/struct_name_space.rb +1 -1
  49. data/lib/polars/testing.rb +507 -0
  50. data/lib/polars/utils.rb +64 -20
  51. data/lib/polars/version.rb +1 -1
  52. data/lib/polars.rb +15 -2
  53. metadata +36 -7
  54. data/lib/polars/lazy_functions.rb +0 -1197
@@ -0,0 +1,248 @@
1
+ module Polars
2
+ module Functions
3
+ # Create polars `Duration` from distinct time components.
4
+ #
5
+ # @return [Expr]
6
+ #
7
+ # @example
8
+ # df = Polars::DataFrame.new(
9
+ # {
10
+ # "datetime" => [DateTime.new(2022, 1, 1), DateTime.new(2022, 1, 2)],
11
+ # "add" => [1, 2]
12
+ # }
13
+ # )
14
+ # df.select(
15
+ # [
16
+ # (Polars.col("datetime") + Polars.duration(weeks: "add")).alias("add_weeks"),
17
+ # (Polars.col("datetime") + Polars.duration(days: "add")).alias("add_days"),
18
+ # (Polars.col("datetime") + Polars.duration(seconds: "add")).alias("add_seconds"),
19
+ # (Polars.col("datetime") + Polars.duration(milliseconds: "add")).alias(
20
+ # "add_milliseconds"
21
+ # ),
22
+ # (Polars.col("datetime") + Polars.duration(hours: "add")).alias("add_hours")
23
+ # ]
24
+ # )
25
+ # # =>
26
+ # # shape: (2, 5)
27
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────────┬─────────────────────┐
28
+ # # │ add_weeks ┆ add_days ┆ add_seconds ┆ add_milliseconds ┆ add_hours │
29
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
30
+ # # │ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] │
31
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╪═════════════════════════╪═════════════════════╡
32
+ # # │ 2022-01-08 00:00:00 ┆ 2022-01-02 00:00:00 ┆ 2022-01-01 00:00:01 ┆ 2022-01-01 00:00:00.001 ┆ 2022-01-01 01:00:00 │
33
+ # # │ 2022-01-16 00:00:00 ┆ 2022-01-04 00:00:00 ┆ 2022-01-02 00:00:02 ┆ 2022-01-02 00:00:00.002 ┆ 2022-01-02 02:00:00 │
34
+ # # └─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────────┴─────────────────────┘
35
# Build a `Duration` expression from individual time components.
#
# Every component that is not nil is converted with
# `Utils.expr_to_lit_or_expr(..., str_to_lit: false)` before being passed,
# in declaration order, to the native `Plr.duration` call; nil components
# are passed through as nil.
#
# @return [Expr]
def duration(
  weeks: nil,
  days: nil,
  hours: nil,
  minutes: nil,
  seconds: nil,
  milliseconds: nil,
  microseconds: nil,
  nanoseconds: nil,
  time_unit: "us"
)
  to_rbexpr = lambda do |component|
    component.nil? ? nil : Utils.expr_to_lit_or_expr(component, str_to_lit: false)._rbexpr
  end

  # Order matters: it mirrors the positional signature of Plr.duration.
  parts = [
    weeks, days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds
  ].map(&to_rbexpr)

  Utils.wrap_expr(Plr.duration(*parts, time_unit))
end
85
+
86
+ # Concat the arrays in a Series dtype List in linear time.
87
+ #
88
+ # @return [Expr]
89
# Convert the selection to native expressions and wrap the result of the
# native `concat_list` call.
#
# @return [Expr]
def concat_list(exprs)
  rbexprs = Utils.selection_to_rbexpr_list(exprs)
  list_expr = Plr.concat_list(rbexprs)
  Utils.wrap_expr(list_expr)
end
93
+
94
+ # Collect several columns into a Series of dtype Struct.
95
+ #
96
+ # @param exprs [Object]
97
+ # Columns/Expressions to collect into a Struct
98
+ # @param eager [Boolean]
99
+ # Evaluate immediately
100
+ #
101
+ # @return [Object]
102
+ #
103
+ # @example
104
+ # Polars::DataFrame.new(
105
+ # {
106
+ # "int" => [1, 2],
107
+ # "str" => ["a", "b"],
108
+ # "bool" => [true, nil],
109
+ # "list" => [[1, 2], [3]],
110
+ # }
111
+ # ).select([Polars.struct(Polars.all).alias("my_struct")])
112
+ # # =>
113
+ # # shape: (2, 1)
114
+ # # ┌─────────────────────┐
115
+ # # │ my_struct │
116
+ # # │ --- │
117
+ # # │ struct[4] │
118
+ # # ╞═════════════════════╡
119
+ # # │ {1,"a",true,[1, 2]} │
120
+ # # │ {2,"b",null,[3]} │
121
+ # # └─────────────────────┘
122
+ #
123
+ # @example Only collect specific columns as a struct:
124
+ # df = Polars::DataFrame.new(
125
+ # {"a" => [1, 2, 3, 4], "b" => ["one", "two", "three", "four"], "c" => [9, 8, 7, 6]}
126
+ # )
127
+ # df.with_column(Polars.struct(Polars.col(["a", "b"])).alias("a_and_b"))
128
+ # # =>
129
+ # # shape: (4, 4)
130
+ # # ┌─────┬───────┬─────┬─────────────┐
131
+ # # │ a ┆ b ┆ c ┆ a_and_b │
132
+ # # │ --- ┆ --- ┆ --- ┆ --- │
133
+ # # │ i64 ┆ str ┆ i64 ┆ struct[2] │
134
+ # # ╞═════╪═══════╪═════╪═════════════╡
135
+ # # │ 1 ┆ one ┆ 9 ┆ {1,"one"} │
136
+ # # │ 2 ┆ two ┆ 8 ┆ {2,"two"} │
137
+ # # │ 3 ┆ three ┆ 7 ┆ {3,"three"} │
138
+ # # │ 4 ┆ four ┆ 6 ┆ {4,"four"} │
139
+ # # └─────┴───────┴─────┴─────────────┘
140
# Collect several columns/expressions into a Struct.
#
# @param exprs [Object]
#   Columns/expressions to collect into a Struct.
# @param eager [Boolean]
#   When true, evaluate immediately via `Polars.select(...).to_series`
#   and return that result instead of an expression.
#
# @return [Object]
#   The wrapped struct expression, or the evaluated result when `eager` is true.
def struct(exprs, eager: false)
  rbexprs = Utils.selection_to_rbexpr_list(exprs)
  expr = Utils.wrap_expr(Plr.as_struct(rbexprs))

  # The eager result must be the method's return value; previously it was
  # computed and then discarded, so callers always received an Expr.
  eager ? Polars.select(expr).to_series : expr
end
147
+
148
+ # Horizontally concat Utf8 Series in linear time. Non-Utf8 columns are cast to Utf8.
149
+ #
150
+ # @param exprs [Object]
151
+ # Columns to concat into a Utf8 Series.
152
+ # @param sep [String]
153
+ # String value that will be used to separate the values.
154
+ # @param ignore_nulls [Boolean]
155
+ # Whether to ignore null values (default: `false`).
156
+ #
157
+ # @return [Expr]
158
+ #
159
+ # @example
160
+ # df = Polars::DataFrame.new(
161
+ # {
162
+ # "a" => [1, 2, 3],
163
+ # "b" => ["dogs", "cats", nil],
164
+ # "c" => ["play", "swim", "walk"]
165
+ # }
166
+ # )
167
+ # df.with_columns(
168
+ # [
169
+ # Polars.concat_str(
170
+ # [
171
+ # Polars.col("a") * 2,
172
+ # Polars.col("b"),
173
+ # Polars.col("c")
174
+ # ],
175
+ # sep: " "
176
+ # ).alias("full_sentence")
177
+ # ]
178
+ # )
179
+ # # =>
180
+ # # shape: (3, 4)
181
+ # # ┌─────┬──────┬──────┬───────────────┐
182
+ # # │ a ┆ b ┆ c ┆ full_sentence │
183
+ # # │ --- ┆ --- ┆ --- ┆ --- │
184
+ # # │ i64 ┆ str ┆ str ┆ str │
185
+ # # ╞═════╪══════╪══════╪═══════════════╡
186
+ # # │ 1 ┆ dogs ┆ play ┆ 2 dogs play │
187
+ # # │ 2 ┆ cats ┆ swim ┆ 4 cats swim │
188
+ # # │ 3 ┆ null ┆ walk ┆ null │
189
+ # # └─────┴──────┴──────┴───────────────┘
190
# Convert the selection to native expressions and wrap the result of the
# native `concat_str` call, forwarding `sep` and `ignore_nulls` unchanged.
#
# @return [Expr]
def concat_str(exprs, sep: "", ignore_nulls: false)
  rbexprs = Utils.selection_to_rbexpr_list(exprs)
  joined = Plr.concat_str(rbexprs, sep, ignore_nulls)
  Utils.wrap_expr(joined)
end
194
+
195
+ # Format expressions as a string.
196
+ #
197
+ # @param fstring [String]
198
+ # A string with placeholders.
199
+ # For example: "hello_{}" or "{}_world"
200
+ # @param args [Object]
201
+ # Expression(s) that fill the placeholders
202
+ #
203
+ # @return [Expr]
204
+ #
205
+ # @example
206
+ # df = Polars::DataFrame.new(
207
+ # {
208
+ # "a": ["a", "b", "c"],
209
+ # "b": [1, 2, 3]
210
+ # }
211
+ # )
212
+ # df.select(
213
+ # [
214
+ # Polars.format("foo_{}_bar_{}", Polars.col("a"), "b").alias("fmt")
215
+ # ]
216
+ # )
217
+ # # =>
218
+ # # shape: (3, 1)
219
+ # # ┌─────────────┐
220
+ # # │ fmt │
221
+ # # │ --- │
222
+ # # │ str │
223
+ # # ╞═════════════╡
224
+ # # │ foo_a_bar_1 │
225
+ # # │ foo_b_bar_2 │
226
+ # # │ foo_c_bar_3 │
227
+ # # └─────────────┘
228
# Build a string expression from a template containing `{}` placeholders.
#
# The template is split around each `{}`; placeholders are replaced, in
# order, by the corresponding argument and the non-empty literal pieces in
# between become literals. All pieces are then joined with `concat_str`.
#
# @raise [ArgumentError] if the number of `{}` placeholders differs from
#   the number of arguments
#
# @return [Expr]
def format(fstring, *args)
  unless fstring.scan("{}").length == args.length
    raise ArgumentError, "number of placeholders should equal the number of arguments"
  end

  # Work on a copy so the caller's argument list is consumed left to right
  # without being mutated.
  remaining = args.dup

  pieces = fstring.split(/(\{\})/).each_with_object([]) do |token, acc|
    if token == "{}"
      acc << Utils.expr_to_lit_or_expr(remaining.shift, str_to_lit: false)
    elsif token.length > 0
      acc << lit(token)
    end
  end

  concat_str(pieces, sep: "")
end
247
+ end
248
+ end
@@ -0,0 +1,47 @@
1
+ module Polars
2
+ module Functions
3
+ # Return an expression representing a column in a DataFrame.
4
+ #
5
+ # @return [Expr]
6
# Return an expression selecting one or more columns.
#
# Accepts a string-like name, a Polars data type, several of either as
# separate arguments, or a single Array of them. For Arrays, only the
# first element is inspected to decide between names and data types.
#
# @raise [TypeError] if the input is neither string-like, a data type, nor
#   an Array whose first element is one of those
#
# @return [Expr]
def col(name, *more_names)
  if more_names.any?
    combined = [name, *more_names]
    return Utils.wrap_expr(Plr.cols(combined.map(&:to_s))) if Utils.strlike?(name)
    return Utils.wrap_expr(Plr.dtype_cols(combined)) if Utils.is_polars_dtype(name)

    raise TypeError, "invalid input for `col`\n\nExpected `str` or `DataType`, got #{name.class.name}."
  end

  return Utils.wrap_expr(Plr.col(name.to_s)) if Utils.strlike?(name)
  return Utils.wrap_expr(Plr.dtype_cols([name])) if Utils.is_polars_dtype(name)

  unless name.is_a?(::Array)
    raise TypeError, "invalid input for `col`\n\nExpected `str` or `DataType`, got #{name.class.name}."
  end

  # An empty Array is passed straight through as an empty column selection.
  return Utils.wrap_expr(Plr.cols(name)) if name.empty?

  head = name[0]
  if Utils.strlike?(head)
    Utils.wrap_expr(Plr.cols(name.map(&:to_s)))
  elsif Utils.is_polars_dtype(head)
    Utils.wrap_expr(Plr.dtype_cols(name))
  else
    raise TypeError, "invalid input for `col`\n\nExpected iterable of type `str` or `DataType`, got iterable of type #{head.class.name}."
  end
end
46
+ end
47
+ end
@@ -0,0 +1,182 @@
1
+ module Polars
2
+ module Functions
3
+ # Aggregate multiple Dataframes/Series to a single DataFrame/Series.
4
+ #
5
+ # @param items [Object]
6
+ # DataFrames/Series/LazyFrames to concatenate.
7
+ # @param rechunk [Boolean]
8
+ # Make sure that all data is in contiguous memory.
9
+ # @param how ["vertical", "vertical_relaxed", "diagonal", "horizontal"]
10
+ # LazyFrames do not support the `horizontal` strategy.
11
+ #
12
+ # - Vertical: applies multiple `vstack` operations.
13
+ # - Diagonal: finds a union between the column schemas and fills missing column values with null.
14
+ # - Horizontal: stacks Series horizontally and fills with nulls if the lengths don't match.
15
+ # @param parallel [Boolean]
16
+ # Only relevant for LazyFrames. This determines if the concatenated
17
+ # lazy computations may be executed in parallel.
18
+ #
19
+ # @return [Object]
20
+ #
21
+ # @example
22
+ # df1 = Polars::DataFrame.new({"a" => [1], "b" => [3]})
23
+ # df2 = Polars::DataFrame.new({"a" => [2], "b" => [4]})
24
+ # Polars.concat([df1, df2])
25
+ # # =>
26
+ # # shape: (2, 2)
27
+ # # ┌─────┬─────┐
28
+ # # │ a ┆ b │
29
+ # # │ --- ┆ --- │
30
+ # # │ i64 ┆ i64 │
31
+ # # ╞═════╪═════╡
32
+ # # │ 1 ┆ 3 │
33
+ # # │ 2 ┆ 4 │
34
+ # # └─────┴─────┘
35
# Combine several DataFrames, LazyFrames, Series, or Exprs into one object
# of the same kind.
#
# The type of the first item selects the code path; the remaining items
# are handed on without further type checks here.
#
# @param items [Array]
#   Objects to concatenate; must not be empty.
# @param rechunk [Boolean]
#   For DataFrame/Series/Expr input, call `rechunk` on the result. For
#   LazyFrame input, the flag is forwarded to the native call instead.
# @param how [String]
#   Concat strategy. DataFrames accept "vertical", "diagonal", and
#   "horizontal"; LazyFrames accept "vertical", "vertical_relaxed", and
#   "diagonal". Ignored for Series and Expr input.
# @param parallel [Boolean]
#   Forwarded to the native call for LazyFrame input only.
#
# @raise [ArgumentError] if `items` is empty, `how` is not supported for
#   the input type, or the first item is of an unexpected type
#
# @return [Object]
def concat(items, rechunk: true, how: "vertical", parallel: true)
  raise ArgumentError, "cannot concat empty list" if items.empty?

  first = items[0]
  if first.is_a?(DataFrame)
    out =
      case how
      when "vertical"
        Utils.wrap_df(Plr.concat_df(items))
      when "diagonal"
        Utils.wrap_df(Plr.concat_df_diagonal(items))
      when "horizontal"
        Utils.wrap_df(Plr.concat_df_horizontal(items))
      else
        # Single braces: the doubled form was a Python f-string escape that
        # Ruby printed literally.
        raise ArgumentError, "how must be one of {'vertical', 'diagonal', 'horizontal'}, got #{how}"
      end
  elsif first.is_a?(LazyFrame)
    # LazyFrames return directly; `rechunk` is passed to the native call
    # rather than applied to the result afterwards.
    case how
    when "vertical"
      return Utils.wrap_ldf(Plr.concat_lf(items, rechunk, parallel, false))
    when "vertical_relaxed"
      return Utils.wrap_ldf(Plr.concat_lf(items, rechunk, parallel, true))
    when "diagonal"
      return Utils.wrap_ldf(Plr.concat_lf_diagonal(items, rechunk, parallel, false))
    else
      raise ArgumentError, "Lazy only allows 'vertical', 'vertical_relaxed', and 'diagonal' concat strategy."
    end
  elsif first.is_a?(Series)
    out = Utils.wrap_s(Plr.concat_series(items))
  elsif first.is_a?(Expr)
    # Append every following expression onto the first one, left to right.
    out = items.drop(1).reduce(first) { |acc, expr| acc.append(expr) }
  else
    raise ArgumentError, "did not expect type: #{first.class.name} in 'Polars.concat'."
  end

  rechunk ? out.rechunk : out
end
79
+
80
+ # Align a sequence of frames using the unique values from one or more columns as a key.
81
+ #
82
+ # Frames that do not contain the given key values have rows injected (with nulls
83
+ # filling the non-key columns), and each resulting frame is sorted by the key.
84
+ #
85
+ # The original column order of input frames is not changed unless ``select`` is
86
+ # specified (in which case the final column order is determined from that).
87
+ #
88
+ # Note that this does not result in a joined frame - you receive the same number
89
+ # of frames back that you passed in, but each is now aligned by key and has
90
+ # the same number of rows.
91
+ #
92
+ # @param frames [Array]
93
+ # Sequence of DataFrames or LazyFrames.
94
+ # @param on [Object]
95
+ # One or more columns whose unique values will be used to align the frames.
96
+ # @param select [Object]
97
+ # Optional post-alignment column select to constrain and/or order
98
+ # the columns returned from the newly aligned frames.
99
+ # @param reverse [Object]
100
+ # Sort the alignment column values in descending order; can be a single
101
+ # boolean or a list of booleans associated with each column in `on`.
102
+ #
103
+ # @return [Object]
104
+ #
105
+ # @example
106
+ # df1 = Polars::DataFrame.new(
107
+ # {
108
+ # "dt" => [Date.new(2022, 9, 1), Date.new(2022, 9, 2), Date.new(2022, 9, 3)],
109
+ # "x" => [3.5, 4.0, 1.0],
110
+ # "y" => [10.0, 2.5, 1.5]
111
+ # }
112
+ # )
113
+ # df2 = Polars::DataFrame.new(
114
+ # {
115
+ # "dt" => [Date.new(2022, 9, 2), Date.new(2022, 9, 3), Date.new(2022, 9, 1)],
116
+ # "x" => [8.0, 1.0, 3.5],
117
+ # "y" => [1.5, 12.0, 5.0]
118
+ # }
119
+ # )
120
+ # df3 = Polars::DataFrame.new(
121
+ # {
122
+ # "dt" => [Date.new(2022, 9, 3), Date.new(2022, 9, 2)],
123
+ # "x" => [2.0, 5.0],
124
+ # "y" => [2.5, 2.0]
125
+ # }
126
+ # )
127
+ # af1, af2, af3 = Polars.align_frames(
128
+ # df1, df2, df3, on: "dt", select: ["x", "y"]
129
+ # )
130
+ # (af1 * af2 * af3).fill_null(0).select(Polars.sum(Polars.col("*")).alias("dot"))
131
+ # # =>
132
+ # # shape: (3, 1)
133
+ # # ┌───────┐
134
+ # # │ dot │
135
+ # # │ --- │
136
+ # # │ f64 │
137
+ # # ╞═══════╡
138
+ # # │ 0.0 │
139
+ # # ├╌╌╌╌╌╌╌┤
140
+ # # │ 167.5 │
141
+ # # ├╌╌╌╌╌╌╌┤
142
+ # # │ 47.0 │
143
+ # # └───────┘
144
# Align frames on the distinct values of the `on` column(s).
#
# Builds one frame of the distinct, sorted `on` values drawn from every
# input, left-joins each input onto it, and restores each input's own
# column order. If `select` is given it is applied to every aligned frame
# afterwards. DataFrame input is collected before being returned; any
# other input is returned lazy.
#
# @raise [TypeError] if the frames are not all of the same class
#
# @return [Object]
def align_frames(
  *frames,
  on:,
  select: nil,
  reverse: false
)
  return [] if frames.empty?

  unless frames.map(&:class).uniq.length == 1
    raise TypeError, "Input frames must be of a consistent type (all LazyFrame or all DataFrame)"
  end

  eager = frames[0].is_a?(DataFrame)

  # Superset of all "on" values across the inputs, de-duplicated and sorted.
  key_frames = frames.map { |frame| frame.lazy.select(on) }
  keys = concat(key_frames).unique(maintain_order: false).sort(on, reverse: reverse)

  # Eager input: materialise the keys once. Lazy input: cache them so the
  # key computation is shared by every join below.
  keys = eager ? keys.collect.lazy : keys.cache

  aligned = frames.map do |frame|
    keys.join(frame.lazy, on: keys.columns, how: "left").select(frame.columns)
  end

  # The optional projection runs as a second pass, after every join is built.
  aligned = aligned.map { |frame| frame.select(select) } unless select.nil?

  eager ? aligned.map(&:collect) : aligned
end
181
+ end
182
+ end