polars-df 0.7.0-x86_64-linux → 0.9.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +41 -0
  3. data/Cargo.lock +353 -237
  4. data/Cargo.toml +0 -3
  5. data/LICENSE-THIRD-PARTY.txt +1978 -1459
  6. data/LICENSE.txt +1 -1
  7. data/README.md +2 -2
  8. data/lib/polars/3.1/polars.so +0 -0
  9. data/lib/polars/3.2/polars.so +0 -0
  10. data/lib/polars/{3.0 → 3.3}/polars.so +0 -0
  11. data/lib/polars/array_expr.rb +449 -0
  12. data/lib/polars/array_name_space.rb +346 -0
  13. data/lib/polars/cat_expr.rb +24 -0
  14. data/lib/polars/cat_name_space.rb +75 -0
  15. data/lib/polars/config.rb +2 -2
  16. data/lib/polars/data_frame.rb +248 -108
  17. data/lib/polars/data_types.rb +195 -29
  18. data/lib/polars/date_time_expr.rb +41 -24
  19. data/lib/polars/date_time_name_space.rb +12 -12
  20. data/lib/polars/exceptions.rb +12 -1
  21. data/lib/polars/expr.rb +1080 -195
  22. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  23. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  24. data/lib/polars/functions/as_datatype.rb +248 -0
  25. data/lib/polars/functions/col.rb +47 -0
  26. data/lib/polars/functions/eager.rb +182 -0
  27. data/lib/polars/functions/lazy.rb +1280 -0
  28. data/lib/polars/functions/len.rb +49 -0
  29. data/lib/polars/functions/lit.rb +35 -0
  30. data/lib/polars/functions/random.rb +16 -0
  31. data/lib/polars/functions/range/date_range.rb +103 -0
  32. data/lib/polars/functions/range/int_range.rb +51 -0
  33. data/lib/polars/functions/repeat.rb +144 -0
  34. data/lib/polars/functions/whenthen.rb +27 -0
  35. data/lib/polars/functions.rb +29 -416
  36. data/lib/polars/group_by.rb +3 -3
  37. data/lib/polars/io.rb +21 -28
  38. data/lib/polars/lazy_frame.rb +390 -76
  39. data/lib/polars/list_expr.rb +152 -6
  40. data/lib/polars/list_name_space.rb +102 -0
  41. data/lib/polars/meta_expr.rb +175 -7
  42. data/lib/polars/series.rb +557 -59
  43. data/lib/polars/sql_context.rb +1 -1
  44. data/lib/polars/string_cache.rb +75 -0
  45. data/lib/polars/string_expr.rb +412 -96
  46. data/lib/polars/string_name_space.rb +4 -4
  47. data/lib/polars/struct_expr.rb +1 -1
  48. data/lib/polars/struct_name_space.rb +1 -1
  49. data/lib/polars/testing.rb +507 -0
  50. data/lib/polars/utils.rb +64 -20
  51. data/lib/polars/version.rb +1 -1
  52. data/lib/polars.rb +15 -2
  53. metadata +36 -7
  54. data/lib/polars/lazy_functions.rb +0 -1197
@@ -0,0 +1,248 @@
1
+ module Polars
2
+ module Functions
3
+ # Create polars `Duration` from distinct time components.
4
+ #
5
+ # @return [Expr]
6
+ #
7
+ # @example
8
+ # df = Polars::DataFrame.new(
9
+ # {
10
+ # "datetime" => [DateTime.new(2022, 1, 1), DateTime.new(2022, 1, 2)],
11
+ # "add" => [1, 2]
12
+ # }
13
+ # )
14
+ # df.select(
15
+ # [
16
+ # (Polars.col("datetime") + Polars.duration(weeks: "add")).alias("add_weeks"),
17
+ # (Polars.col("datetime") + Polars.duration(days: "add")).alias("add_days"),
18
+ # (Polars.col("datetime") + Polars.duration(seconds: "add")).alias("add_seconds"),
19
+ # (Polars.col("datetime") + Polars.duration(milliseconds: "add")).alias(
20
+ # "add_milliseconds"
21
+ # ),
22
+ # (Polars.col("datetime") + Polars.duration(hours: "add")).alias("add_hours")
23
+ # ]
24
+ # )
25
+ # # =>
26
+ # # shape: (2, 5)
27
+ # # ┌─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────────┬─────────────────────┐
28
+ # # │ add_weeks ┆ add_days ┆ add_seconds ┆ add_milliseconds ┆ add_hours │
29
+ # # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
30
+ # # │ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] ┆ datetime[ns] │
31
+ # # ╞═════════════════════╪═════════════════════╪═════════════════════╪═════════════════════════╪═════════════════════╡
32
+ # # │ 2022-01-08 00:00:00 ┆ 2022-01-02 00:00:00 ┆ 2022-01-01 00:00:01 ┆ 2022-01-01 00:00:00.001 ┆ 2022-01-01 01:00:00 │
33
+ # # │ 2022-01-16 00:00:00 ┆ 2022-01-04 00:00:00 ┆ 2022-01-02 00:00:02 ┆ 2022-01-02 00:00:00.002 ┆ 2022-01-02 02:00:00 │
34
+ # # └─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────────┴─────────────────────┘
35
def duration(
  weeks: nil,
  days: nil,
  hours: nil,
  minutes: nil,
  seconds: nil,
  milliseconds: nil,
  microseconds: nil,
  nanoseconds: nil,
  time_unit: "us"
)
  # Components listed in the positional order that Plr.duration receives them.
  components = [weeks, days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds]

  # Omitted (nil) components are forwarded as nil; every other value is
  # converted with str_to_lit: false and unwrapped to its native expression.
  rbexprs = components.map do |component|
    next nil if component.nil?

    Utils.expr_to_lit_or_expr(component, str_to_lit: false)._rbexpr
  end

  Utils.wrap_expr(Plr.duration(*rbexprs, time_unit))
end
85
+
86
+ # Concat the arrays in a Series dtype List in linear time.
87
+ #
88
+ # @return [Expr]
89
def concat_list(exprs)
  # Normalise the selection into native expressions, concatenate them into a
  # list expression, and wrap the result back into an Expr.
  rbexprs = Utils.selection_to_rbexpr_list(exprs)
  list_expr = Plr.concat_list(rbexprs)
  Utils.wrap_expr(list_expr)
end
93
+
94
+ # Collect several columns into a Series of dtype Struct.
95
+ #
96
+ # @param exprs [Object]
97
+ # Columns/Expressions to collect into a Struct
98
+ # @param eager [Boolean]
99
+ # Evaluate immediately
100
+ #
101
+ # @return [Object]
102
+ #
103
+ # @example
104
+ # Polars::DataFrame.new(
105
+ # {
106
+ # "int" => [1, 2],
107
+ # "str" => ["a", "b"],
108
+ # "bool" => [true, nil],
109
+ # "list" => [[1, 2], [3]],
110
+ # }
111
+ # ).select([Polars.struct(Polars.all).alias("my_struct")])
112
+ # # =>
113
+ # # shape: (2, 1)
114
+ # # ┌─────────────────────┐
115
+ # # │ my_struct │
116
+ # # │ --- │
117
+ # # │ struct[4] │
118
+ # # ╞═════════════════════╡
119
+ # # │ {1,"a",true,[1, 2]} │
120
+ # # │ {2,"b",null,[3]} │
121
+ # # └─────────────────────┘
122
+ #
123
+ # @example Only collect specific columns as a struct:
124
+ # df = Polars::DataFrame.new(
125
+ # {"a" => [1, 2, 3, 4], "b" => ["one", "two", "three", "four"], "c" => [9, 8, 7, 6]}
126
+ # )
127
+ # df.with_column(Polars.struct(Polars.col(["a", "b"])).alias("a_and_b"))
128
+ # # =>
129
+ # # shape: (4, 4)
130
+ # # ┌─────┬───────┬─────┬─────────────┐
131
+ # # │ a ┆ b ┆ c ┆ a_and_b │
132
+ # # │ --- ┆ --- ┆ --- ┆ --- │
133
+ # # │ i64 ┆ str ┆ i64 ┆ struct[2] │
134
+ # # ╞═════╪═══════╪═════╪═════════════╡
135
+ # # │ 1 ┆ one ┆ 9 ┆ {1,"one"} │
136
+ # # │ 2 ┆ two ┆ 8 ┆ {2,"two"} │
137
+ # # │ 3 ┆ three ┆ 7 ┆ {3,"three"} │
138
+ # # │ 4 ┆ four ┆ 6 ┆ {4,"four"} │
139
+ # # └─────┴───────┴─────┴─────────────┘
140
def struct(exprs, eager: false)
  # Collects the given columns/expressions into a single struct.
  #
  # exprs - columns/expressions to collect.
  # eager - when false (default) the lazy struct Expr is returned; when true
  #         the expression is evaluated immediately and a Series is returned.
  rbexprs = Utils.selection_to_rbexpr_list(exprs)
  struct_expr = Utils.wrap_expr(Plr.as_struct(rbexprs))
  return struct_expr unless eager

  # The eager result must be the method's return value. Previously this
  # branch had no `return`, so its Series was discarded and the method fell
  # through to return an Expr even when eager: true was requested.
  Polars.select(struct_expr).to_series
end
147
+
148
+ # Horizontally concat Utf8 Series in linear time. Non-Utf8 columns are cast to Utf8.
149
+ #
150
+ # @param exprs [Object]
151
+ # Columns to concat into a Utf8 Series.
152
+ # @param sep [String]
153
+ # String value that will be used to separate the values.
154
+ # @param ignore_nulls [Boolean]
155
+ # Ignore null values. Defaults to `false`, in which case a null input makes the result null.
156
+ #
157
+ # @return [Expr]
158
+ #
159
+ # @example
160
+ # df = Polars::DataFrame.new(
161
+ # {
162
+ # "a" => [1, 2, 3],
163
+ # "b" => ["dogs", "cats", nil],
164
+ # "c" => ["play", "swim", "walk"]
165
+ # }
166
+ # )
167
+ # df.with_columns(
168
+ # [
169
+ # Polars.concat_str(
170
+ # [
171
+ # Polars.col("a") * 2,
172
+ # Polars.col("b"),
173
+ # Polars.col("c")
174
+ # ],
175
+ # sep: " "
176
+ # ).alias("full_sentence")
177
+ # ]
178
+ # )
179
+ # # =>
180
+ # # shape: (3, 4)
181
+ # # ┌─────┬──────┬──────┬───────────────┐
182
+ # # │ a ┆ b ┆ c ┆ full_sentence │
183
+ # # │ --- ┆ --- ┆ --- ┆ --- │
184
+ # # │ i64 ┆ str ┆ str ┆ str │
185
+ # # ╞═════╪══════╪══════╪═══════════════╡
186
+ # # │ 1 ┆ dogs ┆ play ┆ 2 dogs play │
187
+ # # │ 2 ┆ cats ┆ swim ┆ 4 cats swim │
188
+ # # │ 3 ┆ null ┆ walk ┆ null │
189
+ # # └─────┴──────┴──────┴───────────────┘
190
def concat_str(exprs, sep: "", ignore_nulls: false)
  # Normalise the selection into native expressions, then pass the separator
  # and null-handling flag through to the native concat_str unchanged.
  rbexprs = Utils.selection_to_rbexpr_list(exprs)
  joined = Plr.concat_str(rbexprs, sep, ignore_nulls)
  Utils.wrap_expr(joined)
end
194
+
195
+ # Format expressions as a string.
196
+ #
197
+ # @param fstring [String]
198
+ # A string with placeholders.
199
+ # For example: "hello_{}" or "{}_world"
200
+ # @param args [Object]
201
+ # Expression(s) that fill the placeholders
202
+ #
203
+ # @return [Expr]
204
+ #
205
+ # @example
206
+ # df = Polars::DataFrame.new(
207
+ # {
208
+ # "a": ["a", "b", "c"],
209
+ # "b": [1, 2, 3]
210
+ # }
211
+ # )
212
+ # df.select(
213
+ # [
214
+ # Polars.format("foo_{}_bar_{}", Polars.col("a"), "b").alias("fmt")
215
+ # ]
216
+ # )
217
+ # # =>
218
+ # # shape: (3, 1)
219
+ # # ┌─────────────┐
220
+ # # │ fmt │
221
+ # # │ --- │
222
+ # # │ str │
223
+ # # ╞═════════════╡
224
+ # # │ foo_a_bar_1 │
225
+ # # │ foo_b_bar_2 │
226
+ # # │ foo_c_bar_3 │
227
+ # # └─────────────┘
228
def format(fstring, *args)
  # Every "{}" placeholder needs exactly one argument to fill it.
  unless fstring.scan("{}").length == args.length
    raise ArgumentError, "number of placeholders should equal the number of arguments"
  end

  # Arguments are consumed left to right, one per placeholder.
  pending = args.dup

  # Splitting on a captured "{}" keeps the placeholders as their own pieces;
  # empty pieces (e.g. between adjacent placeholders) contribute nothing.
  pieces = fstring.split(/(\{\})/).reject { |piece| piece.length.zero? }
  exprs = pieces.map do |piece|
    if piece == "{}"
      Utils.expr_to_lit_or_expr(pending.shift, str_to_lit: false)
    else
      lit(piece)
    end
  end

  concat_str(exprs, sep: "")
end
247
+ end
248
+ end
@@ -0,0 +1,47 @@
1
+ module Polars
2
+ module Functions
3
+ # Return an expression representing a column in a DataFrame.
4
+ #
5
+ # @return [Expr]
6
def col(name, *more_names)
  # Several positional arguments: the first one decides whether the whole
  # group is treated as column names or as data types.
  if more_names.any?
    combined = [name, *more_names]
    return Utils.wrap_expr(Plr.cols(combined.map(&:to_s))) if Utils.strlike?(name)
    return Utils.wrap_expr(Plr.dtype_cols(combined)) if Utils.is_polars_dtype(name)

    raise TypeError, "invalid input for `col`\n\nExpected `str` or `DataType`, got #{name.class.name}."
  end

  # Single argument: a name, a data type, or an array of either.
  return Utils.wrap_expr(Plr.col(name.to_s)) if Utils.strlike?(name)
  return Utils.wrap_expr(Plr.dtype_cols([name])) if Utils.is_polars_dtype(name)

  unless name.is_a?(::Array)
    raise TypeError, "invalid input for `col`\n\nExpected `str` or `DataType`, got #{name.class.name}."
  end

  names = Array(name)
  return Utils.wrap_expr(Plr.cols(names)) if names.empty?

  # Only the first element is inspected to classify the array.
  head = names[0]
  if Utils.strlike?(head)
    Utils.wrap_expr(Plr.cols(names.map(&:to_s)))
  elsif Utils.is_polars_dtype(head)
    Utils.wrap_expr(Plr.dtype_cols(names))
  else
    raise TypeError, "invalid input for `col`\n\nExpected iterable of type `str` or `DataType`, got iterable of type #{head.class.name}."
  end
end
46
+ end
47
+ end
@@ -0,0 +1,182 @@
1
+ module Polars
2
+ module Functions
3
+ # Aggregate multiple Dataframes/Series to a single DataFrame/Series.
4
+ #
5
+ # @param items [Object]
6
+ # DataFrames/Series/LazyFrames to concatenate.
7
+ # @param rechunk [Boolean]
8
+ # Make sure that all data is in contiguous memory.
9
+ # @param how ["vertical", "vertical_relaxed", "diagonal", "horizontal"]
10
+ # LazyFrames do not support the `horizontal` strategy.
11
+ #
12
+ # - Vertical: applies multiple `vstack` operations.
13
+ # - Diagonal: finds a union between the column schemas and fills missing column values with null.
14
+ # - Horizontal: stacks Series horizontally and fills with nulls if the lengths don't match.
15
+ # @param parallel [Boolean]
16
+ # Only relevant for LazyFrames. This determines if the concatenated
17
+ # lazy computations may be executed in parallel.
18
+ #
19
+ # @return [Object]
20
+ #
21
+ # @example
22
+ # df1 = Polars::DataFrame.new({"a" => [1], "b" => [3]})
23
+ # df2 = Polars::DataFrame.new({"a" => [2], "b" => [4]})
24
+ # Polars.concat([df1, df2])
25
+ # # =>
26
+ # # shape: (2, 2)
27
+ # # ┌─────┬─────┐
28
+ # # │ a ┆ b │
29
+ # # │ --- ┆ --- │
30
+ # # │ i64 ┆ i64 │
31
+ # # ╞═════╪═════╡
32
+ # # │ 1 ┆ 3 │
33
+ # # │ 2 ┆ 4 │
34
+ # # └─────┴─────┘
35
def concat(items, rechunk: true, how: "vertical", parallel: true)
  # Concatenates DataFrames, LazyFrames, Series or Exprs into a single object.
  #
  # items    - non-empty collection; the type of the first element selects
  #            the concatenation path for the whole collection.
  # rechunk  - for eager/Expr results, call `rechunk` on the output; for
  #            LazyFrames it is forwarded to the native concat call instead.
  # how      - concatenation strategy (only consulted for frames).
  # parallel - forwarded to the native call for LazyFrames only.
  #
  # Raises ArgumentError for an empty collection, an unsupported `how`, or an
  # unsupported element type.
  raise ArgumentError, "cannot concat empty list" if items.empty?

  first = items[0]
  case first
  when DataFrame
    out =
      case how
      when "vertical"
        Utils.wrap_df(Plr.concat_df(items))
      when "diagonal"
        Utils.wrap_df(Plr.concat_df_diagonal(items))
      when "horizontal"
        Utils.wrap_df(Plr.concat_df_horizontal(items))
      else
        # The message previously contained doubled braces ("{{...}}"), a
        # leftover from a Python f-string escape; Ruby prints them literally.
        raise ArgumentError, "how must be one of {'vertical', 'diagonal', 'horizontal'}, got #{how}"
      end
  when LazyFrame
    # Lazy results are returned directly: rechunking is handled by the
    # native call, so the trailing `rechunk` step below is skipped.
    case how
    when "vertical"
      return Utils.wrap_ldf(Plr.concat_lf(items, rechunk, parallel, false))
    when "vertical_relaxed"
      return Utils.wrap_ldf(Plr.concat_lf(items, rechunk, parallel, true))
    when "diagonal"
      return Utils.wrap_ldf(Plr.concat_lf_diagonal(items, rechunk, parallel, false))
    else
      raise ArgumentError, "Lazy only allows 'vertical', 'vertical_relaxed', and 'diagonal' concat strategy."
    end
  when Series
    # TODO
    out = Utils.wrap_s(Plr.concat_series(items))
  when Expr
    # Append every remaining expression onto the first, left to right.
    out = items[1..-1].reduce(first) { |acc, expr| acc.append(expr) }
  else
    raise ArgumentError, "did not expect type: #{first.class.name} in 'Polars.concat'."
  end

  rechunk ? out.rechunk : out
end
79
+
80
+ # Align a sequence of frames using the unique values from one or more columns as a key.
81
+ #
82
+ # Frames that do not contain the given key values have rows injected (with nulls
83
+ # filling the non-key columns), and each resulting frame is sorted by the key.
84
+ #
85
+ # The original column order of input frames is not changed unless ``select`` is
86
+ # specified (in which case the final column order is determined from that).
87
+ #
88
+ # Note that this does not result in a joined frame - you receive the same number
89
+ # of frames back that you passed in, but each is now aligned by key and has
90
+ # the same number of rows.
91
+ #
92
+ # @param frames [Array]
93
+ # Sequence of DataFrames or LazyFrames.
94
+ # @param on [Object]
95
+ # One or more columns whose unique values will be used to align the frames.
96
+ # @param select [Object]
97
+ # Optional post-alignment column select to constrain and/or order
98
+ # the columns returned from the newly aligned frames.
99
+ # @param reverse [Object]
100
+ # Sort the alignment column values in descending order; can be a single
101
+ # boolean or a list of booleans associated with each column in `on`.
102
+ #
103
+ # @return [Object]
104
+ #
105
+ # @example
106
+ # df1 = Polars::DataFrame.new(
107
+ # {
108
+ # "dt" => [Date.new(2022, 9, 1), Date.new(2022, 9, 2), Date.new(2022, 9, 3)],
109
+ # "x" => [3.5, 4.0, 1.0],
110
+ # "y" => [10.0, 2.5, 1.5]
111
+ # }
112
+ # )
113
+ # df2 = Polars::DataFrame.new(
114
+ # {
115
+ # "dt" => [Date.new(2022, 9, 2), Date.new(2022, 9, 3), Date.new(2022, 9, 1)],
116
+ # "x" => [8.0, 1.0, 3.5],
117
+ # "y" => [1.5, 12.0, 5.0]
118
+ # }
119
+ # )
120
+ # df3 = Polars::DataFrame.new(
121
+ # {
122
+ # "dt" => [Date.new(2022, 9, 3), Date.new(2022, 9, 2)],
123
+ # "x" => [2.0, 5.0],
124
+ # "y" => [2.5, 2.0]
125
+ # }
126
+ # )
127
+ # af1, af2, af3 = Polars.align_frames(
128
+ # df1, df2, df3, on: "dt", select: ["x", "y"]
129
+ # )
130
+ # (af1 * af2 * af3).fill_null(0).select(Polars.sum(Polars.col("*")).alias("dot"))
131
+ # # =>
132
+ # # shape: (3, 1)
133
+ # # ┌───────┐
134
+ # # │ dot │
135
+ # # │ --- │
136
+ # # │ f64 │
137
+ # # ╞═══════╡
138
+ # # │ 0.0 │
139
+ # # ├╌╌╌╌╌╌╌┤
140
+ # # │ 167.5 │
141
+ # # ├╌╌╌╌╌╌╌┤
142
+ # # │ 47.0 │
143
+ # # └───────┘
144
def align_frames(
  *frames,
  on:,
  select: nil,
  reverse: false
)
  return [] if frames.empty?

  unless frames.map(&:class).uniq.length == 1
    raise TypeError, "Input frames must be of a consistent type (all LazyFrame or all DataFrame)"
  end

  # Eager inputs are processed lazily and collected again at the very end.
  eager = frames[0].is_a?(DataFrame)

  # Build the superset of all "on" values across the frames, de-duplicated
  # and sorted; this is the frame every input gets aligned against.
  key_columns = frames.map { |frame| frame.lazy.select(on) }
  keys = concat(key_columns).unique(maintain_order: false).sort(on, reverse: reverse)
  keys = eager ? keys.collect.lazy : keys.cache

  # Left-join each input onto the key frame, then restore the input's own
  # column order.
  aligned = frames.map do |frame|
    keys.join(frame.lazy, on: keys.columns, how: "left").select(frame.columns)
  end

  # Optional post-alignment column selection.
  aligned = aligned.map { |frame| frame.select(select) } unless select.nil?

  eager ? aligned.map(&:collect) : aligned
end
181
+ end
182
+ end