polars-df 0.10.0-x86_64-linux → 0.12.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58):
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +27 -0
  3. data/Cargo.lock +392 -351
  4. data/LICENSE-THIRD-PARTY.txt +1125 -865
  5. data/README.md +6 -6
  6. data/lib/polars/3.1/polars.so +0 -0
  7. data/lib/polars/3.2/polars.so +0 -0
  8. data/lib/polars/3.3/polars.so +0 -0
  9. data/lib/polars/array_expr.rb +4 -4
  10. data/lib/polars/batched_csv_reader.rb +11 -5
  11. data/lib/polars/cat_expr.rb +0 -36
  12. data/lib/polars/cat_name_space.rb +0 -37
  13. data/lib/polars/convert.rb +6 -1
  14. data/lib/polars/data_frame.rb +176 -403
  15. data/lib/polars/data_types.rb +1 -1
  16. data/lib/polars/date_time_expr.rb +525 -572
  17. data/lib/polars/date_time_name_space.rb +263 -460
  18. data/lib/polars/dynamic_group_by.rb +5 -5
  19. data/lib/polars/exceptions.rb +7 -0
  20. data/lib/polars/expr.rb +1394 -243
  21. data/lib/polars/expr_dispatch.rb +1 -1
  22. data/lib/polars/functions/aggregation/horizontal.rb +8 -8
  23. data/lib/polars/functions/as_datatype.rb +63 -40
  24. data/lib/polars/functions/lazy.rb +63 -14
  25. data/lib/polars/functions/lit.rb +1 -1
  26. data/lib/polars/functions/range/date_range.rb +90 -57
  27. data/lib/polars/functions/range/datetime_range.rb +149 -0
  28. data/lib/polars/functions/range/int_range.rb +2 -2
  29. data/lib/polars/functions/range/time_range.rb +141 -0
  30. data/lib/polars/functions/repeat.rb +1 -1
  31. data/lib/polars/functions/whenthen.rb +1 -1
  32. data/lib/polars/group_by.rb +88 -23
  33. data/lib/polars/io/avro.rb +24 -0
  34. data/lib/polars/{io.rb → io/csv.rb} +299 -493
  35. data/lib/polars/io/database.rb +73 -0
  36. data/lib/polars/io/ipc.rb +247 -0
  37. data/lib/polars/io/json.rb +29 -0
  38. data/lib/polars/io/ndjson.rb +80 -0
  39. data/lib/polars/io/parquet.rb +227 -0
  40. data/lib/polars/lazy_frame.rb +143 -272
  41. data/lib/polars/lazy_group_by.rb +100 -3
  42. data/lib/polars/list_expr.rb +11 -11
  43. data/lib/polars/list_name_space.rb +5 -1
  44. data/lib/polars/rolling_group_by.rb +7 -9
  45. data/lib/polars/series.rb +103 -187
  46. data/lib/polars/string_expr.rb +78 -102
  47. data/lib/polars/string_name_space.rb +5 -4
  48. data/lib/polars/testing.rb +2 -2
  49. data/lib/polars/utils/constants.rb +9 -0
  50. data/lib/polars/utils/convert.rb +97 -0
  51. data/lib/polars/utils/parse.rb +89 -0
  52. data/lib/polars/utils/various.rb +76 -0
  53. data/lib/polars/utils/wrap.rb +19 -0
  54. data/lib/polars/utils.rb +8 -300
  55. data/lib/polars/version.rb +1 -1
  56. data/lib/polars/whenthen.rb +6 -6
  57. data/lib/polars.rb +20 -1
  58. metadata +17 -4
@@ -0,0 +1,149 @@
1
+ module Polars
2
+ module Functions
3
+ # Generate a datetime range.
4
+ #
5
+ # @param start [Object]
6
+ # Lower bound of the datetime range.
7
+ # @param stop [Object]
8
+ # Upper bound of the datetime range.
9
+ # @param interval [String]
10
+ # Interval of the range periods, specified using the Polars duration string language.
11
+ # @param closed ['both', 'left', 'right', 'none']
12
+ # Define which sides of the range are closed (inclusive).
13
+ # @param time_unit [nil, 'ns', 'us', 'ms']
14
+ # Time unit of the resulting `Datetime` data type.
15
+ # @param time_zone [String]
16
+ # Time zone of the resulting `Datetime` data type.
17
+ # @param eager [Boolean]
18
+ # Evaluate immediately and return a `Series`.
19
+ # If set to `false` (default), return an expression instead.
20
+ #
21
+ # @return [Object]
22
+ #
23
+ # @example Using Polars duration string to specify the interval:
24
+ # Polars.datetime_range(
25
+ # DateTime.new(2022, 1, 1), DateTime.new(2022, 3, 1), "1mo", eager: true
26
+ # ).alias("datetime")
27
+ # # =>
28
+ # # shape: (3,)
29
+ # # Series: 'datetime' [datetime[ns]]
30
+ # # [
31
+ # # 2022-01-01 00:00:00
32
+ # # 2022-02-01 00:00:00
33
+ # # 2022-03-01 00:00:00
34
+ # # ]
35
+ #
36
+ # @example Specifying a time zone:
37
+ # Polars.datetime_range(
38
+ # DateTime.new(2022, 1, 1),
39
+ # DateTime.new(2022, 3, 1),
40
+ # "1mo",
41
+ # time_zone: "America/New_York",
42
+ # eager: true
43
+ # ).alias("datetime")
44
+ # # =>
45
+ # # shape: (3,)
46
+ # # Series: 'datetime' [datetime[ns, America/New_York]]
47
+ # # [
48
+ # # 2022-01-01 00:00:00 EST
49
+ # # 2022-02-01 00:00:00 EST
50
+ # # 2022-03-01 00:00:00 EST
51
+ # # ]
52
+ def datetime_range(
53
+ start,
54
+ stop,
55
+ interval = "1d",
56
+ closed: "both",
57
+ time_unit: nil,
58
+ time_zone: nil,
59
+ eager: false
60
+ )
61
+ interval = Utils.parse_interval_argument(interval)
62
+ if time_unit.nil? && interval.include?("ns")
63
+ time_unit = "ns"
64
+ end
65
+
66
+ start_rbexpr = Utils.parse_into_expression(start)
67
+ end_rbexpr = Utils.parse_into_expression(stop)
68
+ result = Utils.wrap_expr(
69
+ Plr.datetime_range(
70
+ start_rbexpr, end_rbexpr, interval, closed, time_unit, time_zone
71
+ )
72
+ )
73
+
74
+ if eager
75
+ return Polars.select(result).to_series
76
+ end
77
+
78
+ result
79
+ end
80
+
81
+ # Create a column of datetime ranges.
82
+ #
83
+ # @param start [Object]
84
+ # Lower bound of the datetime range.
85
+ # @param stop [Object]
86
+ # Upper bound of the datetime range.
87
+ # @param interval [String]
88
+ # Interval of the range periods, specified using the Polars duration string language.
89
+ # @param closed ['both', 'left', 'right', 'none']
90
+ # Define which sides of the range are closed (inclusive).
91
+ # @param time_unit [nil, 'ns', 'us', 'ms']
92
+ # Time unit of the resulting `Datetime` data type.
93
+ # @param time_zone [String]
94
+ # Time zone of the resulting `Datetime` data type.
95
+ # @param eager [Boolean]
96
+ # Evaluate immediately and return a `Series`.
97
+ # If set to `false` (default), return an expression instead.
98
+ #
99
+ # @return [Object]
100
+ #
101
+ # @example
102
+ # df = Polars::DataFrame.new(
103
+ # {
104
+ # "start" => [DateTime.new(2022, 1, 1), DateTime.new(2022, 1, 2)],
105
+ # "end" => DateTime.new(2022, 1, 3),
106
+ # }
107
+ # )
108
+ # df.select(datetime_range: Polars.datetime_ranges("start", "end"))
109
+ # # =>
110
+ # # shape: (2, 1)
111
+ # # ┌─────────────────────────────────┐
112
+ # # │ datetime_range │
113
+ # # │ --- │
114
+ # # │ list[datetime[ns]] │
115
+ # # ╞═════════════════════════════════╡
116
+ # # │ [2022-01-01 00:00:00, 2022-01-… │
117
+ # # │ [2022-01-02 00:00:00, 2022-01-… │
118
+ # # └─────────────────────────────────┘
119
+ def datetime_ranges(
120
+ start,
121
+ stop,
122
+ interval: "1d",
123
+ closed: "both",
124
+ time_unit: nil,
125
+ time_zone: nil,
126
+ eager: false
127
+ )
128
+ interval = Utils.parse_interval_argument(interval)
129
+ if time_unit.nil? && interval.include?("ns")
130
+ time_unit = "ns"
131
+ end
132
+
133
+ start_rbexpr = Utils.parse_into_expression(start)
134
+ end_rbexpr = Utils.parse_into_expression(stop)
135
+
136
+ result = Utils.wrap_expr(
137
+ Plr.datetime_ranges(
138
+ start_rbexpr, end_rbexpr, interval, closed, time_unit, time_zone
139
+ )
140
+ )
141
+
142
+ if eager
143
+ return Polars.select(result).to_series
144
+ end
145
+
146
+ result
147
+ end
148
+ end
149
+ end
@@ -34,8 +34,8 @@ module Polars
34
34
  start = 0
35
35
  end
36
36
 
37
- start = Utils.parse_as_expression(start)
38
- stop = Utils.parse_as_expression(stop)
37
+ start = Utils.parse_into_expression(start)
38
+ stop = Utils.parse_into_expression(stop)
39
39
  dtype ||= Int64
40
40
  dtype = dtype.to_s if dtype.is_a?(Symbol)
41
41
  result = Utils.wrap_expr(Plr.int_range(start, stop, step, dtype)).alias("arange")
@@ -0,0 +1,141 @@
1
+ module Polars
2
+ module Functions
3
+ # Generate a time range.
4
+ #
5
+ # @param start [Object]
6
+ # Lower bound of the time range.
7
+ # @param stop [Object]
8
+ # Upper bound of the time range.
9
+ # @param interval [String]
10
+ # Interval of the range periods, specified using the Polars duration string language.
11
+ # @param closed ['both', 'left', 'right', 'none']
12
+ # Define which sides of the range are closed (inclusive).
13
+ # @param eager [Boolean]
14
+ # Evaluate immediately and return a `Series`.
15
+ # If set to `false` (default), return an expression instead.
16
+ #
17
+ # @return [Object]
18
+ #
19
+ # @example
20
+ # Polars.time_range(
21
+ # time(14, 0),
22
+ # nil,
23
+ # "3h15m",
24
+ # eager: true
25
+ # ).alias("time")
26
+ # # =>
27
+ # # shape: (4,)
28
+ # # Series: 'time' [time]
29
+ # # [
30
+ # # 14:00:00
31
+ # # 17:15:00
32
+ # # 20:30:00
33
+ # # 23:45:00
34
+ # # ]
35
+ def time_range(
36
+ start = nil,
37
+ stop = nil,
38
+ interval = "1h",
39
+ closed: "both",
40
+ eager: false
41
+ )
42
+ interval = Utils.parse_interval_argument(interval)
43
+ ["y", "mo", "w", "d"].each do |unit|
44
+ if interval.include?(unit)
45
+ msg = "invalid interval unit for time_range: found #{unit.inspect}"
46
+ raise ArgumentError, msg
47
+ end
48
+ end
49
+
50
+ if start.nil?
51
+ # start = time(0, 0, 0)
52
+ raise Todo
53
+ end
54
+ if stop.nil?
55
+ # stop = time(23, 59, 59, 999999)
56
+ raise Todo
57
+ end
58
+
59
+ start_rbexpr = Utils.parse_into_expression(start)
60
+ end_rbexpr = Utils.parse_into_expression(stop)
61
+
62
+ result = Utils.wrap_expr(Plr.time_range(start_rbexpr, end_rbexpr, interval, closed))
63
+
64
+ if eager
65
+ return Polars.select(result).to_series
66
+ end
67
+
68
+ result
69
+ end
70
+
71
+ # Create a column of time ranges.
72
+ #
73
+ # @param start [Object]
74
+ # Lower bound of the time range.
75
+ # @param stop [Object]
76
+ # Upper bound of the time range.
77
+ # @param interval [String]
78
+ # Interval of the range periods, specified using the Polars duration string language.
79
+ # @param closed ['both', 'left', 'right', 'none']
80
+ # Define which sides of the range are closed (inclusive).
81
+ # @param eager [Boolean]
82
+ # Evaluate immediately and return a `Series`.
83
+ # If set to `false` (default), return an expression instead.
84
+ #
85
+ # @return [Object]
86
+ #
87
+ # @example
88
+ # df = Polars::DataFrame.new(
89
+ # {
90
+ # "start" => [time(9, 0), time(10, 0)],
91
+ # "end" => time(11, 0)
92
+ # }
93
+ # )
94
+ # df.with_columns(time_range: Polars.time_ranges("start", "end"))
95
+ # # =>
96
+ # # shape: (2, 3)
97
+ # # ┌──────────┬──────────┬────────────────────────────────┐
98
+ # # │ start ┆ end ┆ time_range │
99
+ # # │ --- ┆ --- ┆ --- │
100
+ # # │ time ┆ time ┆ list[time] │
101
+ # # ╞══════════╪══════════╪════════════════════════════════╡
102
+ # # │ 09:00:00 ┆ 11:00:00 ┆ [09:00:00, 10:00:00, 11:00:00] │
103
+ # # │ 10:00:00 ┆ 11:00:00 ┆ [10:00:00, 11:00:00] │
104
+ # # └──────────┴──────────┴────────────────────────────────┘
105
+ def time_ranges(
106
+ start = nil,
107
+ stop = nil,
108
+ interval = "1h",
109
+ closed: "both",
110
+ eager: false
111
+ )
112
+ interval = Utils.parse_interval_argument(interval)
113
+ ["y", "mo", "w", "d"].each do |unit|
114
+ if interval.include?(unit)
115
+ msg = "invalid interval unit for time_range: found #{unit.inspect}"
116
+ raise ArgumentError, msg
117
+ end
118
+ end
119
+
120
+ if start.nil?
121
+ # start = time(0, 0, 0)
122
+ raise Todo
123
+ end
124
+ if stop.nil?
125
+ # stop = time(23, 59, 59, 999999)
126
+ raise Todo
127
+ end
128
+
129
+ start_rbexpr = Utils.parse_into_expression(start)
130
+ end_rbexpr = Utils.parse_into_expression(stop)
131
+
132
+ result = Utils.wrap_expr(Plr.time_ranges(start_rbexpr, end_rbexpr, interval, closed))
133
+
134
+ if eager
135
+ return Polars.select(result).to_series
136
+ end
137
+
138
+ result
139
+ end
140
+ end
141
+ end
@@ -43,7 +43,7 @@ module Polars
43
43
  n = lit(n)
44
44
  end
45
45
 
46
- value = Utils.parse_as_expression(value, str_as_lit: true)
46
+ value = Utils.parse_into_expression(value, str_as_lit: true)
47
47
  expr = Utils.wrap_expr(Plr.repeat(value, n._rbexpr, dtype))
48
48
  if !name.nil?
49
49
  expr = expr.alias(name)
@@ -89,7 +89,7 @@ module Polars
89
89
  # # │ 4 ┆ 0 ┆ 99 │
90
90
  # # └─────┴─────┴─────┘
91
91
  def when(*predicates, **constraints)
92
- condition = Utils.parse_when_inputs(*predicates, **constraints)
92
+ condition = Utils.parse_predicates_constraints_into_expression(*predicates, **constraints)
93
93
  When.new(Plr.when(condition))
94
94
  end
95
95
  end
@@ -106,39 +106,104 @@ module Polars
106
106
  # _dataframe_class._from_rbdf(_df.group_by_apply(by, f))
107
107
  # end
108
108
 
109
- # Use multiple aggregations on columns.
109
+ # Compute aggregations for each group of a group by operation.
110
110
  #
111
- # This can be combined with complete lazy API and is considered idiomatic polars.
112
- #
113
- # @param aggs [Object]
114
- # Single / multiple aggregation expression(s).
111
+ # @param aggs [Array]
112
+ # Aggregations to compute for each group of the group by operation,
113
+ # specified as positional arguments.
114
+ # Accepts expression input. Strings are parsed as column names.
115
+ # @param named_aggs [Hash]
116
+ # Additional aggregations, specified as keyword arguments.
117
+ # The resulting columns will be renamed to the keyword used.
115
118
  #
116
119
  # @return [DataFrame]
117
120
  #
118
- # @example
121
+ # @example Compute the aggregation of the columns for each group.
119
122
  # df = Polars::DataFrame.new(
120
- # {"foo" => ["one", "two", "two", "one", "two"], "bar" => [5, 3, 2, 4, 1]}
123
+ # {
124
+ # "a" => ["a", "b", "a", "b", "c"],
125
+ # "b" => [1, 2, 1, 3, 3],
126
+ # "c" => [5, 4, 3, 2, 1]
127
+ # }
121
128
  # )
122
- # df.group_by("foo", maintain_order: true).agg(
123
- # [
124
- # Polars.sum("bar").suffix("_sum"),
125
- # Polars.col("bar").sort.tail(2).sum.suffix("_tail_sum")
126
- # ]
129
+ # df.group_by("a").agg(Polars.col("b"), Polars.col("c"))
130
+ # # =>
131
+ # # shape: (3, 3)
132
+ # # ┌─────┬───────────┬───────────┐
133
+ # # │ a ┆ b ┆ c │
134
+ # # │ --- ┆ --- ┆ --- │
135
+ # # │ str ┆ list[i64] ┆ list[i64] │
136
+ # # ╞═════╪═══════════╪═══════════╡
137
+ # # │ a ┆ [1, 1] ┆ [5, 3] │
138
+ # # │ b ┆ [2, 3] ┆ [4, 2] │
139
+ # # │ c ┆ [3] ┆ [1] │
140
+ # # └─────┴───────────┴───────────┘
141
+ #
142
+ # @example Compute the sum of a column for each group.
143
+ # df.group_by("a").agg(Polars.col("b").sum)
144
+ # # =>
145
+ # # shape: (3, 2)
146
+ # # ┌─────┬─────┐
147
+ # # │ a ┆ b │
148
+ # # │ --- ┆ --- │
149
+ # # │ str ┆ i64 │
150
+ # # ╞═════╪═════╡
151
+ # # │ a ┆ 2 │
152
+ # # │ b ┆ 5 │
153
+ # # │ c ┆ 3 │
154
+ # # └─────┴─────┘
155
+ #
156
+ # @example Compute multiple aggregates at once by passing a list of expressions.
157
+ # df.group_by("a").agg([Polars.sum("b"), Polars.mean("c")])
158
+ # # =>
159
+ # # shape: (3, 3)
160
+ # # ┌─────┬─────┬─────┐
161
+ # # │ a ┆ b ┆ c │
162
+ # # │ --- ┆ --- ┆ --- │
163
+ # # │ str ┆ i64 ┆ f64 │
164
+ # # ╞═════╪═════╪═════╡
165
+ # # │ c ┆ 3 ┆ 1.0 │
166
+ # # │ a ┆ 2 ┆ 4.0 │
167
+ # # │ b ┆ 5 ┆ 3.0 │
168
+ # # └─────┴─────┴─────┘
169
+ #
170
+ # @example Or use positional arguments to compute multiple aggregations in the same way.
171
+ # df.group_by("a").agg(
172
+ # Polars.sum("b").name.suffix("_sum"),
173
+ # (Polars.col("c") ** 2).mean.name.suffix("_mean_squared")
127
174
  # )
128
175
  # # =>
129
- # # shape: (2, 3)
130
- # # ┌─────┬─────────┬──────────────┐
131
- # # │ foo ┆ bar_sum ┆ bar_tail_sum │
132
- # # │ --- ┆ --- ┆ --- │
133
- # # │ str ┆ i64 ┆ i64 │
134
- # # ╞═════╪═════════╪══════════════╡
135
- # # │ one ┆ 9 ┆ 9 │
136
- # # │ two ┆ 6 ┆ 5 │
137
- # # └─────┴─────────┴──────────────┘
138
- def agg(aggs)
176
+ # # shape: (3, 3)
177
+ # # ┌─────┬───────┬────────────────┐
178
+ # # │ a ┆ b_sum ┆ c_mean_squared │
179
+ # # │ --- ┆ --- ┆ --- │
180
+ # # │ str ┆ i64 ┆ f64 │
181
+ # # ╞═════╪═══════╪════════════════╡
182
+ # # │ a ┆ 2 ┆ 17.0 │
183
+ # # │ c ┆ 3 ┆ 1.0 │
184
+ # # │ b ┆ 5 ┆ 10.0 │
185
+ # # └─────┴───────┴────────────────┘
186
+ #
187
+ # @example Use keyword arguments to easily name your expression inputs.
188
+ # df.group_by("a").agg(
189
+ # b_sum: Polars.sum("b"),
190
+ # c_mean_squared: (Polars.col("c") ** 2).mean
191
+ # )
192
+ # # =>
193
+ # # shape: (3, 3)
194
+ # # ┌─────┬───────┬────────────────┐
195
+ # # │ a ┆ b_sum ┆ c_mean_squared │
196
+ # # │ --- ┆ --- ┆ --- │
197
+ # # │ str ┆ i64 ┆ f64 │
198
+ # # ╞═════╪═══════╪════════════════╡
199
+ # # │ a ┆ 2 ┆ 17.0 │
200
+ # # │ c ┆ 3 ┆ 1.0 │
201
+ # # │ b ┆ 5 ┆ 10.0 │
202
+ # # └─────┴───────┴────────────────┘
203
+ def agg(*aggs, **named_aggs)
139
204
  @df.lazy
140
205
  .group_by(@by, maintain_order: @maintain_order)
141
- .agg(aggs)
206
+ .agg(*aggs, **named_aggs)
142
207
  .collect(no_optimization: true)
143
208
  end
144
209
 
@@ -0,0 +1,24 @@
1
+ module Polars
2
+ module IO
3
+ # Read into a DataFrame from Apache Avro format.
4
+ #
5
+ # @param source [Object]
6
+ # Path to a file or a file-like object.
7
+ # @param columns [Object]
8
+ # Columns to select. Accepts a list of column indices (starting at zero) or a list
9
+ # of column names.
10
+ # @param n_rows [Integer]
11
+ # Stop reading from Apache Avro file after reading `n_rows`.
12
+ #
13
+ # @return [DataFrame]
14
+ def read_avro(source, columns: nil, n_rows: nil)
15
+ if Utils.pathlike?(source)
16
+ source = Utils.normalize_filepath(source)
17
+ end
18
+ projection, column_names = Utils.handle_projection_columns(columns)
19
+
20
+ rbdf = RbDataFrame.read_avro(source, column_names, projection, n_rows)
21
+ Utils.wrap_df(rbdf)
22
+ end
23
+ end
24
+ end