polars-df 0.10.0-aarch64-linux → 0.11.0-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -99,5 +99,97 @@ module Polars
99
99
 
100
100
  result
101
101
  end
102
+
103
+ # Create a column of date ranges.
104
+ #
105
+ # @param start [Object]
106
+ # Lower bound of the date range.
107
+ # @param stop [Object]
108
+ # Upper bound of the date range.
109
+ # @param interval [Object]
110
+ # Interval of the range periods, specified using the Polars duration string language (see "Notes" section below).
111
+ # @param closed ["both", "left", "right", "none"]
112
+ # Define which sides of the range are closed (inclusive).
113
+ # @param time_unit [nil, "ns", "us", "ms"]
114
+ # Time unit of the resulting `Datetime` data type.
115
+ # Only takes effect if the output column is of type `Datetime`.
116
+ # @param time_zone [String]
117
+ # Time zone of the resulting `Datetime` data type.
118
+ # Only takes effect if the output column is of type `Datetime`.
119
+ # @param eager [Boolean]
120
+ # Evaluate immediately and return a `Series`.
121
+ # If set to `false` (default), return an expression instead.
122
+ #
123
+ # @return [Object]
124
+ #
125
+ # @note
126
+ # `interval` is created according to the following string language:
127
+ #
128
+ # - 1ns (1 nanosecond)
129
+ # - 1us (1 microsecond)
130
+ # - 1ms (1 millisecond)
131
+ # - 1s (1 second)
132
+ # - 1m (1 minute)
133
+ # - 1h (1 hour)
134
+ # - 1d (1 calendar day)
135
+ # - 1w (1 calendar week)
136
+ # - 1mo (1 calendar month)
137
+ # - 1q (1 calendar quarter)
138
+ # - 1y (1 calendar year)
139
+ #
140
+ # Or combine them:
141
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
142
+ #
143
+ # By "calendar day", we mean the corresponding time on the next day (which may
144
+ # not be 24 hours, due to daylight savings). Similarly for "calendar week",
145
+ # "calendar month", "calendar quarter", and "calendar year".
146
+ #
147
+ # @example
148
+ # df = Polars::DataFrame.new(
149
+ # {
150
+ # "start" => [Date.new(2022, 1, 1), Date.new(2022, 1, 2)],
151
+ # "end" => Date.new(2022, 1, 3)
152
+ # }
153
+ # )
154
+ # df.with_columns(date_range: Polars.date_ranges("start", "end"))
155
+ # # =>
156
+ # # shape: (2, 3)
157
+ # # ┌────────────┬────────────┬─────────────────────────────────┐
158
+ # # │ start ┆ end ┆ date_range │
159
+ # # │ --- ┆ --- ┆ --- │
160
+ # # │ date ┆ date ┆ list[date] │
161
+ # # ╞════════════╪════════════╪═════════════════════════════════╡
162
+ # # │ 2022-01-01 ┆ 2022-01-03 ┆ [2022-01-01, 2022-01-02, 2022-… │
163
+ # # │ 2022-01-02 ┆ 2022-01-03 ┆ [2022-01-02, 2022-01-03] │
164
+ # # └────────────┴────────────┴─────────────────────────────────┘
165
+ def date_ranges(
166
+ start,
167
+ stop,
168
+ interval = "1d",
169
+ closed: "both",
170
+ time_unit: nil,
171
+ time_zone: nil,
172
+ eager: false
173
+ )
174
+ interval = Utils.parse_interval_argument(interval)
175
+ if time_unit.nil? && interval.include?("ns")
176
+ time_unit = "ns"
177
+ end
178
+
179
+ start_rbexpr = Utils.parse_as_expression(start)
180
+ end_rbexpr = Utils.parse_as_expression(stop)
181
+
182
+ result = Utils.wrap_expr(
183
+ Plr.date_ranges(
184
+ start_rbexpr, end_rbexpr, interval, closed, time_unit, time_zone
185
+ )
186
+ )
187
+
188
+ if eager
189
+ return F.select(result).to_series
190
+ end
191
+
192
+ result
193
+ end
102
194
  end
103
195
  end
@@ -0,0 +1,149 @@
1
+ module Polars
2
+ module Functions
3
+ # Generate a datetime range.
4
+ #
5
+ # @param start [Object]
6
+ # Lower bound of the datetime range.
7
+ # @param stop [Object]
8
+ # Upper bound of the datetime range.
9
+ # @param interval [String]
10
+ # Interval of the range periods, specified using the Polars duration string language.
11
+ # @param closed ['both', 'left', 'right', 'none']
12
+ # Define which sides of the range are closed (inclusive).
13
+ # @param time_unit [nil, 'ns', 'us', 'ms']
14
+ # Time unit of the resulting `Datetime` data type.
15
+ # @param time_zone [String]
16
+ # Time zone of the resulting `Datetime` data type.
17
+ # @param eager [Boolean]
18
+ # Evaluate immediately and return a `Series`.
19
+ # If set to `false` (default), return an expression instead.
20
+ #
21
+ # @return [Object]
22
+ #
23
+ # @example Using Polars duration string to specify the interval:
24
+ # Polars.datetime_range(
25
+ # DateTime.new(2022, 1, 1), DateTime.new(2022, 3, 1), "1mo", eager: true
26
+ # ).alias("datetime")
27
+ # # =>
28
+ # # shape: (3,)
29
+ # # Series: 'datetime' [datetime[ns]]
30
+ # # [
31
+ # # 2022-01-01 00:00:00
32
+ # # 2022-02-01 00:00:00
33
+ # # 2022-03-01 00:00:00
34
+ # # ]
35
+ #
36
+ # @example Specifying a time zone:
37
+ # Polars.datetime_range(
38
+ # DateTime.new(2022, 1, 1),
39
+ # DateTime.new(2022, 3, 1),
40
+ # "1mo",
41
+ # time_zone: "America/New_York",
42
+ # eager: true
43
+ # ).alias("datetime")
44
+ # # =>
45
+ # # shape: (3,)
46
+ # # Series: 'datetime' [datetime[ns, America/New_York]]
47
+ # # [
48
+ # # 2022-01-01 00:00:00 EST
49
+ # # 2022-02-01 00:00:00 EST
50
+ # # 2022-03-01 00:00:00 EST
51
+ # # ]
52
+ def datetime_range(
53
+ start,
54
+ stop,
55
+ interval = "1d",
56
+ closed: "both",
57
+ time_unit: nil,
58
+ time_zone: nil,
59
+ eager: false
60
+ )
61
+ interval = Utils.parse_interval_argument(interval)
62
+ if time_unit.nil? && interval.include?("ns")
63
+ time_unit = "ns"
64
+ end
65
+
66
+ start_rbexpr = Utils.parse_as_expression(start)
67
+ end_rbexpr = Utils.parse_as_expression(stop)
68
+ result = Utils.wrap_expr(
69
+ Plr.datetime_range(
70
+ start_rbexpr, end_rbexpr, interval, closed, time_unit, time_zone
71
+ )
72
+ )
73
+
74
+ if eager
75
+ return Polars.select(result).to_series
76
+ end
77
+
78
+ result
79
+ end
80
+
81
+ # Create a column of datetime ranges.
82
+ #
83
+ # @param start [Object]
84
+ # Lower bound of the datetime range.
85
+ # @param stop [Object]
86
+ # Upper bound of the datetime range.
87
+ # @param interval [String]
88
+ # Interval of the range periods, specified using the Polars duration string language.
89
+ # @param closed ['both', 'left', 'right', 'none']
90
+ # Define which sides of the range are closed (inclusive).
91
+ # @param time_unit [nil, 'ns', 'us', 'ms']
92
+ # Time unit of the resulting `Datetime` data type.
93
+ # @param time_zone [String]
94
+ # Time zone of the resulting `Datetime` data type.
95
+ # @param eager [Boolean]
96
+ # Evaluate immediately and return a `Series`.
97
+ # If set to `false` (default), return an expression instead.
98
+ #
99
+ # @return [Object]
100
+ #
101
+ # @example
102
+ # df = Polars::DataFrame.new(
103
+ # {
104
+ # "start" => [DateTime.new(2022, 1, 1), DateTime.new(2022, 1, 2)],
105
+ # "end" => DateTime.new(2022, 1, 3),
106
+ # }
107
+ # )
108
+ # df.select(datetime_range: Polars.datetime_ranges("start", "end"))
109
+ # # =>
110
+ # # shape: (2, 1)
111
+ # # ┌─────────────────────────────────┐
112
+ # # │ datetime_range │
113
+ # # │ --- │
114
+ # # │ list[datetime[ns]] │
115
+ # # ╞═════════════════════════════════╡
116
+ # # │ [2022-01-01 00:00:00, 2022-01-… │
117
+ # # │ [2022-01-02 00:00:00, 2022-01-… │
118
+ # # └─────────────────────────────────┘
119
+ def datetime_ranges(
120
+ start,
121
+ stop,
122
+ interval: "1d",
123
+ closed: "both",
124
+ time_unit: nil,
125
+ time_zone: nil,
126
+ eager: false
127
+ )
128
+ interval = Utils.parse_interval_argument(interval)
129
+ if time_unit.nil? && interval.include?("ns")
130
+ time_unit = "ns"
131
+ end
132
+
133
+ start_rbexpr = Utils.parse_as_expression(start)
134
+ end_rbexpr = Utils.parse_as_expression(stop)
135
+
136
+ result = Utils.wrap_expr(
137
+ Plr.datetime_ranges(
138
+ start_rbexpr, end_rbexpr, interval, closed, time_unit, time_zone
139
+ )
140
+ )
141
+
142
+ if eager
143
+ return Polars.select(result).to_series
144
+ end
145
+
146
+ result
147
+ end
148
+ end
149
+ end
@@ -0,0 +1,141 @@
1
+ module Polars
2
+ module Functions
3
+ # Generate a time range.
4
+ #
5
+ # @param start [Object]
6
+ # Lower bound of the time range.
7
+ # @param stop [Object]
8
+ # Upper bound of the time range.
9
+ # @param interval [String]
10
+ # Interval of the range periods, specified using the Polars duration string language.
11
+ # @param closed ['both', 'left', 'right', 'none']
12
+ # Define which sides of the range are closed (inclusive).
13
+ # @param eager [Boolean]
14
+ # Evaluate immediately and return a `Series`.
15
+ # If set to `false` (default), return an expression instead.
16
+ #
17
+ # @return [Object]
18
+ #
19
+ # @example
20
+ # Polars.time_range(
21
+ # time(14, 0),
22
+ # nil,
23
+ # "3h15m",
24
+ # eager: true
25
+ # ).alias("time")
26
+ # # =>
27
+ # # shape: (4,)
28
+ # # Series: 'time' [time]
29
+ # # [
30
+ # # 14:00:00
31
+ # # 17:15:00
32
+ # # 20:30:00
33
+ # # 23:45:00
34
+ # # ]
35
+ def time_range(
36
+ start = nil,
37
+ stop = nil,
38
+ interval = "1h",
39
+ closed: "both",
40
+ eager: false
41
+ )
42
+ interval = Utils.parse_interval_argument(interval)
43
+ ["y", "mo", "w", "d"].each do |unit|
44
+ if interval.include?(unit)
45
+ msg = "invalid interval unit for time_range: found #{unit.inspect}"
46
+ raise ArgumentError, msg
47
+ end
48
+ end
49
+
50
+ if start.nil?
51
+ # start = time(0, 0, 0)
52
+ raise Todo
53
+ end
54
+ if stop.nil?
55
+ # stop = time(23, 59, 59, 999999)
56
+ raise Todo
57
+ end
58
+
59
+ start_rbexpr = Utils.parse_as_expression(start)
60
+ end_rbexpr = Utils.parse_as_expression(stop)
61
+
62
+ result = Utils.wrap_expr(Plr.time_range(start_rbexpr, end_rbexpr, interval, closed))
63
+
64
+ if eager
65
+ return Polars.select(result).to_series
66
+ end
67
+
68
+ result
69
+ end
70
+
71
+ # Create a column of time ranges.
72
+ #
73
+ # @param start [Object]
74
+ # Lower bound of the time range.
75
+ # @param stop [Object]
76
+ # Upper bound of the time range.
77
+ # @param interval [String]
78
+ # Interval of the range periods, specified using the Polars duration string language.
79
+ # @param closed ['both', 'left', 'right', 'none']
80
+ # Define which sides of the range are closed (inclusive).
81
+ # @param eager [Boolean]
82
+ # Evaluate immediately and return a `Series`.
83
+ # If set to `false` (default), return an expression instead.
84
+ #
85
+ # @return [Object]
86
+ #
87
+ # @example
88
+ # df = Polars::DataFrame.new(
89
+ # {
90
+ # "start" => [time(9, 0), time(10, 0)],
91
+ # "end" => time(11, 0)
92
+ # }
93
+ # )
94
+ # df.with_columns(time_range: Polars.time_ranges("start", "end"))
95
+ # # =>
96
+ # # shape: (2, 3)
97
+ # # ┌──────────┬──────────┬────────────────────────────────┐
98
+ # # │ start ┆ end ┆ time_range │
99
+ # # │ --- ┆ --- ┆ --- │
100
+ # # │ time ┆ time ┆ list[time] │
101
+ # # ╞══════════╪══════════╪════════════════════════════════╡
102
+ # # │ 09:00:00 ┆ 11:00:00 ┆ [09:00:00, 10:00:00, 11:00:00] │
103
+ # # │ 10:00:00 ┆ 11:00:00 ┆ [10:00:00, 11:00:00] │
104
+ # # └──────────┴──────────┴────────────────────────────────┘
105
+ def time_ranges(
106
+ start = nil,
107
+ stop = nil,
108
+ interval = "1h",
109
+ closed: "both",
110
+ eager: false
111
+ )
112
+ interval = Utils.parse_interval_argument(interval)
113
+ ["y", "mo", "w", "d"].each do |unit|
114
+ if interval.include?(unit)
115
+ msg = "invalid interval unit for time_range: found #{unit.inspect}"
116
+ raise ArgumentError, msg
117
+ end
118
+ end
119
+
120
+ if start.nil?
121
+ # start = time(0, 0, 0)
122
+ raise Todo
123
+ end
124
+ if stop.nil?
125
+ # stop = time(23, 59, 59, 999999)
126
+ raise Todo
127
+ end
128
+
129
+ start_rbexpr = Utils.parse_as_expression(start)
130
+ end_rbexpr = Utils.parse_as_expression(stop)
131
+
132
+ result = Utils.wrap_expr(Plr.time_ranges(start_rbexpr, end_rbexpr, interval, closed))
133
+
134
+ if eager
135
+ return Polars.select(result).to_series
136
+ end
137
+
138
+ result
139
+ end
140
+ end
141
+ end
@@ -106,39 +106,104 @@ module Polars
106
106
  # _dataframe_class._from_rbdf(_df.group_by_apply(by, f))
107
107
  # end
108
108
 
109
- # Use multiple aggregations on columns.
109
+ # Compute aggregations for each group of a group by operation.
110
110
  #
111
- # This can be combined with complete lazy API and is considered idiomatic polars.
112
- #
113
- # @param aggs [Object]
114
- # Single / multiple aggregation expression(s).
111
+ # @param aggs [Array]
112
+ # Aggregations to compute for each group of the group by operation,
113
+ # specified as positional arguments.
114
+ # Accepts expression input. Strings are parsed as column names.
115
+ # @param named_aggs [Hash]
116
+ # Additional aggregations, specified as keyword arguments.
117
+ # The resulting columns will be renamed to the keyword used.
115
118
  #
116
119
  # @return [DataFrame]
117
120
  #
118
- # @example
121
+ # @example Compute the aggregation of the columns for each group.
119
122
  # df = Polars::DataFrame.new(
120
- # {"foo" => ["one", "two", "two", "one", "two"], "bar" => [5, 3, 2, 4, 1]}
123
+ # {
124
+ # "a" => ["a", "b", "a", "b", "c"],
125
+ # "b" => [1, 2, 1, 3, 3],
126
+ # "c" => [5, 4, 3, 2, 1]
127
+ # }
121
128
  # )
122
- # df.group_by("foo", maintain_order: true).agg(
123
- # [
124
- # Polars.sum("bar").suffix("_sum"),
125
- # Polars.col("bar").sort.tail(2).sum.suffix("_tail_sum")
126
- # ]
129
+ # df.group_by("a").agg(Polars.col("b"), Polars.col("c"))
130
+ # # =>
131
+ # # shape: (3, 3)
132
+ # # ┌─────┬───────────┬───────────┐
133
+ # # │ a ┆ b ┆ c │
134
+ # # │ --- ┆ --- ┆ --- │
135
+ # # │ str ┆ list[i64] ┆ list[i64] │
136
+ # # ╞═════╪═══════════╪═══════════╡
137
+ # # │ a ┆ [1, 1] ┆ [5, 3] │
138
+ # # │ b ┆ [2, 3] ┆ [4, 2] │
139
+ # # │ c ┆ [3] ┆ [1] │
140
+ # # └─────┴───────────┴───────────┘
141
+ #
142
+ # @example Compute the sum of a column for each group.
143
+ # df.group_by("a").agg(Polars.col("b").sum)
144
+ # # =>
145
+ # # shape: (3, 2)
146
+ # # ┌─────┬─────┐
147
+ # # │ a ┆ b │
148
+ # # │ --- ┆ --- │
149
+ # # │ str ┆ i64 │
150
+ # # ╞═════╪═════╡
151
+ # # │ a ┆ 2 │
152
+ # # │ b ┆ 5 │
153
+ # # │ c ┆ 3 │
154
+ # # └─────┴─────┘
155
+ #
156
+ # @example Compute multiple aggregates at once by passing a list of expressions.
157
+ # df.group_by("a").agg([Polars.sum("b"), Polars.mean("c")])
158
+ # # =>
159
+ # # shape: (3, 3)
160
+ # # ┌─────┬─────┬─────┐
161
+ # # │ a ┆ b ┆ c │
162
+ # # │ --- ┆ --- ┆ --- │
163
+ # # │ str ┆ i64 ┆ f64 │
164
+ # # ╞═════╪═════╪═════╡
165
+ # # │ c ┆ 3 ┆ 1.0 │
166
+ # # │ a ┆ 2 ┆ 4.0 │
167
+ # # │ b ┆ 5 ┆ 3.0 │
168
+ # # └─────┴─────┴─────┘
169
+ #
170
+ # @example Or use positional arguments to compute multiple aggregations in the same way.
171
+ # df.group_by("a").agg(
172
+ # Polars.sum("b").name.suffix("_sum"),
173
+ # (Polars.col("c") ** 2).mean.name.suffix("_mean_squared")
127
174
  # )
128
175
  # # =>
129
- # # shape: (2, 3)
130
- # # ┌─────┬─────────┬──────────────┐
131
- # # │ foo ┆ bar_sum ┆ bar_tail_sum │
132
- # # │ --- ┆ --- ┆ --- │
133
- # # │ str ┆ i64 ┆ i64 │
134
- # # ╞═════╪═════════╪══════════════╡
135
- # # │ one ┆ 9 ┆ 9 │
136
- # # │ two ┆ 6 ┆ 5 │
137
- # # └─────┴─────────┴──────────────┘
138
- def agg(aggs)
176
+ # # shape: (3, 3)
177
+ # # ┌─────┬───────┬────────────────┐
178
+ # # │ a ┆ b_sum ┆ c_mean_squared │
179
+ # # │ --- ┆ --- ┆ --- │
180
+ # # │ str ┆ i64 ┆ f64 │
181
+ # # ╞═════╪═══════╪════════════════╡
182
+ # # │ a ┆ 2 ┆ 17.0 │
183
+ # # │ c ┆ 3 ┆ 1.0 │
184
+ # # │ b ┆ 5 ┆ 10.0 │
185
+ # # └─────┴───────┴────────────────┘
186
+ #
187
+ # @example Use keyword arguments to easily name your expression inputs.
188
+ # df.group_by("a").agg(
189
+ # b_sum: Polars.sum("b"),
190
+ # c_mean_squared: (Polars.col("c") ** 2).mean
191
+ # )
192
+ # # =>
193
+ # # shape: (3, 3)
194
+ # # ┌─────┬───────┬────────────────┐
195
+ # # │ a ┆ b_sum ┆ c_mean_squared │
196
+ # # │ --- ┆ --- ┆ --- │
197
+ # # │ str ┆ i64 ┆ f64 │
198
+ # # ╞═════╪═══════╪════════════════╡
199
+ # # │ a ┆ 2 ┆ 17.0 │
200
+ # # │ c ┆ 3 ┆ 1.0 │
201
+ # # │ b ┆ 5 ┆ 10.0 │
202
+ # # └─────┴───────┴────────────────┘
203
+ def agg(*aggs, **named_aggs)
139
204
  @df.lazy
140
205
  .group_by(@by, maintain_order: @maintain_order)
141
- .agg(aggs)
206
+ .agg(*aggs, **named_aggs)
142
207
  .collect(no_optimization: true)
143
208
  end
144
209
 
@@ -0,0 +1,24 @@
1
+ module Polars
2
+ module IO
3
+ # Read into a DataFrame from Apache Avro format.
4
+ #
5
+ # @param source [Object]
6
+ # Path to a file or a file-like object.
7
+ # @param columns [Object]
8
+ # Columns to select. Accepts a list of column indices (starting at zero) or a list
9
+ # of column names.
10
+ # @param n_rows [Integer]
11
+ # Stop reading from Apache Avro file after reading `n_rows` rows.
12
+ #
13
+ # @return [DataFrame]
14
+ def read_avro(source, columns: nil, n_rows: nil)
15
+ if Utils.pathlike?(source)
16
+ source = Utils.normalize_filepath(source)
17
+ end
18
+ projection, column_names = Utils.handle_projection_columns(columns)
19
+
20
+ rbdf = RbDataFrame.read_avro(source, column_names, projection, n_rows)
21
+ Utils.wrap_df(rbdf)
22
+ end
23
+ end
24
+ end