polars-df 0.10.0-x86_64-linux → 0.11.0-x86_64-linux

Sign up to get free protection for your applications and to get access to all the features.
@@ -99,5 +99,97 @@ module Polars
99
99
 
100
100
  result
101
101
  end
102
+
103
+ # Create a column of date ranges.
104
+ #
105
+ # @param start [Object]
106
+ # Lower bound of the date range.
107
+ # @param stop [Object]
108
+ # Upper bound of the date range.
109
+ # @param interval [Object]
110
+ # Interval of the range periods, specified using the Polars duration string language (see "Notes" section below).
111
+ # @param closed ["both", "left", "right", "none"]
112
+ # Define which sides of the range are closed (inclusive).
113
+ # @param time_unit [nil, "ns", "us", "ms"]
114
+ # Time unit of the resulting `Datetime` data type.
115
+ # Only takes effect if the output column is of type `Datetime`.
116
+ # @param time_zone [String]
117
+ # Time zone of the resulting `Datetime` data type.
118
+ # Only takes effect if the output column is of type `Datetime`.
119
+ # @param eager [Boolean]
120
+ # Evaluate immediately and return a `Series`.
121
+ # If set to `false` (default), return an expression instead.
122
+ #
123
+ # @return [Object]
124
+ #
125
+ # @note
126
+ # `interval` is created according to the following string language:
127
+ #
128
+ # - 1ns (1 nanosecond)
129
+ # - 1us (1 microsecond)
130
+ # - 1ms (1 millisecond)
131
+ # - 1s (1 second)
132
+ # - 1m (1 minute)
133
+ # - 1h (1 hour)
134
+ # - 1d (1 calendar day)
135
+ # - 1w (1 calendar week)
136
+ # - 1mo (1 calendar month)
137
+ # - 1q (1 calendar quarter)
138
+ # - 1y (1 calendar year)
139
+ #
140
+ # Or combine them:
141
+ # "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
142
+ #
143
+ # By "calendar day", we mean the corresponding time on the next day (which may
144
+ # not be 24 hours, due to daylight savings). Similarly for "calendar week",
145
+ # "calendar month", "calendar quarter", and "calendar year".
146
+ #
147
+ # @example
148
+ # df = Polars::DataFrame.new(
149
+ # {
150
+ # "start" => [Date.new(2022, 1, 1), Date.new(2022, 1, 2)],
151
+ # "end" => Date.new(2022, 1, 3)
152
+ # }
153
+ # )
154
+ # df.with_columns(date_range: Polars.date_ranges("start", "end"))
155
+ # # =>
156
+ # # shape: (2, 3)
157
+ # # ┌────────────┬────────────┬─────────────────────────────────┐
158
+ # # │ start ┆ end ┆ date_range │
159
+ # # │ --- ┆ --- ┆ --- │
160
+ # # │ date ┆ date ┆ list[date] │
161
+ # # ╞════════════╪════════════╪═════════════════════════════════╡
162
+ # # │ 2022-01-01 ┆ 2022-01-03 ┆ [2022-01-01, 2022-01-02, 2022-… │
163
+ # # │ 2022-01-02 ┆ 2022-01-03 ┆ [2022-01-02, 2022-01-03] │
164
+ # # └────────────┴────────────┴─────────────────────────────────┘
165
+ def date_ranges(
166
+ start,
167
+ stop,
168
+ interval = "1d",
169
+ closed: "both",
170
+ time_unit: nil,
171
+ time_zone: nil,
172
+ eager: false
173
+ )
174
+ interval = Utils.parse_interval_argument(interval) # normalized by the Utils helper; queried with include? below
175
+ if time_unit.nil? && interval.include?("ns") # caller gave no unit, but the interval mentions "ns"
176
+ time_unit = "ns"
177
+ end
178
+
179
+ start_rbexpr = Utils.parse_as_expression(start) # both bounds go through Utils.parse_as_expression before reaching Plr
180
+ end_rbexpr = Utils.parse_as_expression(stop)
181
+
182
+ result = Utils.wrap_expr(
183
+ Plr.date_ranges(
184
+ start_rbexpr, end_rbexpr, interval, closed, time_unit, time_zone
185
+ )
186
+ )
187
+
188
+ if eager # eager: evaluate immediately and hand back the resulting Series
189
+ return F.select(result).to_series # NOTE(review): uses F.select while datetime_range/time_range use Polars.select — confirm F is in scope here
190
+ end
191
+
192
+ result # default (eager: false): return the wrapped expression
193
+ end
102
194
  end
103
195
  end
@@ -0,0 +1,149 @@
1
+ module Polars
2
+ module Functions
3
+ # Generate a datetime range.
4
+ #
5
+ # @param start [Object]
6
+ # Lower bound of the datetime range.
7
+ # @param stop [Object]
8
+ # Upper bound of the datetime range.
9
+ # @param interval [String]
10
+ # Interval of the range periods, specified using the Polars duration string language.
11
+ # @param closed ['both', 'left', 'right', 'none']
12
+ # Define which sides of the range are closed (inclusive).
13
+ # @param time_unit [nil, 'ns', 'us', 'ms']
14
+ # Time unit of the resulting `Datetime` data type.
15
+ # @param time_zone [String]
16
+ # Time zone of the resulting `Datetime` data type.
17
+ # @param eager [Boolean]
18
+ # Evaluate immediately and return a `Series`.
19
+ # If set to `false` (default), return an expression instead.
20
+ #
21
+ # @return [Object]
22
+ #
23
+ # @example Using Polars duration string to specify the interval:
24
+ # Polars.datetime_range(
25
+ # DateTime.new(2022, 1, 1), DateTime.new(2022, 3, 1), "1mo", eager: true
26
+ # ).alias("datetime")
27
+ # # =>
28
+ # # shape: (3,)
29
+ # # Series: 'datetime' [datetime[ns]]
30
+ # # [
31
+ # # 2022-01-01 00:00:00
32
+ # # 2022-02-01 00:00:00
33
+ # # 2022-03-01 00:00:00
34
+ # # ]
35
+ #
36
+ # @example Specifying a time zone:
37
+ # Polars.datetime_range(
38
+ # DateTime.new(2022, 1, 1),
39
+ # DateTime.new(2022, 3, 1),
40
+ # "1mo",
41
+ # time_zone: "America/New_York",
42
+ # eager: true
43
+ # ).alias("datetime")
44
+ # # =>
45
+ # # shape: (3,)
46
+ # # Series: 'datetime' [datetime[ns, America/New_York]]
47
+ # # [
48
+ # # 2022-01-01 00:00:00 EST
49
+ # # 2022-02-01 00:00:00 EST
50
+ # # 2022-03-01 00:00:00 EST
51
+ # # ]
52
+ def datetime_range(
53
+ start,
54
+ stop,
55
+ interval = "1d",
56
+ closed: "both",
57
+ time_unit: nil,
58
+ time_zone: nil,
59
+ eager: false
60
+ )
61
+ interval = Utils.parse_interval_argument(interval) # normalized by the Utils helper; queried with include? below
62
+ if time_unit.nil? && interval.include?("ns") # caller gave no unit, but the interval mentions "ns"
63
+ time_unit = "ns"
64
+ end
65
+
66
+ start_rbexpr = Utils.parse_as_expression(start) # both bounds go through Utils.parse_as_expression before reaching Plr
67
+ end_rbexpr = Utils.parse_as_expression(stop)
68
+ result = Utils.wrap_expr(
69
+ Plr.datetime_range(
70
+ start_rbexpr, end_rbexpr, interval, closed, time_unit, time_zone
71
+ )
72
+ )
73
+
74
+ if eager # eager: evaluate immediately and hand back the resulting Series
75
+ return Polars.select(result).to_series
76
+ end
77
+
78
+ result # default (eager: false): return the wrapped expression
79
+ end
80
+
81
+ # Create a column of datetime ranges.
82
+ #
83
+ # @param start [Object]
84
+ # Lower bound of the datetime range.
85
+ # @param stop [Object]
86
+ # Upper bound of the datetime range.
87
+ # @param interval [String]
88
+ # Interval of the range periods, specified using the Polars duration string language.
89
+ # @param closed ['both', 'left', 'right', 'none']
90
+ # Define which sides of the range are closed (inclusive).
91
+ # @param time_unit [nil, 'ns', 'us', 'ms']
92
+ # Time unit of the resulting `Datetime` data type.
93
+ # @param time_zone [String]
94
+ # Time zone of the resulting `Datetime` data type.
95
+ # @param eager [Boolean]
96
+ # Evaluate immediately and return a `Series`.
97
+ # If set to `false` (default), return an expression instead.
98
+ #
99
+ # @return [Object]
100
+ #
101
+ # @example
102
+ # df = Polars::DataFrame.new(
103
+ # {
104
+ # "start" => [DateTime.new(2022, 1, 1), DateTime.new(2022, 1, 2)],
105
+ # "end" => DateTime.new(2022, 1, 3),
106
+ # }
107
+ # )
108
+ # df.select(datetime_range: Polars.datetime_ranges("start", "end"))
109
+ # # =>
110
+ # # shape: (2, 1)
111
+ # # ┌─────────────────────────────────┐
112
+ # # │ datetime_range │
113
+ # # │ --- │
114
+ # # │ list[datetime[ns]] │
115
+ # # ╞═════════════════════════════════╡
116
+ # # │ [2022-01-01 00:00:00, 2022-01-… │
117
+ # # │ [2022-01-02 00:00:00, 2022-01-… │
118
+ # # └─────────────────────────────────┘
119
+ def datetime_ranges(
120
+ start,
121
+ stop,
122
+ interval: "1d", # NOTE(review): keyword argument here, but positional in date_ranges/datetime_range — confirm this is intended
123
+ closed: "both",
124
+ time_unit: nil,
125
+ time_zone: nil,
126
+ eager: false
127
+ )
128
+ interval = Utils.parse_interval_argument(interval) # normalized by the Utils helper; queried with include? below
129
+ if time_unit.nil? && interval.include?("ns") # caller gave no unit, but the interval mentions "ns"
130
+ time_unit = "ns"
131
+ end
132
+
133
+ start_rbexpr = Utils.parse_as_expression(start) # both bounds go through Utils.parse_as_expression before reaching Plr
134
+ end_rbexpr = Utils.parse_as_expression(stop)
135
+
136
+ result = Utils.wrap_expr(
137
+ Plr.datetime_ranges(
138
+ start_rbexpr, end_rbexpr, interval, closed, time_unit, time_zone
139
+ )
140
+ )
141
+
142
+ if eager # eager: evaluate immediately and hand back the resulting Series
143
+ return Polars.select(result).to_series
144
+ end
145
+
146
+ result # default (eager: false): return the wrapped expression
147
+ end
148
+ end
149
+ end
@@ -0,0 +1,141 @@
1
+ module Polars
2
+ module Functions
3
+ # Generate a time range.
4
+ #
5
+ # @param start [Object]
6
+ # Lower bound of the time range.
7
+ # @param stop [Object]
8
+ # Upper bound of the time range.
9
+ # @param interval [String]
10
+ # Interval of the range periods, specified using the Polars duration string language.
11
+ # @param closed ['both', 'left', 'right', 'none']
12
+ # Define which sides of the range are closed (inclusive).
13
+ # @param eager [Boolean]
14
+ # Evaluate immediately and return a `Series`.
15
+ # If set to `false` (default), return an expression instead.
16
+ #
17
+ # @return [Object]
18
+ #
19
+ # @example
20
+ # Polars.time_range(
21
+ # time(14, 0),
22
+ # nil,
23
+ # "3h15m",
24
+ # eager: true
25
+ # ).alias("time")
26
+ # # =>
27
+ # # shape: (4,)
28
+ # # Series: 'time' [time]
29
+ # # [
30
+ # # 14:00:00
31
+ # # 17:15:00
32
+ # # 20:30:00
33
+ # # 23:45:00
34
+ # # ]
35
+ def time_range(
36
+ start = nil,
37
+ stop = nil,
38
+ interval = "1h",
39
+ closed: "both",
40
+ eager: false
41
+ )
42
+ interval = Utils.parse_interval_argument(interval) # normalized by the Utils helper; queried with include? below
43
+ ["y", "mo", "w", "d"].each do |unit| # substring check: any of these unit markers in the interval is rejected
44
+ if interval.include?(unit)
45
+ msg = "invalid interval unit for time_range: found #{unit.inspect}"
46
+ raise ArgumentError, msg
47
+ end
48
+ end
49
+
50
+ if start.nil?
51
+ # Intended default, not implemented yet: start = time(0, 0, 0)
52
+ raise Todo # a nil start (the parameter default) currently always raises; NOTE(review): Todo is defined outside this view
53
+ end
54
+ if stop.nil?
55
+ # Intended default, not implemented yet: stop = time(23, 59, 59, 999999)
56
+ raise Todo # a nil stop (the parameter default) currently always raises
57
+ end
58
+
59
+ start_rbexpr = Utils.parse_as_expression(start) # both bounds go through Utils.parse_as_expression before reaching Plr
60
+ end_rbexpr = Utils.parse_as_expression(stop)
61
+
62
+ result = Utils.wrap_expr(Plr.time_range(start_rbexpr, end_rbexpr, interval, closed))
63
+
64
+ if eager # eager: evaluate immediately and hand back the resulting Series
65
+ return Polars.select(result).to_series
66
+ end
67
+
68
+ result # default (eager: false): return the wrapped expression
69
+ end
70
+
71
+ # Create a column of time ranges.
72
+ #
73
+ # @param start [Object]
74
+ # Lower bound of the time range.
75
+ # @param stop [Object]
76
+ # Upper bound of the time range.
77
+ # @param interval [String]
78
+ # Interval of the range periods, specified using the Polars duration string language.
79
+ # @param closed ['both', 'left', 'right', 'none']
80
+ # Define which sides of the range are closed (inclusive).
81
+ # @param eager [Boolean]
82
+ # Evaluate immediately and return a `Series`.
83
+ # If set to `false` (default), return an expression instead.
84
+ #
85
+ # @return [Object]
86
+ #
87
+ # @example
88
+ # df = Polars::DataFrame.new(
89
+ # {
90
+ # "start" => [time(9, 0), time(10, 0)],
91
+ # "end" => time(11, 0)
92
+ # }
93
+ # )
94
+ # df.with_columns(time_range: Polars.time_ranges("start", "end"))
95
+ # # =>
96
+ # # shape: (2, 3)
97
+ # # ┌──────────┬──────────┬────────────────────────────────┐
98
+ # # │ start ┆ end ┆ time_range │
99
+ # # │ --- ┆ --- ┆ --- │
100
+ # # │ time ┆ time ┆ list[time] │
101
+ # # ╞══════════╪══════════╪════════════════════════════════╡
102
+ # # │ 09:00:00 ┆ 11:00:00 ┆ [09:00:00, 10:00:00, 11:00:00] │
103
+ # # │ 10:00:00 ┆ 11:00:00 ┆ [10:00:00, 11:00:00] │
104
+ # # └──────────┴──────────┴────────────────────────────────┘
105
+ def time_ranges(
106
+ start = nil,
107
+ stop = nil,
108
+ interval = "1h",
109
+ closed: "both",
110
+ eager: false
111
+ )
112
+ interval = Utils.parse_interval_argument(interval) # normalized by the Utils helper; queried with include? below
113
+ ["y", "mo", "w", "d"].each do |unit| # substring check: any of these unit markers in the interval is rejected
114
+ if interval.include?(unit)
115
+ msg = "invalid interval unit for time_range: found #{unit.inspect}"
116
+ raise ArgumentError, msg # NOTE(review): msg names "time_range" although this method is time_ranges — confirm whether intended
117
+ end
118
+ end
119
+
120
+ if start.nil?
121
+ # Intended default, not implemented yet: start = time(0, 0, 0)
122
+ raise Todo # a nil start (the parameter default) currently always raises; NOTE(review): Todo is defined outside this view
123
+ end
124
+ if stop.nil?
125
+ # Intended default, not implemented yet: stop = time(23, 59, 59, 999999)
126
+ raise Todo # a nil stop (the parameter default) currently always raises
127
+ end
128
+
129
+ start_rbexpr = Utils.parse_as_expression(start) # both bounds go through Utils.parse_as_expression before reaching Plr
130
+ end_rbexpr = Utils.parse_as_expression(stop)
131
+
132
+ result = Utils.wrap_expr(Plr.time_ranges(start_rbexpr, end_rbexpr, interval, closed))
133
+
134
+ if eager # eager: evaluate immediately and hand back the resulting Series
135
+ return Polars.select(result).to_series
136
+ end
137
+
138
+ result # default (eager: false): return the wrapped expression
139
+ end
140
+ end
141
+ end
@@ -106,39 +106,104 @@ module Polars
106
106
  # _dataframe_class._from_rbdf(_df.group_by_apply(by, f))
107
107
  # end
108
108
 
109
- # Use multiple aggregations on columns.
109
+ # Compute aggregations for each group of a group by operation.
110
110
  #
111
- # This can be combined with complete lazy API and is considered idiomatic polars.
112
- #
113
- # @param aggs [Object]
114
- # Single / multiple aggregation expression(s).
111
+ # @param aggs [Array]
112
+ # Aggregations to compute for each group of the group by operation,
113
+ # specified as positional arguments.
114
+ # Accepts expression input. Strings are parsed as column names.
115
+ # @param named_aggs [Hash]
116
+ # Additional aggregations, specified as keyword arguments.
117
+ # The resulting columns will be renamed to the keyword used.
115
118
  #
116
119
  # @return [DataFrame]
117
120
  #
118
- # @example
121
+ # @example Compute the aggregation of the columns for each group.
119
122
  # df = Polars::DataFrame.new(
120
- # {"foo" => ["one", "two", "two", "one", "two"], "bar" => [5, 3, 2, 4, 1]}
123
+ # {
124
+ # "a" => ["a", "b", "a", "b", "c"],
125
+ # "b" => [1, 2, 1, 3, 3],
126
+ # "c" => [5, 4, 3, 2, 1]
127
+ # }
121
128
  # )
122
- # df.group_by("foo", maintain_order: true).agg(
123
- # [
124
- # Polars.sum("bar").suffix("_sum"),
125
- # Polars.col("bar").sort.tail(2).sum.suffix("_tail_sum")
126
- # ]
129
+ # df.group_by("a").agg(Polars.col("b"), Polars.col("c"))
130
+ # # =>
131
+ # # shape: (3, 3)
132
+ # # ┌─────┬───────────┬───────────┐
133
+ # # │ a ┆ b ┆ c │
134
+ # # │ --- ┆ --- ┆ --- │
135
+ # # │ str ┆ list[i64] ┆ list[i64] │
136
+ # # ╞═════╪═══════════╪═══════════╡
137
+ # # │ a ┆ [1, 1] ┆ [5, 3] │
138
+ # # │ b ┆ [2, 3] ┆ [4, 2] │
139
+ # # │ c ┆ [3] ┆ [1] │
140
+ # # └─────┴───────────┴───────────┘
141
+ #
142
+ # @example Compute the sum of a column for each group.
143
+ # df.group_by("a").agg(Polars.col("b").sum)
144
+ # # =>
145
+ # # shape: (3, 2)
146
+ # # ┌─────┬─────┐
147
+ # # │ a ┆ b │
148
+ # # │ --- ┆ --- │
149
+ # # │ str ┆ i64 │
150
+ # # ╞═════╪═════╡
151
+ # # │ a ┆ 2 │
152
+ # # │ b ┆ 5 │
153
+ # # │ c ┆ 3 │
154
+ # # └─────┴─────┘
155
+ #
156
+ # @example Compute multiple aggregates at once by passing a list of expressions.
157
+ # df.group_by("a").agg([Polars.sum("b"), Polars.mean("c")])
158
+ # # =>
159
+ # # shape: (3, 3)
160
+ # # ┌─────┬─────┬─────┐
161
+ # # │ a ┆ b ┆ c │
162
+ # # │ --- ┆ --- ┆ --- │
163
+ # # │ str ┆ i64 ┆ f64 │
164
+ # # ╞═════╪═════╪═════╡
165
+ # # │ c ┆ 3 ┆ 1.0 │
166
+ # # │ a ┆ 2 ┆ 4.0 │
167
+ # # │ b ┆ 5 ┆ 3.0 │
168
+ # # └─────┴─────┴─────┘
169
+ #
170
+ # @example Or use positional arguments to compute multiple aggregations in the same way.
171
+ # df.group_by("a").agg(
172
+ # Polars.sum("b").name.suffix("_sum"),
173
+ # (Polars.col("c") ** 2).mean.name.suffix("_mean_squared")
127
174
  # )
128
175
  # # =>
129
- # # shape: (2, 3)
130
- # # ┌─────┬─────────┬──────────────┐
131
- # # │ foo ┆ bar_sum ┆ bar_tail_sum │
132
- # # │ --- ┆ --- ┆ --- │
133
- # # │ str ┆ i64 ┆ i64 │
134
- # # ╞═════╪═════════╪══════════════╡
135
- # # │ one ┆ 9 ┆ 9 │
136
- # # │ two ┆ 6 ┆ 5 │
137
- # # └─────┴─────────┴──────────────┘
138
- def agg(aggs)
176
+ # # shape: (3, 3)
177
+ # # ┌─────┬───────┬────────────────┐
178
+ # # │ a ┆ b_sum ┆ c_mean_squared │
179
+ # # │ --- ┆ --- ┆ --- │
180
+ # # │ str ┆ i64 ┆ f64 │
181
+ # # ╞═════╪═══════╪════════════════╡
182
+ # # │ a ┆ 2 ┆ 17.0 │
183
+ # # │ c ┆ 3 ┆ 1.0 │
184
+ # # │ b ┆ 5 ┆ 10.0 │
185
+ # # └─────┴───────┴────────────────┘
186
+ #
187
+ # @example Use keyword arguments to easily name your expression inputs.
188
+ # df.group_by("a").agg(
189
+ # b_sum: Polars.sum("b"),
190
+ # c_mean_squared: (Polars.col("c") ** 2).mean
191
+ # )
192
+ # # =>
193
+ # # shape: (3, 3)
194
+ # # ┌─────┬───────┬────────────────┐
195
+ # # │ a ┆ b_sum ┆ c_mean_squared │
196
+ # # │ --- ┆ --- ┆ --- │
197
+ # # │ str ┆ i64 ┆ f64 │
198
+ # # ╞═════╪═══════╪════════════════╡
199
+ # # │ a ┆ 2 ┆ 17.0 │
200
+ # # │ c ┆ 3 ┆ 1.0 │
201
+ # # │ b ┆ 5 ┆ 10.0 │
202
+ # # └─────┴───────┴────────────────┘
203
+ def agg(*aggs, **named_aggs)
139
204
  @df.lazy
140
205
  .group_by(@by, maintain_order: @maintain_order)
141
- .agg(aggs)
206
+ .agg(*aggs, **named_aggs)
142
207
  .collect(no_optimization: true)
143
208
  end
144
209
 
@@ -0,0 +1,24 @@
1
+ module Polars
2
+ module IO
3
+ # Read into a DataFrame from Apache Avro format.
4
+ #
5
+ # @param source [Object]
6
+ # Path to a file or a file-like object.
7
+ # @param columns [Object]
8
+ # Columns to select. Accepts a list of column indices (starting at zero) or a list
9
+ # of column names.
10
+ # @param n_rows [Integer]
11
+ # Stop reading from Apache Avro file after reading `n_rows`.
12
+ #
13
+ # @return [DataFrame]
14
+ def read_avro(source, columns: nil, n_rows: nil)
15
+ if Utils.pathlike?(source) # only path-like sources are normalized; anything else is passed on unchanged
16
+ source = Utils.normalize_filepath(source)
17
+ end
18
+ projection, column_names = Utils.handle_projection_columns(columns) # presumably index-based vs. name-based selection — confirm against the Utils helper
19
+
20
+ rbdf = RbDataFrame.read_avro(source, column_names, projection, n_rows) # note the argument order: column_names precedes projection
21
+ Utils.wrap_df(rbdf) # wrap the native result via Utils.wrap_df (documented return: DataFrame)
22
+ end
23
+ end
24
+ end