polars-df 0.10.0-x86_64-linux → 0.12.0-x86_64-linux

Sign up to get free protection for your applications and to get access to all the features.
Files changed (58) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +27 -0
  3. data/Cargo.lock +392 -351
  4. data/LICENSE-THIRD-PARTY.txt +1125 -865
  5. data/README.md +6 -6
  6. data/lib/polars/3.1/polars.so +0 -0
  7. data/lib/polars/3.2/polars.so +0 -0
  8. data/lib/polars/3.3/polars.so +0 -0
  9. data/lib/polars/array_expr.rb +4 -4
  10. data/lib/polars/batched_csv_reader.rb +11 -5
  11. data/lib/polars/cat_expr.rb +0 -36
  12. data/lib/polars/cat_name_space.rb +0 -37
  13. data/lib/polars/convert.rb +6 -1
  14. data/lib/polars/data_frame.rb +176 -403
  15. data/lib/polars/data_types.rb +1 -1
  16. data/lib/polars/date_time_expr.rb +525 -572
  17. data/lib/polars/date_time_name_space.rb +263 -460
  18. data/lib/polars/dynamic_group_by.rb +5 -5
  19. data/lib/polars/exceptions.rb +7 -0
  20. data/lib/polars/expr.rb +1394 -243
  21. data/lib/polars/expr_dispatch.rb +1 -1
  22. data/lib/polars/functions/aggregation/horizontal.rb +8 -8
  23. data/lib/polars/functions/as_datatype.rb +63 -40
  24. data/lib/polars/functions/lazy.rb +63 -14
  25. data/lib/polars/functions/lit.rb +1 -1
  26. data/lib/polars/functions/range/date_range.rb +90 -57
  27. data/lib/polars/functions/range/datetime_range.rb +149 -0
  28. data/lib/polars/functions/range/int_range.rb +2 -2
  29. data/lib/polars/functions/range/time_range.rb +141 -0
  30. data/lib/polars/functions/repeat.rb +1 -1
  31. data/lib/polars/functions/whenthen.rb +1 -1
  32. data/lib/polars/group_by.rb +88 -23
  33. data/lib/polars/io/avro.rb +24 -0
  34. data/lib/polars/{io.rb → io/csv.rb} +299 -493
  35. data/lib/polars/io/database.rb +73 -0
  36. data/lib/polars/io/ipc.rb +247 -0
  37. data/lib/polars/io/json.rb +29 -0
  38. data/lib/polars/io/ndjson.rb +80 -0
  39. data/lib/polars/io/parquet.rb +227 -0
  40. data/lib/polars/lazy_frame.rb +143 -272
  41. data/lib/polars/lazy_group_by.rb +100 -3
  42. data/lib/polars/list_expr.rb +11 -11
  43. data/lib/polars/list_name_space.rb +5 -1
  44. data/lib/polars/rolling_group_by.rb +7 -9
  45. data/lib/polars/series.rb +103 -187
  46. data/lib/polars/string_expr.rb +78 -102
  47. data/lib/polars/string_name_space.rb +5 -4
  48. data/lib/polars/testing.rb +2 -2
  49. data/lib/polars/utils/constants.rb +9 -0
  50. data/lib/polars/utils/convert.rb +97 -0
  51. data/lib/polars/utils/parse.rb +89 -0
  52. data/lib/polars/utils/various.rb +76 -0
  53. data/lib/polars/utils/wrap.rb +19 -0
  54. data/lib/polars/utils.rb +8 -300
  55. data/lib/polars/version.rb +1 -1
  56. data/lib/polars/whenthen.rb +6 -6
  57. data/lib/polars.rb +20 -1
  58. metadata +17 -4
@@ -0,0 +1,149 @@
1
+ module Polars
2
+ module Functions
3
+ # Generate a datetime range.
4
+ #
5
+ # @param start [Object]
6
+ # Lower bound of the datetime range.
7
+ # @param stop [Object]
8
+ # Upper bound of the datetime range.
9
+ # @param interval [String]
10
+ # Interval of the range periods, specified using the Polars duration string language.
11
+ # @param closed ['both', 'left', 'right', 'none']
12
+ # Define which sides of the range are closed (inclusive).
13
+ # @param time_unit [nil, 'ns', 'us', 'ms']
14
+ # Time unit of the resulting `Datetime` data type.
15
+ # @param time_zone [String]
16
+ # Time zone of the resulting `Datetime` data type.
17
+ # @param eager [Boolean]
18
+ # Evaluate immediately and return a `Series`.
19
+ # If set to `false` (default), return an expression instead.
20
+ #
21
+ # @return [Object]
22
+ #
23
+ # @example Using Polars duration string to specify the interval:
24
+ # Polars.datetime_range(
25
+ # DateTime.new(2022, 1, 1), DateTime.new(2022, 3, 1), "1mo", eager: true
26
+ # ).alias("datetime")
27
+ # # =>
28
+ # # shape: (3,)
29
+ # # Series: 'datetime' [datetime[ns]]
30
+ # # [
31
+ # # 2022-01-01 00:00:00
32
+ # # 2022-02-01 00:00:00
33
+ # # 2022-03-01 00:00:00
34
+ # # ]
35
+ #
36
+ # @example Specifying a time zone:
37
+ # Polars.datetime_range(
38
+ # DateTime.new(2022, 1, 1),
39
+ # DateTime.new(2022, 3, 1),
40
+ # "1mo",
41
+ # time_zone: "America/New_York",
42
+ # eager: true
43
+ # ).alias("datetime")
44
+ # # =>
45
+ # # shape: (3,)
46
+ # # Series: 'datetime' [datetime[ns, America/New_York]]
47
+ # # [
48
+ # # 2022-01-01 00:00:00 EST
49
+ # # 2022-02-01 00:00:00 EST
50
+ # # 2022-03-01 00:00:00 EST
51
+ # # ]
52
+ def datetime_range(
53
+ start,
54
+ stop,
55
+ interval = "1d",
56
+ closed: "both",
57
+ time_unit: nil,
58
+ time_zone: nil,
59
+ eager: false
60
+ )
61
+ interval = Utils.parse_interval_argument(interval)
62
+ if time_unit.nil? && interval.include?("ns")
63
+ time_unit = "ns"
64
+ end
65
+
66
+ start_rbexpr = Utils.parse_into_expression(start)
67
+ end_rbexpr = Utils.parse_into_expression(stop)
68
+ result = Utils.wrap_expr(
69
+ Plr.datetime_range(
70
+ start_rbexpr, end_rbexpr, interval, closed, time_unit, time_zone
71
+ )
72
+ )
73
+
74
+ if eager
75
+ return Polars.select(result).to_series
76
+ end
77
+
78
+ result
79
+ end
80
+
81
+ # Create a column of datetime ranges.
82
+ #
83
+ # @param start [Object]
84
+ # Lower bound of the datetime range.
85
+ # @param stop [Object]
86
+ # Upper bound of the datetime range.
87
+ # @param interval [String]
88
+ # Interval of the range periods, specified using the Polars duration string language.
89
+ # @param closed ['both', 'left', 'right', 'none']
90
+ # Define which sides of the range are closed (inclusive).
91
+ # @param time_unit [nil, 'ns', 'us', 'ms']
92
+ # Time unit of the resulting `Datetime` data type.
93
+ # @param time_zone [String]
94
+ # Time zone of the resulting `Datetime` data type.
95
+ # @param eager [Boolean]
96
+ # Evaluate immediately and return a `Series`.
97
+ # If set to `false` (default), return an expression instead.
98
+ #
99
+ # @return [Object]
100
+ #
101
+ # @example
102
+ # df = Polars::DataFrame.new(
103
+ # {
104
+ # "start" => [DateTime.new(2022, 1, 1), DateTime.new(2022, 1, 2)],
105
+ # "end" => DateTime.new(2022, 1, 3),
106
+ # }
107
+ # )
108
+ # df.select(datetime_range: Polars.datetime_ranges("start", "end"))
109
+ # # =>
110
+ # # shape: (2, 1)
111
+ # # ┌─────────────────────────────────┐
112
+ # # │ datetime_range │
113
+ # # │ --- │
114
+ # # │ list[datetime[ns]] │
115
+ # # ╞═════════════════════════════════╡
116
+ # # │ [2022-01-01 00:00:00, 2022-01-… │
117
+ # # │ [2022-01-02 00:00:00, 2022-01-… │
118
+ # # └─────────────────────────────────┘
119
+ def datetime_ranges(
120
+ start,
121
+ stop,
122
+ interval: "1d",
123
+ closed: "both",
124
+ time_unit: nil,
125
+ time_zone: nil,
126
+ eager: false
127
+ )
128
+ interval = Utils.parse_interval_argument(interval)
129
+ if time_unit.nil? && interval.include?("ns")
130
+ time_unit = "ns"
131
+ end
132
+
133
+ start_rbexpr = Utils.parse_into_expression(start)
134
+ end_rbexpr = Utils.parse_into_expression(stop)
135
+
136
+ result = Utils.wrap_expr(
137
+ Plr.datetime_ranges(
138
+ start_rbexpr, end_rbexpr, interval, closed, time_unit, time_zone
139
+ )
140
+ )
141
+
142
+ if eager
143
+ return Polars.select(result).to_series
144
+ end
145
+
146
+ result
147
+ end
148
+ end
149
+ end
@@ -34,8 +34,8 @@ module Polars
34
34
  start = 0
35
35
  end
36
36
 
37
- start = Utils.parse_as_expression(start)
38
- stop = Utils.parse_as_expression(stop)
37
+ start = Utils.parse_into_expression(start)
38
+ stop = Utils.parse_into_expression(stop)
39
39
  dtype ||= Int64
40
40
  dtype = dtype.to_s if dtype.is_a?(Symbol)
41
41
  result = Utils.wrap_expr(Plr.int_range(start, stop, step, dtype)).alias("arange")
@@ -0,0 +1,141 @@
1
+ module Polars
2
+ module Functions
3
+ # Generate a time range.
4
+ #
5
+ # @param start [Object]
6
+ # Lower bound of the time range.
7
+ # @param stop [Object]
8
+ # Upper bound of the time range.
9
+ # @param interval [String]
10
+ # Interval of the range periods, specified using the Polars duration string language.
11
+ # @param closed ['both', 'left', 'right', 'none']
12
+ # Define which sides of the range are closed (inclusive).
13
+ # @param eager [Boolean]
14
+ # Evaluate immediately and return a `Series`.
15
+ # If set to `false` (default), return an expression instead.
16
+ #
17
+ # @return [Object]
18
+ #
19
+ # @example
20
+ # Polars.time_range(
21
+ # time(14, 0),
22
+ # nil,
23
+ # "3h15m",
24
+ # eager: true
25
+ # ).alias("time")
26
+ # # =>
27
+ # # shape: (4,)
28
+ # # Series: 'time' [time]
29
+ # # [
30
+ # # 14:00:00
31
+ # # 17:15:00
32
+ # # 20:30:00
33
+ # # 23:45:00
34
+ # # ]
35
+ def time_range(
36
+ start = nil,
37
+ stop = nil,
38
+ interval = "1h",
39
+ closed: "both",
40
+ eager: false
41
+ )
42
+ interval = Utils.parse_interval_argument(interval)
43
+ ["y", "mo", "w", "d"].each do |unit|
44
+ if interval.include?(unit)
45
+ msg = "invalid interval unit for time_range: found #{unit.inspect}"
46
+ raise ArgumentError, msg
47
+ end
48
+ end
49
+
50
+ if start.nil?
51
+ # start = time(0, 0, 0)
52
+ raise Todo
53
+ end
54
+ if stop.nil?
55
+ # stop = time(23, 59, 59, 999999)
56
+ raise Todo
57
+ end
58
+
59
+ start_rbexpr = Utils.parse_into_expression(start)
60
+ end_rbexpr = Utils.parse_into_expression(stop)
61
+
62
+ result = Utils.wrap_expr(Plr.time_range(start_rbexpr, end_rbexpr, interval, closed))
63
+
64
+ if eager
65
+ return Polars.select(result).to_series
66
+ end
67
+
68
+ result
69
+ end
70
+
71
+ # Create a column of time ranges.
72
+ #
73
+ # @param start [Object]
74
+ # Lower bound of the time range.
75
+ # @param stop [Object]
76
+ # Upper bound of the time range.
77
+ # @param interval [String]
78
+ # Interval of the range periods, specified using the Polars duration string language.
79
+ # @param closed ['both', 'left', 'right', 'none']
80
+ # Define which sides of the range are closed (inclusive).
81
+ # @param eager [Boolean]
82
+ # Evaluate immediately and return a `Series`.
83
+ # If set to `false` (default), return an expression instead.
84
+ #
85
+ # @return [Object]
86
+ #
87
+ # @example
88
+ # df = Polars::DataFrame.new(
89
+ # {
90
+ # "start" => [time(9, 0), time(10, 0)],
91
+ # "end" => time(11, 0)
92
+ # }
93
+ # )
94
+ # df.with_columns(time_range: Polars.time_ranges("start", "end"))
95
+ # # =>
96
+ # # shape: (2, 3)
97
+ # # ┌──────────┬──────────┬────────────────────────────────┐
98
+ # # │ start ┆ end ┆ time_range │
99
+ # # │ --- ┆ --- ┆ --- │
100
+ # # │ time ┆ time ┆ list[time] │
101
+ # # ╞══════════╪══════════╪════════════════════════════════╡
102
+ # # │ 09:00:00 ┆ 11:00:00 ┆ [09:00:00, 10:00:00, 11:00:00] │
103
+ # # │ 10:00:00 ┆ 11:00:00 ┆ [10:00:00, 11:00:00] │
104
+ # # └──────────┴──────────┴────────────────────────────────┘
105
+ def time_ranges(
106
+ start = nil,
107
+ stop = nil,
108
+ interval = "1h",
109
+ closed: "both",
110
+ eager: false
111
+ )
112
+ interval = Utils.parse_interval_argument(interval)
113
+ ["y", "mo", "w", "d"].each do |unit|
114
+ if interval.include?(unit)
115
+ msg = "invalid interval unit for time_range: found #{unit.inspect}"
116
+ raise ArgumentError, msg
117
+ end
118
+ end
119
+
120
+ if start.nil?
121
+ # start = time(0, 0, 0)
122
+ raise Todo
123
+ end
124
+ if stop.nil?
125
+ # stop = time(23, 59, 59, 999999)
126
+ raise Todo
127
+ end
128
+
129
+ start_rbexpr = Utils.parse_into_expression(start)
130
+ end_rbexpr = Utils.parse_into_expression(stop)
131
+
132
+ result = Utils.wrap_expr(Plr.time_ranges(start_rbexpr, end_rbexpr, interval, closed))
133
+
134
+ if eager
135
+ return Polars.select(result).to_series
136
+ end
137
+
138
+ result
139
+ end
140
+ end
141
+ end
@@ -43,7 +43,7 @@ module Polars
43
43
  n = lit(n)
44
44
  end
45
45
 
46
- value = Utils.parse_as_expression(value, str_as_lit: true)
46
+ value = Utils.parse_into_expression(value, str_as_lit: true)
47
47
  expr = Utils.wrap_expr(Plr.repeat(value, n._rbexpr, dtype))
48
48
  if !name.nil?
49
49
  expr = expr.alias(name)
@@ -89,7 +89,7 @@ module Polars
89
89
  # # │ 4 ┆ 0 ┆ 99 │
90
90
  # # └─────┴─────┴─────┘
91
91
  def when(*predicates, **constraints)
92
- condition = Utils.parse_when_inputs(*predicates, **constraints)
92
+ condition = Utils.parse_predicates_constraints_into_expression(*predicates, **constraints)
93
93
  When.new(Plr.when(condition))
94
94
  end
95
95
  end
@@ -106,39 +106,104 @@ module Polars
106
106
  # _dataframe_class._from_rbdf(_df.group_by_apply(by, f))
107
107
  # end
108
108
 
109
- # Use multiple aggregations on columns.
109
+ # Compute aggregations for each group of a group by operation.
110
110
  #
111
- # This can be combined with complete lazy API and is considered idiomatic polars.
112
- #
113
- # @param aggs [Object]
114
- # Single / multiple aggregation expression(s).
111
+ # @param aggs [Array]
112
+ # Aggregations to compute for each group of the group by operation,
113
+ # specified as positional arguments.
114
+ # Accepts expression input. Strings are parsed as column names.
115
+ # @param named_aggs [Hash]
116
+ # Additional aggregations, specified as keyword arguments.
117
+ # The resulting columns will be renamed to the keyword used.
115
118
  #
116
119
  # @return [DataFrame]
117
120
  #
118
- # @example
121
+ # @example Compute the aggregation of the columns for each group.
119
122
  # df = Polars::DataFrame.new(
120
- # {"foo" => ["one", "two", "two", "one", "two"], "bar" => [5, 3, 2, 4, 1]}
123
+ # {
124
+ # "a" => ["a", "b", "a", "b", "c"],
125
+ # "b" => [1, 2, 1, 3, 3],
126
+ # "c" => [5, 4, 3, 2, 1]
127
+ # }
121
128
  # )
122
- # df.group_by("foo", maintain_order: true).agg(
123
- # [
124
- # Polars.sum("bar").suffix("_sum"),
125
- # Polars.col("bar").sort.tail(2).sum.suffix("_tail_sum")
126
- # ]
129
+ # df.group_by("a").agg(Polars.col("b"), Polars.col("c"))
130
+ # # =>
131
+ # # shape: (3, 3)
132
+ # # ┌─────┬───────────┬───────────┐
133
+ # # │ a ┆ b ┆ c │
134
+ # # │ --- ┆ --- ┆ --- │
135
+ # # │ str ┆ list[i64] ┆ list[i64] │
136
+ # # ╞═════╪═══════════╪═══════════╡
137
+ # # │ a ┆ [1, 1] ┆ [5, 3] │
138
+ # # │ b ┆ [2, 3] ┆ [4, 2] │
139
+ # # │ c ┆ [3] ┆ [1] │
140
+ # # └─────┴───────────┴───────────┘
141
+ #
142
+ # @example Compute the sum of a column for each group.
143
+ # df.group_by("a").agg(Polars.col("b").sum)
144
+ # # =>
145
+ # # shape: (3, 2)
146
+ # # ┌─────┬─────┐
147
+ # # │ a ┆ b │
148
+ # # │ --- ┆ --- │
149
+ # # │ str ┆ i64 │
150
+ # # ╞═════╪═════╡
151
+ # # │ a ┆ 2 │
152
+ # # │ b ┆ 5 │
153
+ # # │ c ┆ 3 │
154
+ # # └─────┴─────┘
155
+ #
156
+ # @example Compute multiple aggregates at once by passing a list of expressions.
157
+ # df.group_by("a").agg([Polars.sum("b"), Polars.mean("c")])
158
+ # # =>
159
+ # # shape: (3, 3)
160
+ # # ┌─────┬─────┬─────┐
161
+ # # │ a ┆ b ┆ c │
162
+ # # │ --- ┆ --- ┆ --- │
163
+ # # │ str ┆ i64 ┆ f64 │
164
+ # # ╞═════╪═════╪═════╡
165
+ # # │ c ┆ 3 ┆ 1.0 │
166
+ # # │ a ┆ 2 ┆ 4.0 │
167
+ # # │ b ┆ 5 ┆ 3.0 │
168
+ # # └─────┴─────┴─────┘
169
+ #
170
+ # @example Or use positional arguments to compute multiple aggregations in the same way.
171
+ # df.group_by("a").agg(
172
+ # Polars.sum("b").name.suffix("_sum"),
173
+ # (Polars.col("c") ** 2).mean.name.suffix("_mean_squared")
127
174
  # )
128
175
  # # =>
129
- # # shape: (2, 3)
130
- # # ┌─────┬─────────┬──────────────┐
131
- # # │ foo ┆ bar_sum ┆ bar_tail_sum │
132
- # # │ --- ┆ --- ┆ --- │
133
- # # │ str ┆ i64 ┆ i64 │
134
- # # ╞═════╪═════════╪══════════════╡
135
- # # │ one ┆ 9 ┆ 9 │
136
- # # │ two ┆ 6 ┆ 5 │
137
- # # └─────┴─────────┴──────────────┘
138
- def agg(aggs)
176
+ # # shape: (3, 3)
177
+ # # ┌─────┬───────┬────────────────┐
178
+ # # │ a ┆ b_sum ┆ c_mean_squared │
179
+ # # │ --- ┆ --- ┆ --- │
180
+ # # │ str ┆ i64 ┆ f64 │
181
+ # # ╞═════╪═══════╪════════════════╡
182
+ # # │ a ┆ 2 ┆ 17.0 │
183
+ # # │ c ┆ 3 ┆ 1.0 │
184
+ # # │ b ┆ 5 ┆ 10.0 │
185
+ # # └─────┴───────┴────────────────┘
186
+ #
187
+ # @example Use keyword arguments to easily name your expression inputs.
188
+ # df.group_by("a").agg(
189
+ # b_sum: Polars.sum("b"),
190
+ # c_mean_squared: (Polars.col("c") ** 2).mean
191
+ # )
192
+ # # =>
193
+ # # shape: (3, 3)
194
+ # # ┌─────┬───────┬────────────────┐
195
+ # # │ a ┆ b_sum ┆ c_mean_squared │
196
+ # # │ --- ┆ --- ┆ --- │
197
+ # # │ str ┆ i64 ┆ f64 │
198
+ # # ╞═════╪═══════╪════════════════╡
199
+ # # │ a ┆ 2 ┆ 17.0 │
200
+ # # │ c ┆ 3 ┆ 1.0 │
201
+ # # │ b ┆ 5 ┆ 10.0 │
202
+ # # └─────┴───────┴────────────────┘
203
+ def agg(*aggs, **named_aggs)
139
204
  @df.lazy
140
205
  .group_by(@by, maintain_order: @maintain_order)
141
- .agg(aggs)
206
+ .agg(*aggs, **named_aggs)
142
207
  .collect(no_optimization: true)
143
208
  end
144
209
 
@@ -0,0 +1,24 @@
1
+ module Polars
2
+ module IO
3
+ # Read into a DataFrame from Apache Avro format.
4
+ #
5
+ # @param source [Object]
6
+ # Path to a file or a file-like object.
7
+ # @param columns [Object]
8
+ # Columns to select. Accepts a list of column indices (starting at zero) or a list
9
+ # of column names.
10
+ # @param n_rows [Integer]
11
+ # Stop reading from Apache Avro file after reading `n_rows`.
12
+ #
13
+ # @return [DataFrame]
14
+ def read_avro(source, columns: nil, n_rows: nil)
15
+ if Utils.pathlike?(source)
16
+ source = Utils.normalize_filepath(source)
17
+ end
18
+ projection, column_names = Utils.handle_projection_columns(columns)
19
+
20
+ rbdf = RbDataFrame.read_avro(source, column_names, projection, n_rows)
21
+ Utils.wrap_df(rbdf)
22
+ end
23
+ end
24
+ end