polars-df 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +9 -0
  4. data/Cargo.lock +74 -3
  5. data/Cargo.toml +3 -0
  6. data/README.md +1 -1
  7. data/ext/polars/Cargo.toml +18 -1
  8. data/ext/polars/src/conversion.rs +115 -2
  9. data/ext/polars/src/dataframe.rs +228 -11
  10. data/ext/polars/src/error.rs +4 -0
  11. data/ext/polars/src/lazy/dataframe.rs +5 -5
  12. data/ext/polars/src/lazy/dsl.rs +157 -2
  13. data/ext/polars/src/lib.rs +185 -10
  14. data/ext/polars/src/list_construction.rs +100 -0
  15. data/ext/polars/src/series.rs +217 -29
  16. data/ext/polars/src/set.rs +91 -0
  17. data/ext/polars/src/utils.rs +19 -0
  18. data/lib/polars/batched_csv_reader.rb +1 -0
  19. data/lib/polars/cat_expr.rb +39 -0
  20. data/lib/polars/cat_name_space.rb +54 -0
  21. data/lib/polars/data_frame.rb +2384 -140
  22. data/lib/polars/date_time_expr.rb +1282 -7
  23. data/lib/polars/date_time_name_space.rb +1484 -0
  24. data/lib/polars/exceptions.rb +20 -0
  25. data/lib/polars/expr.rb +4374 -53
  26. data/lib/polars/expr_dispatch.rb +22 -0
  27. data/lib/polars/functions.rb +219 -0
  28. data/lib/polars/group_by.rb +518 -0
  29. data/lib/polars/io.rb +421 -2
  30. data/lib/polars/lazy_frame.rb +1267 -69
  31. data/lib/polars/lazy_functions.rb +412 -24
  32. data/lib/polars/lazy_group_by.rb +80 -0
  33. data/lib/polars/list_expr.rb +507 -5
  34. data/lib/polars/list_name_space.rb +346 -0
  35. data/lib/polars/meta_expr.rb +21 -0
  36. data/lib/polars/series.rb +2256 -242
  37. data/lib/polars/slice.rb +104 -0
  38. data/lib/polars/string_expr.rb +847 -10
  39. data/lib/polars/string_name_space.rb +690 -0
  40. data/lib/polars/struct_expr.rb +73 -0
  41. data/lib/polars/struct_name_space.rb +64 -0
  42. data/lib/polars/utils.rb +71 -3
  43. data/lib/polars/version.rb +2 -1
  44. data/lib/polars/when.rb +1 -0
  45. data/lib/polars/when_then.rb +1 -0
  46. data/lib/polars.rb +12 -10
  47. metadata +15 -2
@@ -0,0 +1,22 @@
1
module Polars
  # @private
  #
  # Mixin that answers otherwise-unhandled calls by evaluating the
  # same-named expression method against the wrapped series.
  module ExprDispatch
    # Inclusion hook: gives the including class an `_s` instance accessor and
    # a class-level `_accessor` accessor (read below as an optional namespace).
    def self.included(base)
      base.attr_accessor :_s
      base.singleton_class.attr_accessor :_accessor
    end

    private

    # Only names the including class itself declares are dispatched; every
    # other name falls through to the default `method_missing`.
    # NOTE(review): presumably the including classes declare stub methods that
    # call `super` to land here - confirm against the callers.
    def method_missing(method, ...)
      owner = self.class
      return super unless owner.method_defined?(method)

      accessor = owner._accessor

      series = Utils.wrap_s(_s)
      column = Utils.col(series.name)
      # Step into the namespace first when the class configured one.
      target = accessor ? column.send(accessor) : column
      # Evaluate the expression on a frame built from the series and convert
      # the selection back into a series.
      series.to_frame.select(target.send(method, ...)).to_series
    end
  end
end
@@ -1,5 +1,51 @@
1
1
  module Polars
2
2
  module Functions
3
+ # Convert categorical variables into dummy/indicator variables.
4
+ #
5
+ # @param df [DataFrame]
6
+ # DataFrame to convert.
7
+ # @param columns [Array, nil]
8
+ # A subset of columns to convert to dummy variables. `nil` means
9
+ # "all columns".
10
+ #
11
+ # @return [DataFrame]
12
def get_dummies(df, columns: nil)
  # Pure delegation: the frame performs the conversion itself; the column
  # subset is forwarded untouched (nil included).
  options = {columns: columns}
  df.to_dummies(**options)
end
15
+
16
+ # Aggregate multiple DataFrames/Series into a single DataFrame/Series.
17
+ #
18
+ # @param items [Object]
19
+ # DataFrames/Series/LazyFrames to concatenate.
20
+ # @param rechunk [Boolean]
21
+ # Make sure that all data is in contiguous memory.
22
+ # @param how ["vertical", "diagonal", "horizontal"]
23
+ # Lazy only supports the 'vertical' strategy.
24
+ #
25
+ # - Vertical: applies multiple `vstack` operations.
26
+ # - Diagonal: finds a union between the column schemas and fills missing column values with null.
27
+ # - Horizontal: stacks Series horizontally and fills with nulls if the lengths don't match.
28
+ # @param parallel [Boolean]
29
+ # Only relevant for LazyFrames. This determines if the concatenated
30
+ # lazy computations may be executed in parallel.
31
+ #
32
+ # @return [Object]
33
+ #
34
+ # @example
35
+ # df1 = Polars::DataFrame.new({"a" => [1], "b" => [3]})
36
+ # df2 = Polars::DataFrame.new({"a" => [2], "b" => [4]})
37
+ # Polars.concat([df1, df2])
38
+ # # =>
39
+ # # shape: (2, 2)
40
+ # # ┌─────┬─────┐
41
+ # # │ a ┆ b │
42
+ # # │ --- ┆ --- │
43
+ # # │ i64 ┆ i64 │
44
+ # # ╞═════╪═════╡
45
+ # # │ 1 ┆ 3 │
46
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
47
+ # # │ 2 ┆ 4 │
48
+ # # └─────┴─────┘
3
49
  def concat(items, rechunk: true, how: "vertical", parallel: true)
4
50
  if items.empty?
5
51
  raise ArgumentError, "cannot concat empty list"
@@ -41,5 +87,178 @@ module Polars
41
87
  out
42
88
  end
43
89
  end
90
+
91
+ # Create a range of type `Datetime` (or `Date`).
92
+ #
93
+ # @param low [Object]
94
+ # Lower bound of the date range.
95
+ # @param high [Object]
96
+ # Upper bound of the date range.
97
+ # @param interval [Object]
98
+ # Interval periods. It can be a polars duration string, such as `3d12h4m25s`
99
+ # representing 3 days, 12 hours, 4 minutes, and 25 seconds.
100
+ # @param lazy [Boolean]
101
+ # Return an expression.
102
+ # @param closed ["both", "left", "right", "none"]
103
+ # Define whether the temporal window interval is closed or not.
104
+ # @param name [String]
105
+ # Name of the output Series.
106
+ # @param time_unit [nil, "ns", "us", "ms"]
107
+ # Set the time unit.
108
+ # @param time_zone [String]
109
+ # Optional timezone
110
+ #
111
+ # @return [Object]
112
+ #
113
+ # @note
114
+ # If both `low` and `high` are passed as date types (not datetime), and the
115
+ # interval granularity is no finer than 1d, the returned range is also of
116
+ # type date. All other permutations return a datetime Series.
117
+ #
118
+ # @example Using polars duration string to specify the interval
119
+ # Polars.date_range(Date.new(2022, 1, 1), Date.new(2022, 3, 1), "1mo", name: "drange")
120
+ # # =>
121
+ # # shape: (3,)
122
+ # # Series: 'drange' [date]
123
+ # # [
124
+ # # 2022-01-01
125
+ # # 2022-02-01
126
+ # # 2022-03-01
127
+ # # ]
128
+ #
129
+ # @example Using a compound duration string to specify the interval
130
+ # Polars.date_range(
131
+ # DateTime.new(1985, 1, 1),
132
+ # DateTime.new(1985, 1, 10),
133
+ # "1d12h",
134
+ # time_unit: "ms"
135
+ # )
136
+ # # =>
137
+ # # shape: (7,)
138
+ # # Series: '' [datetime[ms]]
139
+ # # [
140
+ # # 1985-01-01 00:00:00
141
+ # # 1985-01-02 12:00:00
142
+ # # 1985-01-04 00:00:00
143
+ # # 1985-01-05 12:00:00
144
+ # # 1985-01-07 00:00:00
145
+ # # 1985-01-08 12:00:00
146
+ # # 1985-01-10 00:00:00
147
+ # # ]
148
def date_range(
  low,
  high,
  interval,
  lazy: false,
  closed: "both",
  name: nil,
  time_unit: nil,
  time_zone: nil
)
  # ActiveSupport durations are detected but not handled yet.
  raise Todo if defined?(ActiveSupport::Duration) && interval.is_a?(ActiveSupport::Duration)

  # Work with a compact duration string: "1d 12h" becomes "1d12h".
  interval = interval.to_s.delete(" ")

  # Expression path: taken on request or as soon as either bound is an Expr.
  if lazy || low.is_a?(Expr) || high.is_a?(Expr)
    low_expr = Utils.expr_to_lit_or_expr(low, str_to_lit: true)
    high_expr = Utils.expr_to_lit_or_expr(high, str_to_lit: true)
    lazy_range = _rb_date_range_lazy(low_expr, high_expr, interval, closed, name, time_zone)
    return Utils.wrap_expr(lazy_range)
  end

  low, low_is_date = _ensure_datetime(low)
  high, high_is_date = _ensure_datetime(high)

  # An explicit time unit wins; otherwise the interval picks "ns" when it
  # mentions nanoseconds and "us" in every other case.
  tu = time_unit
  tu = interval.include?("ns") ? "ns" : "us" if tu.nil?

  start = Utils._datetime_to_pl_timestamp(low, tu)
  stop = Utils._datetime_to_pl_timestamp(high, tu)
  name = "" if name.nil?

  dt_range = Utils.wrap_s(
    _rb_date_range(start, stop, interval, closed, name, tu, time_zone)
  )

  # The result is cast to Date only when both bounds were plain dates and the
  # interval does not end in an "h", "m" or "s" unit.
  return dt_range unless low_is_date && high_is_date
  return dt_range if %w[h m s].any? { |suffix| _interval_granularity(interval).end_with?(suffix) }

  dt_range.cast(Date)
end
201
+
202
+ # def cut
203
+ # end
204
+
205
+ # def align_frames
206
+ # end
207
+
208
+ # Return a new Series of given length and type, filled with ones.
209
+ #
210
+ # @param n [Integer]
211
+ # Number of elements in the `Series`
212
+ # @param dtype [Symbol]
213
+ # DataType of the elements, defaults to `:f64`
214
+ #
215
+ # @return [Series]
216
+ #
217
+ # @note
218
+ # In the lazy API you should probably not use this, but use `lit(1)`
219
+ # instead.
220
def ones(n, dtype: nil)
  # Start from a single-element float series, cast it only when a dtype was
  # requested, then expand element 0 to the requested length.
  seed = Series.new([1.0])
  seed = seed.cast(dtype) if dtype
  seed.new_from_index(0, n)
end
227
+
228
+ # Return a new Series of given length and type, filled with zeros.
229
+ #
230
+ # @param n [Integer]
231
+ # Number of elements in the `Series`
232
+ # @param dtype [Symbol]
233
+ # DataType of the elements, defaults to `:f64`
234
+ #
235
+ # @return [Series]
236
+ #
237
+ # @note
238
+ # In the lazy API you should probably not use this, but use `lit(0)`
239
+ # instead.
240
def zeros(n, dtype: nil)
  # Single-element float template, cast only when a dtype was requested.
  template = Series.new([0.0])
  template = template.cast(dtype) if dtype
  # Expand element 0 of the template to n entries.
  template.new_from_index(0, n)
end
247
+
248
+ private
249
+
250
# Normalise a range bound to a DateTime and report whether it was a plain date.
#
# Previously every non-DateTime value was rebuilt from year/month/day only, so
# a `Time` lost its time of day and was flagged as a date.
#
# @param value [Date, DateTime, Time, Object]
#   The bound to normalise.
#
# @return [Array(DateTime, Boolean)]
#   The DateTime, and `true` only when the input was treated as a plain date.
def _ensure_datetime(value)
  case value
  when DateTime
    # DateTime is a subclass of Date, so it must be matched first.
    [value, false]
  when Date
    [DateTime.new(value.year, value.month, value.day), true]
  when Time
    # Keep the time of day instead of truncating to midnight.
    [value.to_datetime, false]
  else
    # Legacy fallback for other objects exposing year/month/day: same
    # treatment they received before this change.
    [DateTime.new(value.year, value.month, value.day), true]
  end
end
258
+
259
# Extract the unit of the last component of a duration string, e.g.
# "1d12h" => "h", "1mo" => "mo", "10ns" => "ns".
#
# Takes the final two characters and strips leading digits from them.
#
# @param interval [String]
#   Compact duration string (no spaces).
#
# @return [String]
#   The trailing unit, or an empty string for empty input.
def _interval_granularity(interval)
  tail = interval.to_s.chars.last(2).join
  tail.sub(/\A\d+/, "")
end
44
263
  end
45
264
  end