polars-df 0.8.0-arm64-darwin → 0.10.0-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +42 -1
- data/Cargo.lock +159 -66
- data/Cargo.toml +0 -3
- data/LICENSE-THIRD-PARTY.txt +3112 -1613
- data/LICENSE.txt +1 -1
- data/README.md +3 -2
- data/lib/polars/3.1/polars.bundle +0 -0
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/3.3/polars.bundle +0 -0
- data/lib/polars/array_expr.rb +453 -0
- data/lib/polars/array_name_space.rb +346 -0
- data/lib/polars/batched_csv_reader.rb +4 -2
- data/lib/polars/cat_expr.rb +24 -0
- data/lib/polars/cat_name_space.rb +75 -0
- data/lib/polars/config.rb +2 -2
- data/lib/polars/data_frame.rb +306 -96
- data/lib/polars/data_types.rb +191 -28
- data/lib/polars/date_time_expr.rb +41 -18
- data/lib/polars/date_time_name_space.rb +9 -3
- data/lib/polars/exceptions.rb +12 -1
- data/lib/polars/expr.rb +898 -215
- data/lib/polars/functions/aggregation/horizontal.rb +246 -0
- data/lib/polars/functions/aggregation/vertical.rb +282 -0
- data/lib/polars/functions/as_datatype.rb +248 -0
- data/lib/polars/functions/col.rb +47 -0
- data/lib/polars/functions/eager.rb +182 -0
- data/lib/polars/functions/lazy.rb +1280 -0
- data/lib/polars/functions/len.rb +49 -0
- data/lib/polars/functions/lit.rb +35 -0
- data/lib/polars/functions/random.rb +16 -0
- data/lib/polars/functions/range/date_range.rb +103 -0
- data/lib/polars/functions/range/int_range.rb +51 -0
- data/lib/polars/functions/repeat.rb +144 -0
- data/lib/polars/functions/whenthen.rb +96 -0
- data/lib/polars/functions.rb +29 -416
- data/lib/polars/group_by.rb +2 -2
- data/lib/polars/io.rb +36 -31
- data/lib/polars/lazy_frame.rb +405 -88
- data/lib/polars/list_expr.rb +158 -8
- data/lib/polars/list_name_space.rb +102 -0
- data/lib/polars/meta_expr.rb +175 -7
- data/lib/polars/series.rb +282 -41
- data/lib/polars/string_cache.rb +75 -0
- data/lib/polars/string_expr.rb +413 -96
- data/lib/polars/string_name_space.rb +4 -4
- data/lib/polars/testing.rb +507 -0
- data/lib/polars/utils.rb +106 -8
- data/lib/polars/version.rb +1 -1
- data/lib/polars/whenthen.rb +83 -0
- data/lib/polars.rb +16 -4
- metadata +34 -6
- data/lib/polars/lazy_functions.rb +0 -1181
- data/lib/polars/when.rb +0 -16
- data/lib/polars/when_then.rb +0 -19
data/lib/polars/functions.rb
CHANGED
@@ -13,432 +13,45 @@ module Polars
|
|
13
13
|
df.to_dummies(columns: columns)
|
14
14
|
end
|
15
15
|
|
16
|
-
# Aggregate
|
16
|
+
# Aggregate to list.
|
17
17
|
#
|
18
|
-
# @
|
19
|
-
|
20
|
-
|
21
|
-
# Make sure that all data is in contiguous memory.
|
22
|
-
# @param how ["vertical", "vertical_relaxed", "diagonal", "horizontal"]
|
23
|
-
# LazyFrames do not support the `horizontal` strategy.
|
24
|
-
#
|
25
|
-
# - Vertical: applies multiple `vstack` operations.
|
26
|
-
# - Diagonal: finds a union between the column schemas and fills missing column values with null.
|
27
|
-
# - Horizontal: stacks Series horizontally and fills with nulls if the lengths don't match.
|
28
|
-
# @param parallel [Boolean]
|
29
|
-
# Only relevant for LazyFrames. This determines if the concatenated
|
30
|
-
# lazy computations may be executed in parallel.
|
31
|
-
#
|
32
|
-
# @return [Object]
|
33
|
-
#
|
34
|
-
# @example
|
35
|
-
# df1 = Polars::DataFrame.new({"a" => [1], "b" => [3]})
|
36
|
-
# df2 = Polars::DataFrame.new({"a" => [2], "b" => [4]})
|
37
|
-
# Polars.concat([df1, df2])
|
38
|
-
# # =>
|
39
|
-
# # shape: (2, 2)
|
40
|
-
# # ┌─────┬─────┐
|
41
|
-
# # │ a ┆ b │
|
42
|
-
# # │ --- ┆ --- │
|
43
|
-
# # │ i64 ┆ i64 │
|
44
|
-
# # ╞═════╪═════╡
|
45
|
-
# # │ 1 ┆ 3 │
|
46
|
-
# # │ 2 ┆ 4 │
|
47
|
-
# # └─────┴─────┘
|
48
|
-
def concat(items, rechunk: true, how: "vertical", parallel: true)
|
49
|
-
if items.empty?
|
50
|
-
raise ArgumentError, "cannot concat empty list"
|
51
|
-
end
|
52
|
-
|
53
|
-
first = items[0]
|
54
|
-
if first.is_a?(DataFrame)
|
55
|
-
if how == "vertical"
|
56
|
-
out = Utils.wrap_df(_concat_df(items))
|
57
|
-
elsif how == "diagonal"
|
58
|
-
out = Utils.wrap_df(_concat_df_diagonal(items))
|
59
|
-
elsif how == "horizontal"
|
60
|
-
out = Utils.wrap_df(_concat_df_horizontal(items))
|
61
|
-
else
|
62
|
-
raise ArgumentError, "how must be one of {{'vertical', 'diagonal', 'horizontal'}}, got #{how}"
|
63
|
-
end
|
64
|
-
elsif first.is_a?(LazyFrame)
|
65
|
-
if how == "vertical"
|
66
|
-
return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, false))
|
67
|
-
elsif how == "vertical_relaxed"
|
68
|
-
return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, true))
|
69
|
-
elsif how == "diagonal"
|
70
|
-
return Utils.wrap_ldf(_concat_lf_diagonal(items, rechunk, parallel, false))
|
71
|
-
else
|
72
|
-
raise ArgumentError, "Lazy only allows 'vertical', 'vertical_relaxed', and 'diagonal' concat strategy."
|
73
|
-
end
|
74
|
-
elsif first.is_a?(Series)
|
75
|
-
# TODO
|
76
|
-
out = Utils.wrap_s(_concat_series(items))
|
77
|
-
elsif first.is_a?(Expr)
|
78
|
-
out = first
|
79
|
-
items[1..-1].each do |e|
|
80
|
-
out = out.append(e)
|
81
|
-
end
|
82
|
-
else
|
83
|
-
raise ArgumentError, "did not expect type: #{first.class.name} in 'Polars.concat'."
|
84
|
-
end
|
85
|
-
|
86
|
-
if rechunk
|
87
|
-
out.rechunk
|
88
|
-
else
|
89
|
-
out
|
90
|
-
end
|
91
|
-
end
|
92
|
-
|
93
|
-
# Create a range of type `Datetime` (or `Date`).
|
94
|
-
#
|
95
|
-
# @param start [Object]
|
96
|
-
# Lower bound of the date range.
|
97
|
-
# @param stop [Object]
|
98
|
-
# Upper bound of the date range.
|
99
|
-
# @param interval [Object]
|
100
|
-
# Interval periods. It can be a polars duration string, such as `3d12h4m25s`
|
101
|
-
# representing 3 days, 12 hours, 4 minutes, and 25 seconds.
|
102
|
-
# @param lazy [Boolean]
|
103
|
-
# Return an expression.
|
104
|
-
# @param closed ["both", "left", "right", "none"]
|
105
|
-
# Define whether the temporal window interval is closed or not.
|
106
|
-
# @param name [String]
|
107
|
-
# Name of the output Series.
|
108
|
-
# @param time_unit [nil, "ns", "us", "ms"]
|
109
|
-
# Set the time unit.
|
110
|
-
# @param time_zone [String]
|
111
|
-
# Optional timezone
|
112
|
-
#
|
113
|
-
# @return [Object]
|
114
|
-
#
|
115
|
-
# @note
|
116
|
-
# If both `low` and `high` are passed as date types (not datetime), and the
|
117
|
-
# interval granularity is no finer than 1d, the returned range is also of
|
118
|
-
# type date. All other permutations return a datetime Series.
|
119
|
-
#
|
120
|
-
# @example Using polars duration string to specify the interval
|
121
|
-
# Polars.date_range(Date.new(2022, 1, 1), Date.new(2022, 3, 1), "1mo", name: "drange")
|
122
|
-
# # =>
|
123
|
-
# # shape: (3,)
|
124
|
-
# # Series: 'drange' [date]
|
125
|
-
# # [
|
126
|
-
# # 2022-01-01
|
127
|
-
# # 2022-02-01
|
128
|
-
# # 2022-03-01
|
129
|
-
# # ]
|
130
|
-
#
|
131
|
-
# @example Using `timedelta` object to specify the interval:
|
132
|
-
# Polars.date_range(
|
133
|
-
# DateTime.new(1985, 1, 1),
|
134
|
-
# DateTime.new(1985, 1, 10),
|
135
|
-
# "1d12h",
|
136
|
-
# time_unit: "ms"
|
137
|
-
# )
|
138
|
-
# # =>
|
139
|
-
# # shape: (7,)
|
140
|
-
# # Series: '' [datetime[ms]]
|
141
|
-
# # [
|
142
|
-
# # 1985-01-01 00:00:00
|
143
|
-
# # 1985-01-02 12:00:00
|
144
|
-
# # 1985-01-04 00:00:00
|
145
|
-
# # 1985-01-05 12:00:00
|
146
|
-
# # 1985-01-07 00:00:00
|
147
|
-
# # 1985-01-08 12:00:00
|
148
|
-
# # 1985-01-10 00:00:00
|
149
|
-
# # ]
|
150
|
-
def date_range(
|
151
|
-
start,
|
152
|
-
stop,
|
153
|
-
interval,
|
154
|
-
lazy: false,
|
155
|
-
closed: "both",
|
156
|
-
name: nil,
|
157
|
-
time_unit: nil,
|
158
|
-
time_zone: nil
|
159
|
-
)
|
160
|
-
if defined?(ActiveSupport::Duration) && interval.is_a?(ActiveSupport::Duration)
|
161
|
-
raise Todo
|
162
|
-
else
|
163
|
-
interval = interval.to_s
|
164
|
-
if interval.include?(" ")
|
165
|
-
interval = interval.gsub(" ", "")
|
166
|
-
end
|
167
|
-
end
|
168
|
-
|
169
|
-
if time_unit.nil?
|
170
|
-
if interval.include?("ns")
|
171
|
-
time_unit = "ns"
|
172
|
-
else
|
173
|
-
time_unit = "us"
|
174
|
-
end
|
175
|
-
end
|
176
|
-
|
177
|
-
start_rbexpr = Utils.parse_as_expression(start)
|
178
|
-
stop_rbexpr = Utils.parse_as_expression(stop)
|
179
|
-
|
180
|
-
result = Utils.wrap_expr(
|
181
|
-
_rb_date_range(start_rbexpr, stop_rbexpr, interval, closed, time_unit, time_zone)
|
182
|
-
)
|
183
|
-
|
184
|
-
result = result.alias(name.to_s)
|
185
|
-
|
186
|
-
if !lazy
|
187
|
-
return select(result).to_series
|
188
|
-
end
|
189
|
-
|
190
|
-
result
|
191
|
-
end
|
192
|
-
|
193
|
-
# Bin values into discrete values.
|
194
|
-
#
|
195
|
-
# @param s [Series]
|
196
|
-
# Series to bin.
|
197
|
-
# @param bins [Array]
|
198
|
-
# Bins to create.
|
199
|
-
# @param labels [Array]
|
200
|
-
# Labels to assign to the bins. If given the length of labels must be
|
201
|
-
# len(bins) + 1.
|
202
|
-
# @param break_point_label [String]
|
203
|
-
# Name given to the breakpoint column.
|
204
|
-
# @param category_label [String]
|
205
|
-
# Name given to the category column.
|
206
|
-
#
|
207
|
-
# @return [DataFrame]
|
208
|
-
#
|
209
|
-
# @note
|
210
|
-
# This functionality is experimental and may change without it being considered a
|
211
|
-
# breaking change.
|
212
|
-
#
|
213
|
-
# @example
|
214
|
-
# a = Polars::Series.new("a", 13.times.map { |i| (-30 + i * 5) / 10.0 })
|
215
|
-
# Polars.cut(a, [-1, 1])
|
216
|
-
# # =>
|
217
|
-
# # shape: (12, 3)
|
218
|
-
# # ┌──────┬─────────────┬──────────────┐
|
219
|
-
# # │ a ┆ break_point ┆ category │
|
220
|
-
# # │ --- ┆ --- ┆ --- │
|
221
|
-
# # │ f64 ┆ f64 ┆ cat │
|
222
|
-
# # ╞══════╪═════════════╪══════════════╡
|
223
|
-
# # │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │
|
224
|
-
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
225
|
-
# # │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │
|
226
|
-
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
227
|
-
# # │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │
|
228
|
-
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
229
|
-
# # │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │
|
230
|
-
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
231
|
-
# # │ ... ┆ ... ┆ ... │
|
232
|
-
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
233
|
-
# # │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │
|
234
|
-
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
235
|
-
# # │ 1.5 ┆ inf ┆ (1.0, inf] │
|
236
|
-
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
237
|
-
# # │ 2.0 ┆ inf ┆ (1.0, inf] │
|
238
|
-
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
239
|
-
# # │ 2.5 ┆ inf ┆ (1.0, inf] │
|
240
|
-
# # └──────┴─────────────┴──────────────┘
|
241
|
-
# def cut(
|
242
|
-
# s,
|
243
|
-
# bins,
|
244
|
-
# labels: nil,
|
245
|
-
# break_point_label: "break_point",
|
246
|
-
# category_label: "category"
|
247
|
-
# )
|
248
|
-
# var_nm = s.name
|
249
|
-
|
250
|
-
# cuts_df = DataFrame.new(
|
251
|
-
# [
|
252
|
-
# Series.new(
|
253
|
-
# break_point_label, bins, dtype: :f64
|
254
|
-
# ).extend_constant(Float::INFINITY, 1)
|
255
|
-
# ]
|
256
|
-
# )
|
257
|
-
|
258
|
-
# if labels
|
259
|
-
# if labels.length != bins.length + 1
|
260
|
-
# raise ArgumentError, "expected more labels"
|
261
|
-
# end
|
262
|
-
# cuts_df = cuts_df.with_column(Series.new(category_label, labels))
|
263
|
-
# else
|
264
|
-
# cuts_df = cuts_df.with_column(
|
265
|
-
# Polars.format(
|
266
|
-
# "({}, {}]",
|
267
|
-
# Polars.col(break_point_label).shift_and_fill(1, -Float::INFINITY),
|
268
|
-
# Polars.col(break_point_label)
|
269
|
-
# ).alias(category_label)
|
270
|
-
# )
|
271
|
-
# end
|
272
|
-
|
273
|
-
# cuts_df = cuts_df.with_column(Polars.col(category_label).cast(:cat))
|
274
|
-
|
275
|
-
# s.cast(:f64)
|
276
|
-
# .sort
|
277
|
-
# .to_frame
|
278
|
-
# .join_asof(
|
279
|
-
# cuts_df,
|
280
|
-
# left_on: var_nm,
|
281
|
-
# right_on: break_point_label,
|
282
|
-
# strategy: "forward"
|
283
|
-
# )
|
284
|
-
# end
|
285
|
-
|
286
|
-
# Align a sequence of frames using the uique values from one or more columns as a key.
|
287
|
-
#
|
288
|
-
# Frames that do not contain the given key values have rows injected (with nulls
|
289
|
-
# filling the non-key columns), and each resulting frame is sorted by the key.
|
290
|
-
#
|
291
|
-
# The original column order of input frames is not changed unless ``select`` is
|
292
|
-
# specified (in which case the final column order is determined from that).
|
293
|
-
#
|
294
|
-
# Note that this does not result in a joined frame - you receive the same number
|
295
|
-
# of frames back that you passed in, but each is now aligned by key and has
|
296
|
-
# the same number of rows.
|
297
|
-
#
|
298
|
-
# @param frames [Array]
|
299
|
-
# Sequence of DataFrames or LazyFrames.
|
300
|
-
# @param on [Object]
|
301
|
-
# One or more columns whose unique values will be used to align the frames.
|
302
|
-
# @param select [Object]
|
303
|
-
# Optional post-alignment column select to constrain and/or order
|
304
|
-
# the columns returned from the newly aligned frames.
|
305
|
-
# @param reverse [Object]
|
306
|
-
# Sort the alignment column values in descending order; can be a single
|
307
|
-
# boolean or a list of booleans associated with each column in `on`.
|
308
|
-
#
|
309
|
-
# @return [Object]
|
310
|
-
#
|
311
|
-
# @example
|
312
|
-
# df1 = Polars::DataFrame.new(
|
313
|
-
# {
|
314
|
-
# "dt" => [Date.new(2022, 9, 1), Date.new(2022, 9, 2), Date.new(2022, 9, 3)],
|
315
|
-
# "x" => [3.5, 4.0, 1.0],
|
316
|
-
# "y" => [10.0, 2.5, 1.5]
|
317
|
-
# }
|
318
|
-
# )
|
319
|
-
# df2 = Polars::DataFrame.new(
|
320
|
-
# {
|
321
|
-
# "dt" => [Date.new(2022, 9, 2), Date.new(2022, 9, 3), Date.new(2022, 9, 1)],
|
322
|
-
# "x" => [8.0, 1.0, 3.5],
|
323
|
-
# "y" => [1.5, 12.0, 5.0]
|
324
|
-
# }
|
325
|
-
# )
|
326
|
-
# df3 = Polars::DataFrame.new(
|
327
|
-
# {
|
328
|
-
# "dt" => [Date.new(2022, 9, 3), Date.new(2022, 9, 2)],
|
329
|
-
# "x" => [2.0, 5.0],
|
330
|
-
# "y" => [2.5, 2.0]
|
331
|
-
# }
|
332
|
-
# )
|
333
|
-
# af1, af2, af3 = Polars.align_frames(
|
334
|
-
# df1, df2, df3, on: "dt", select: ["x", "y"]
|
335
|
-
# )
|
336
|
-
# (af1 * af2 * af3).fill_null(0).select(Polars.sum(Polars.col("*")).alias("dot"))
|
337
|
-
# # =>
|
338
|
-
# # shape: (3, 1)
|
339
|
-
# # ┌───────┐
|
340
|
-
# # │ dot │
|
341
|
-
# # │ --- │
|
342
|
-
# # │ f64 │
|
343
|
-
# # ╞═══════╡
|
344
|
-
# # │ 0.0 │
|
345
|
-
# # ├╌╌╌╌╌╌╌┤
|
346
|
-
# # │ 167.5 │
|
347
|
-
# # ├╌╌╌╌╌╌╌┤
|
348
|
-
# # │ 47.0 │
|
349
|
-
# # └───────┘
|
350
|
-
def align_frames(
|
351
|
-
*frames,
|
352
|
-
on:,
|
353
|
-
select: nil,
|
354
|
-
reverse: false
|
355
|
-
)
|
356
|
-
if frames.empty?
|
357
|
-
return []
|
358
|
-
elsif frames.map(&:class).uniq.length != 1
|
359
|
-
raise TypeError, "Input frames must be of a consistent type (all LazyFrame or all DataFrame)"
|
360
|
-
end
|
361
|
-
|
362
|
-
# establish the superset of all "on" column values, sort, and cache
|
363
|
-
eager = frames[0].is_a?(DataFrame)
|
364
|
-
alignment_frame = (
|
365
|
-
concat(frames.map { |df| df.lazy.select(on) })
|
366
|
-
.unique(maintain_order: false)
|
367
|
-
.sort(on, reverse: reverse)
|
368
|
-
)
|
369
|
-
alignment_frame = (
|
370
|
-
eager ? alignment_frame.collect.lazy : alignment_frame.cache
|
371
|
-
)
|
372
|
-
# finally, align all frames
|
373
|
-
aligned_frames =
|
374
|
-
frames.map do |df|
|
375
|
-
alignment_frame.join(
|
376
|
-
df.lazy,
|
377
|
-
on: alignment_frame.columns,
|
378
|
-
how: "left"
|
379
|
-
).select(df.columns)
|
380
|
-
end
|
381
|
-
if !select.nil?
|
382
|
-
aligned_frames = aligned_frames.map { |df| df.select(select) }
|
383
|
-
end
|
384
|
-
|
385
|
-
eager ? aligned_frames.map(&:collect) : aligned_frames
|
18
|
+
# @return [Expr]
|
19
|
+
def to_list(name)
|
20
|
+
col(name).list
|
386
21
|
end
|
387
22
|
|
388
|
-
#
|
23
|
+
# Compute the spearman rank correlation between two columns.
|
389
24
|
#
|
390
|
-
#
|
391
|
-
# Number of elements in the `Series`
|
392
|
-
# @param dtype [Symbol]
|
393
|
-
# DataType of the elements, defaults to `:f64`
|
25
|
+
# Missing data will be excluded from the computation.
|
394
26
|
#
|
395
|
-
# @
|
27
|
+
# @param a [Object]
|
28
|
+
# Column name or Expression.
|
29
|
+
# @param b [Object]
|
30
|
+
# Column name or Expression.
|
31
|
+
# @param ddof [Integer]
|
32
|
+
# Delta degrees of freedom
|
33
|
+
# @param propagate_nans [Boolean]
|
34
|
+
# If `True` any `NaN` encountered will lead to `NaN` in the output.
|
35
|
+
# Defaults to `False` where `NaN` are regarded as larger than any finite number
|
36
|
+
# and thus lead to the highest rank.
|
396
37
|
#
|
397
|
-
# @
|
398
|
-
|
399
|
-
|
400
|
-
def ones(n, dtype: nil)
|
401
|
-
s = Series.new([1.0])
|
402
|
-
if dtype
|
403
|
-
s = s.cast(dtype)
|
404
|
-
end
|
405
|
-
s.new_from_index(0, n)
|
38
|
+
# @return [Expr]
|
39
|
+
def spearman_rank_corr(a, b, ddof: 1, propagate_nans: false)
|
40
|
+
corr(a, b, method: "spearman", ddof: ddof, propagate_nans: propagate_nans)
|
406
41
|
end
|
407
42
|
|
408
|
-
#
|
409
|
-
#
|
410
|
-
# @param n [Integer]
|
411
|
-
# Number of elements in the `Series`
|
412
|
-
# @param dtype [Symbol]
|
413
|
-
# DataType of the elements, defaults to `:f64`
|
43
|
+
# Compute the pearson's correlation between two columns.
|
414
44
|
#
|
415
|
-
# @
|
45
|
+
# @param a [Object]
|
46
|
+
# Column name or Expression.
|
47
|
+
# @param b [Object]
|
48
|
+
# Column name or Expression.
|
49
|
+
# @param ddof [Integer]
|
50
|
+
# Delta degrees of freedom
|
416
51
|
#
|
417
|
-
# @
|
418
|
-
|
419
|
-
|
420
|
-
def zeros(n, dtype: nil)
|
421
|
-
s = Series.new([0.0])
|
422
|
-
if dtype
|
423
|
-
s = s.cast(dtype)
|
424
|
-
end
|
425
|
-
s.new_from_index(0, n)
|
426
|
-
end
|
427
|
-
|
428
|
-
private
|
429
|
-
|
430
|
-
def _ensure_datetime(value)
|
431
|
-
is_date_type = false
|
432
|
-
if !value.is_a?(::DateTime)
|
433
|
-
value = ::DateTime.new(value.year, value.month, value.day)
|
434
|
-
is_date_type = true
|
435
|
-
end
|
436
|
-
[value, is_date_type]
|
437
|
-
end
|
438
|
-
|
439
|
-
# TODO
|
440
|
-
def _interval_granularity(interval)
|
441
|
-
interval
|
52
|
+
# @return [Expr]
|
53
|
+
def pearson_corr(a, b, ddof: 1)
|
54
|
+
corr(a, b, method: "pearson", ddof: ddof)
|
442
55
|
end
|
443
56
|
end
|
444
57
|
end
|
data/lib/polars/group_by.rb
CHANGED
@@ -38,7 +38,7 @@ module Polars
|
|
38
38
|
temp_col = "__POLARS_GB_GROUP_INDICES"
|
39
39
|
groups_df =
|
40
40
|
@df.lazy
|
41
|
-
.
|
41
|
+
.with_row_index(name: temp_col)
|
42
42
|
.group_by(@by, maintain_order: @maintain_order)
|
43
43
|
.agg(Polars.col(temp_col))
|
44
44
|
.collect(no_optimization: true)
|
@@ -415,7 +415,7 @@ module Polars
|
|
415
415
|
# # │ Banana ┆ 2 │
|
416
416
|
# # └────────┴───────┘
|
417
417
|
def count
|
418
|
-
agg(Polars.count)
|
418
|
+
agg(Polars.len.alias("count"))
|
419
419
|
end
|
420
420
|
|
421
421
|
# Reduce the groups to the mean values.
|
data/lib/polars/io.rb
CHANGED
@@ -80,6 +80,8 @@ module Polars
|
|
80
80
|
# allocation needed.
|
81
81
|
# @param eol_char [String]
|
82
82
|
# Single byte end of line character.
|
83
|
+
# @param truncate_ragged_lines [Boolean]
|
84
|
+
# Truncate lines that are longer than the schema.
|
83
85
|
#
|
84
86
|
# @return [DataFrame]
|
85
87
|
#
|
@@ -113,12 +115,13 @@ module Polars
|
|
113
115
|
row_count_name: nil,
|
114
116
|
row_count_offset: 0,
|
115
117
|
sample_size: 1024,
|
116
|
-
eol_char: "\n"
|
118
|
+
eol_char: "\n",
|
119
|
+
truncate_ragged_lines: false
|
117
120
|
)
|
118
|
-
_check_arg_is_1byte("sep", sep, false)
|
119
|
-
_check_arg_is_1byte("comment_char", comment_char, false)
|
120
|
-
_check_arg_is_1byte("quote_char", quote_char, true)
|
121
|
-
_check_arg_is_1byte("eol_char", eol_char, false)
|
121
|
+
Utils._check_arg_is_1byte("sep", sep, false)
|
122
|
+
Utils._check_arg_is_1byte("comment_char", comment_char, false)
|
123
|
+
Utils._check_arg_is_1byte("quote_char", quote_char, true)
|
124
|
+
Utils._check_arg_is_1byte("eol_char", eol_char, false)
|
122
125
|
|
123
126
|
projection, columns = Utils.handle_projection_columns(columns)
|
124
127
|
|
@@ -161,7 +164,8 @@ module Polars
|
|
161
164
|
row_count_name: row_count_name,
|
162
165
|
row_count_offset: row_count_offset,
|
163
166
|
sample_size: sample_size,
|
164
|
-
eol_char: eol_char
|
167
|
+
eol_char: eol_char,
|
168
|
+
truncate_ragged_lines: truncate_ragged_lines
|
165
169
|
)
|
166
170
|
end
|
167
171
|
|
@@ -239,6 +243,8 @@ module Polars
|
|
239
243
|
# the column remains of data type `:str`.
|
240
244
|
# @param eol_char [String]
|
241
245
|
# Single byte end of line character.
|
246
|
+
# @param truncate_ragged_lines [Boolean]
|
247
|
+
# Truncate lines that are longer than the schema.
|
242
248
|
#
|
243
249
|
# @return [LazyFrame]
|
244
250
|
def scan_csv(
|
@@ -262,11 +268,12 @@ module Polars
|
|
262
268
|
row_count_name: nil,
|
263
269
|
row_count_offset: 0,
|
264
270
|
parse_dates: false,
|
265
|
-
eol_char: "\n"
|
271
|
+
eol_char: "\n",
|
272
|
+
truncate_ragged_lines: false
|
266
273
|
)
|
267
|
-
_check_arg_is_1byte("sep", sep, false)
|
268
|
-
_check_arg_is_1byte("comment_char", comment_char, false)
|
269
|
-
_check_arg_is_1byte("quote_char", quote_char, true)
|
274
|
+
Utils._check_arg_is_1byte("sep", sep, false)
|
275
|
+
Utils._check_arg_is_1byte("comment_char", comment_char, false)
|
276
|
+
Utils._check_arg_is_1byte("quote_char", quote_char, true)
|
270
277
|
|
271
278
|
if Utils.pathlike?(source)
|
272
279
|
source = Utils.normalise_filepath(source)
|
@@ -294,6 +301,7 @@ module Polars
|
|
294
301
|
row_count_offset: row_count_offset,
|
295
302
|
parse_dates: parse_dates,
|
296
303
|
eol_char: eol_char,
|
304
|
+
truncate_ragged_lines: truncate_ragged_lines
|
297
305
|
)
|
298
306
|
end
|
299
307
|
|
@@ -520,7 +528,7 @@ module Polars
|
|
520
528
|
|
521
529
|
# Read into a DataFrame from a parquet file.
|
522
530
|
#
|
523
|
-
# @param source [
|
531
|
+
# @param source [String, Pathname, StringIO]
|
524
532
|
# Path to a file or a file-like object.
|
525
533
|
# @param columns [Object]
|
526
534
|
# Columns to select. Accepts a list of column indices (starting at zero) or a list
|
@@ -604,9 +612,12 @@ module Polars
|
|
604
612
|
#
|
605
613
|
# @param query [Object]
|
606
614
|
# ActiveRecord::Relation or ActiveRecord::Result.
|
615
|
+
# @param schema_overrides [Hash]
|
616
|
+
# A hash mapping column names to dtypes, used to override the schema
|
617
|
+
# inferred from the query.
|
607
618
|
#
|
608
619
|
# @return [DataFrame]
|
609
|
-
def read_database(query)
|
620
|
+
def read_database(query, schema_overrides: nil)
|
610
621
|
if !defined?(ActiveRecord)
|
611
622
|
raise Error, "Active Record not available"
|
612
623
|
end
|
@@ -623,7 +634,7 @@ module Polars
|
|
623
634
|
end
|
624
635
|
|
625
636
|
data = {}
|
626
|
-
schema_overrides = {}
|
637
|
+
schema_overrides = (schema_overrides || {}).transform_keys(&:to_s)
|
627
638
|
|
628
639
|
result.columns.each_with_index do |k, i|
|
629
640
|
column_type = result.column_types[i]
|
@@ -655,9 +666,12 @@ module Polars
|
|
655
666
|
String
|
656
667
|
when :time
|
657
668
|
Time
|
669
|
+
# TODO fix issue with null
|
670
|
+
# when :json, :jsonb
|
671
|
+
# Struct
|
658
672
|
end
|
659
673
|
|
660
|
-
schema_overrides[k]
|
674
|
+
schema_overrides[k] ||= polars_type if polars_type
|
661
675
|
end
|
662
676
|
|
663
677
|
DataFrame.new(data, schema_overrides: schema_overrides)
|
@@ -749,6 +763,8 @@ module Polars
|
|
749
763
|
# allocation needed.
|
750
764
|
# @param eol_char [String]
|
751
765
|
# Single byte end of line character.
|
766
|
+
# @param truncate_ragged_lines [Boolean]
|
767
|
+
# Truncate lines that are longer than the schema.
|
752
768
|
#
|
753
769
|
# @return [BatchedCsvReader]
|
754
770
|
#
|
@@ -781,7 +797,8 @@ module Polars
|
|
781
797
|
row_count_name: nil,
|
782
798
|
row_count_offset: 0,
|
783
799
|
sample_size: 1024,
|
784
|
-
eol_char: "\n"
|
800
|
+
eol_char: "\n",
|
801
|
+
truncate_ragged_lines: false
|
785
802
|
)
|
786
803
|
projection, columns = Utils.handle_projection_columns(columns)
|
787
804
|
|
@@ -821,7 +838,8 @@ module Polars
|
|
821
838
|
row_count_offset: row_count_offset,
|
822
839
|
sample_size: sample_size,
|
823
840
|
eol_char: eol_char,
|
824
|
-
new_columns: new_columns
|
841
|
+
new_columns: new_columns,
|
842
|
+
truncate_ragged_lines: truncate_ragged_lines
|
825
843
|
)
|
826
844
|
end
|
827
845
|
|
@@ -836,7 +854,7 @@ module Polars
|
|
836
854
|
source = Utils.normalise_filepath(source)
|
837
855
|
end
|
838
856
|
|
839
|
-
|
857
|
+
Plr.ipc_schema(source)
|
840
858
|
end
|
841
859
|
|
842
860
|
# Get a schema of the Parquet file without reading data.
|
@@ -850,7 +868,7 @@ module Polars
|
|
850
868
|
source = Utils.normalise_filepath(source)
|
851
869
|
end
|
852
870
|
|
853
|
-
|
871
|
+
Plr.parquet_schema(source)
|
854
872
|
end
|
855
873
|
|
856
874
|
private
|
@@ -868,18 +886,5 @@ module Polars
|
|
868
886
|
|
869
887
|
yield file
|
870
888
|
end
|
871
|
-
|
872
|
-
def _check_arg_is_1byte(arg_name, arg, can_be_empty = false)
|
873
|
-
if arg.is_a?(::String)
|
874
|
-
arg_byte_length = arg.bytesize
|
875
|
-
if can_be_empty
|
876
|
-
if arg_byte_length > 1
|
877
|
-
raise ArgumentError, "#{arg_name} should be a single byte character or empty, but is #{arg_byte_length} bytes long."
|
878
|
-
end
|
879
|
-
elsif arg_byte_length != 1
|
880
|
-
raise ArgumentError, "#{arg_name} should be a single byte character, but is #{arg_byte_length} bytes long."
|
881
|
-
end
|
882
|
-
end
|
883
|
-
end
|
884
889
|
end
|
885
890
|
end
|