polars-df 0.7.0-x86_64-linux → 0.9.0-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +41 -0
- data/Cargo.lock +353 -237
- data/Cargo.toml +0 -3
- data/LICENSE-THIRD-PARTY.txt +1978 -1459
- data/LICENSE.txt +1 -1
- data/README.md +2 -2
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/{3.0 → 3.3}/polars.so +0 -0
- data/lib/polars/array_expr.rb +449 -0
- data/lib/polars/array_name_space.rb +346 -0
- data/lib/polars/cat_expr.rb +24 -0
- data/lib/polars/cat_name_space.rb +75 -0
- data/lib/polars/config.rb +2 -2
- data/lib/polars/data_frame.rb +248 -108
- data/lib/polars/data_types.rb +195 -29
- data/lib/polars/date_time_expr.rb +41 -24
- data/lib/polars/date_time_name_space.rb +12 -12
- data/lib/polars/exceptions.rb +12 -1
- data/lib/polars/expr.rb +1080 -195
- data/lib/polars/functions/aggregation/horizontal.rb +246 -0
- data/lib/polars/functions/aggregation/vertical.rb +282 -0
- data/lib/polars/functions/as_datatype.rb +248 -0
- data/lib/polars/functions/col.rb +47 -0
- data/lib/polars/functions/eager.rb +182 -0
- data/lib/polars/functions/lazy.rb +1280 -0
- data/lib/polars/functions/len.rb +49 -0
- data/lib/polars/functions/lit.rb +35 -0
- data/lib/polars/functions/random.rb +16 -0
- data/lib/polars/functions/range/date_range.rb +103 -0
- data/lib/polars/functions/range/int_range.rb +51 -0
- data/lib/polars/functions/repeat.rb +144 -0
- data/lib/polars/functions/whenthen.rb +27 -0
- data/lib/polars/functions.rb +29 -416
- data/lib/polars/group_by.rb +3 -3
- data/lib/polars/io.rb +21 -28
- data/lib/polars/lazy_frame.rb +390 -76
- data/lib/polars/list_expr.rb +152 -6
- data/lib/polars/list_name_space.rb +102 -0
- data/lib/polars/meta_expr.rb +175 -7
- data/lib/polars/series.rb +557 -59
- data/lib/polars/sql_context.rb +1 -1
- data/lib/polars/string_cache.rb +75 -0
- data/lib/polars/string_expr.rb +412 -96
- data/lib/polars/string_name_space.rb +4 -4
- data/lib/polars/struct_expr.rb +1 -1
- data/lib/polars/struct_name_space.rb +1 -1
- data/lib/polars/testing.rb +507 -0
- data/lib/polars/utils.rb +64 -20
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +15 -2
- metadata +36 -7
- data/lib/polars/lazy_functions.rb +0 -1197
data/lib/polars/functions.rb
CHANGED
@@ -13,432 +13,45 @@ module Polars
       df.to_dummies(columns: columns)
     end

-    # Aggregate
+    # Aggregate to list.
     #
-    # @
-
-
-    # Make sure that all data is in contiguous memory.
-    # @param how ["vertical", "vertical_relaxed", "diagonal", "horizontal"]
-    # LazyFrames do not support the `horizontal` strategy.
-    #
-    # - Vertical: applies multiple `vstack` operations.
-    # - Diagonal: finds a union between the column schemas and fills missing column values with null.
-    # - Horizontal: stacks Series horizontally and fills with nulls if the lengths don't match.
-    # @param parallel [Boolean]
-    # Only relevant for LazyFrames. This determines if the concatenated
-    # lazy computations may be executed in parallel.
-    #
-    # @return [Object]
-    #
-    # @example
-    # df1 = Polars::DataFrame.new({"a" => [1], "b" => [3]})
-    # df2 = Polars::DataFrame.new({"a" => [2], "b" => [4]})
-    # Polars.concat([df1, df2])
-    # # =>
-    # # shape: (2, 2)
-    # # ┌─────┬─────┐
-    # # │ a ┆ b │
-    # # │ --- ┆ --- │
-    # # │ i64 ┆ i64 │
-    # # ╞═════╪═════╡
-    # # │ 1 ┆ 3 │
-    # # │ 2 ┆ 4 │
-    # # └─────┴─────┘
-    def concat(items, rechunk: true, how: "vertical", parallel: true)
-      if items.empty?
-        raise ArgumentError, "cannot concat empty list"
-      end
-
-      first = items[0]
-      if first.is_a?(DataFrame)
-        if how == "vertical"
-          out = Utils.wrap_df(_concat_df(items))
-        elsif how == "diagonal"
-          out = Utils.wrap_df(_concat_df_diagonal(items))
-        elsif how == "horizontal"
-          out = Utils.wrap_df(_concat_df_horizontal(items))
-        else
-          raise ArgumentError, "how must be one of {{'vertical', 'diagonal', 'horizontal'}}, got #{how}"
-        end
-      elsif first.is_a?(LazyFrame)
-        if how == "vertical"
-          return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, false))
-        elsif how == "vertical_relaxed"
-          return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, true))
-        elsif how == "diagonal"
-          return Utils.wrap_ldf(_concat_lf_diagonal(items, rechunk, parallel, false))
-        else
-          raise ArgumentError, "Lazy only allows 'vertical', 'vertical_relaxed', and 'diagonal' concat strategy."
-        end
-      elsif first.is_a?(Series)
-        # TODO
-        out = Utils.wrap_s(_concat_series(items))
-      elsif first.is_a?(Expr)
-        out = first
-        items[1..-1].each do |e|
-          out = out.append(e)
-        end
-      else
-        raise ArgumentError, "did not expect type: #{first.class.name} in 'Polars.concat'."
-      end
-
-      if rechunk
-        out.rechunk
-      else
-        out
-      end
-    end
-
-    # Create a range of type `Datetime` (or `Date`).
-    #
-    # @param start [Object]
-    # Lower bound of the date range.
-    # @param stop [Object]
-    # Upper bound of the date range.
-    # @param interval [Object]
-    # Interval periods. It can be a polars duration string, such as `3d12h4m25s`
-    # representing 3 days, 12 hours, 4 minutes, and 25 seconds.
-    # @param lazy [Boolean]
-    # Return an expression.
-    # @param closed ["both", "left", "right", "none"]
-    # Define whether the temporal window interval is closed or not.
-    # @param name [String]
-    # Name of the output Series.
-    # @param time_unit [nil, "ns", "us", "ms"]
-    # Set the time unit.
-    # @param time_zone [String]
-    # Optional timezone
-    #
-    # @return [Object]
-    #
-    # @note
-    # If both `low` and `high` are passed as date types (not datetime), and the
-    # interval granularity is no finer than 1d, the returned range is also of
-    # type date. All other permutations return a datetime Series.
-    #
-    # @example Using polars duration string to specify the interval
-    # Polars.date_range(Date.new(2022, 1, 1), Date.new(2022, 3, 1), "1mo", name: "drange")
-    # # =>
-    # # shape: (3,)
-    # # Series: 'drange' [date]
-    # # [
-    # # 2022-01-01
-    # # 2022-02-01
-    # # 2022-03-01
-    # # ]
-    #
-    # @example Using `timedelta` object to specify the interval:
-    # Polars.date_range(
-    # DateTime.new(1985, 1, 1),
-    # DateTime.new(1985, 1, 10),
-    # "1d12h",
-    # time_unit: "ms"
-    # )
-    # # =>
-    # # shape: (7,)
-    # # Series: '' [datetime[ms]]
-    # # [
-    # # 1985-01-01 00:00:00
-    # # 1985-01-02 12:00:00
-    # # 1985-01-04 00:00:00
-    # # 1985-01-05 12:00:00
-    # # 1985-01-07 00:00:00
-    # # 1985-01-08 12:00:00
-    # # 1985-01-10 00:00:00
-    # # ]
-    def date_range(
-      start,
-      stop,
-      interval,
-      lazy: false,
-      closed: "both",
-      name: nil,
-      time_unit: nil,
-      time_zone: nil
-    )
-      if defined?(ActiveSupport::Duration) && interval.is_a?(ActiveSupport::Duration)
-        raise Todo
-      else
-        interval = interval.to_s
-        if interval.include?(" ")
-          interval = interval.gsub(" ", "")
-        end
-      end
-
-      if time_unit.nil?
-        if interval.include?("ns")
-          time_unit = "ns"
-        else
-          time_unit = "us"
-        end
-      end
-
-      start_rbexpr = Utils.parse_as_expression(start)
-      stop_rbexpr = Utils.parse_as_expression(stop)
-
-      result = Utils.wrap_expr(
-        _rb_date_range(start_rbexpr, stop_rbexpr, interval, closed, time_unit, time_zone)
-      )
-
-      result = result.alias(name.to_s)
-
-      if !lazy
-        return select(result).to_series
-      end
-
-      result
-    end
-
-    # Bin values into discrete values.
-    #
-    # @param s [Series]
-    # Series to bin.
-    # @param bins [Array]
-    # Bins to create.
-    # @param labels [Array]
-    # Labels to assign to the bins. If given the length of labels must be
-    # len(bins) + 1.
-    # @param break_point_label [String]
-    # Name given to the breakpoint column.
-    # @param category_label [String]
-    # Name given to the category column.
-    #
-    # @return [DataFrame]
-    #
-    # @note
-    # This functionality is experimental and may change without it being considered a
-    # breaking change.
-    #
-    # @example
-    # a = Polars::Series.new("a", 13.times.map { |i| (-30 + i * 5) / 10.0 })
-    # Polars.cut(a, [-1, 1])
-    # # =>
-    # # shape: (12, 3)
-    # # ┌──────┬─────────────┬──────────────┐
-    # # │ a ┆ break_point ┆ category │
-    # # │ --- ┆ --- ┆ --- │
-    # # │ f64 ┆ f64 ┆ cat │
-    # # ╞══════╪═════════════╪══════════════╡
-    # # │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │
-    # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
-    # # │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │
-    # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
-    # # │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │
-    # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
-    # # │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │
-    # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
-    # # │ ... ┆ ... ┆ ... │
-    # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
-    # # │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │
-    # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
-    # # │ 1.5 ┆ inf ┆ (1.0, inf] │
-    # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
-    # # │ 2.0 ┆ inf ┆ (1.0, inf] │
-    # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
-    # # │ 2.5 ┆ inf ┆ (1.0, inf] │
-    # # └──────┴─────────────┴──────────────┘
-    # def cut(
-    # s,
-    # bins,
-    # labels: nil,
-    # break_point_label: "break_point",
-    # category_label: "category"
-    # )
-    # var_nm = s.name
-
-    # cuts_df = DataFrame.new(
-    # [
-    # Series.new(
-    # break_point_label, bins, dtype: :f64
-    # ).extend_constant(Float::INFINITY, 1)
-    # ]
-    # )
-
-    # if labels
-    # if labels.length != bins.length + 1
-    # raise ArgumentError, "expected more labels"
-    # end
-    # cuts_df = cuts_df.with_column(Series.new(category_label, labels))
-    # else
-    # cuts_df = cuts_df.with_column(
-    # Polars.format(
-    # "({}, {}]",
-    # Polars.col(break_point_label).shift_and_fill(1, -Float::INFINITY),
-    # Polars.col(break_point_label)
-    # ).alias(category_label)
-    # )
-    # end
-
-    # cuts_df = cuts_df.with_column(Polars.col(category_label).cast(:cat))
-
-    # s.cast(:f64)
-    # .sort
-    # .to_frame
-    # .join_asof(
-    # cuts_df,
-    # left_on: var_nm,
-    # right_on: break_point_label,
-    # strategy: "forward"
-    # )
-    # end
-
-    # Align a sequence of frames using the uique values from one or more columns as a key.
-    #
-    # Frames that do not contain the given key values have rows injected (with nulls
-    # filling the non-key columns), and each resulting frame is sorted by the key.
-    #
-    # The original column order of input frames is not changed unless ``select`` is
-    # specified (in which case the final column order is determined from that).
-    #
-    # Note that this does not result in a joined frame - you receive the same number
-    # of frames back that you passed in, but each is now aligned by key and has
-    # the same number of rows.
-    #
-    # @param frames [Array]
-    # Sequence of DataFrames or LazyFrames.
-    # @param on [Object]
-    # One or more columns whose unique values will be used to align the frames.
-    # @param select [Object]
-    # Optional post-alignment column select to constrain and/or order
-    # the columns returned from the newly aligned frames.
-    # @param reverse [Object]
-    # Sort the alignment column values in descending order; can be a single
-    # boolean or a list of booleans associated with each column in `on`.
-    #
-    # @return [Object]
-    #
-    # @example
-    # df1 = Polars::DataFrame.new(
-    # {
-    # "dt" => [Date.new(2022, 9, 1), Date.new(2022, 9, 2), Date.new(2022, 9, 3)],
-    # "x" => [3.5, 4.0, 1.0],
-    # "y" => [10.0, 2.5, 1.5]
-    # }
-    # )
-    # df2 = Polars::DataFrame.new(
-    # {
-    # "dt" => [Date.new(2022, 9, 2), Date.new(2022, 9, 3), Date.new(2022, 9, 1)],
-    # "x" => [8.0, 1.0, 3.5],
-    # "y" => [1.5, 12.0, 5.0]
-    # }
-    # )
-    # df3 = Polars::DataFrame.new(
-    # {
-    # "dt" => [Date.new(2022, 9, 3), Date.new(2022, 9, 2)],
-    # "x" => [2.0, 5.0],
-    # "y" => [2.5, 2.0]
-    # }
-    # )
-    # af1, af2, af3 = Polars.align_frames(
-    # df1, df2, df3, on: "dt", select: ["x", "y"]
-    # )
-    # (af1 * af2 * af3).fill_null(0).select(Polars.sum(Polars.col("*")).alias("dot"))
-    # # =>
-    # # shape: (3, 1)
-    # # ┌───────┐
-    # # │ dot │
-    # # │ --- │
-    # # │ f64 │
-    # # ╞═══════╡
-    # # │ 0.0 │
-    # # ├╌╌╌╌╌╌╌┤
-    # # │ 167.5 │
-    # # ├╌╌╌╌╌╌╌┤
-    # # │ 47.0 │
-    # # └───────┘
-    def align_frames(
-      *frames,
-      on:,
-      select: nil,
-      reverse: false
-    )
-      if frames.empty?
-        return []
-      elsif frames.map(&:class).uniq.length != 1
-        raise TypeError, "Input frames must be of a consistent type (all LazyFrame or all DataFrame)"
-      end
-
-      # establish the superset of all "on" column values, sort, and cache
-      eager = frames[0].is_a?(DataFrame)
-      alignment_frame = (
-        concat(frames.map { |df| df.lazy.select(on) })
-          .unique(maintain_order: false)
-          .sort(on, reverse: reverse)
-      )
-      alignment_frame = (
-        eager ? alignment_frame.collect.lazy : alignment_frame.cache
-      )
-      # finally, align all frames
-      aligned_frames =
-        frames.map do |df|
-          alignment_frame.join(
-            df.lazy,
-            on: alignment_frame.columns,
-            how: "left"
-          ).select(df.columns)
-        end
-      if !select.nil?
-        aligned_frames = aligned_frames.map { |df| df.select(select) }
-      end
-
-      eager ? aligned_frames.map(&:collect) : aligned_frames
+    # @return [Expr]
+    def to_list(name)
+      col(name).list
     end

-    #
+    # Compute the spearman rank correlation between two columns.
     #
-    #
-    # Number of elements in the `Series`
-    # @param dtype [Symbol]
-    # DataType of the elements, defaults to `:f64`
+    # Missing data will be excluded from the computation.
     #
-    # @
+    # @param a [Object]
+    # Column name or Expression.
+    # @param b [Object]
+    # Column name or Expression.
+    # @param ddof [Integer]
+    # Delta degrees of freedom
+    # @param propagate_nans [Boolean]
+    # If `True` any `NaN` encountered will lead to `NaN` in the output.
+    # Defaults to `False` where `NaN` are regarded as larger than any finite number
+    # and thus lead to the highest rank.
     #
-    # @
-
-
-    def ones(n, dtype: nil)
-      s = Series.new([1.0])
-      if dtype
-        s = s.cast(dtype)
-      end
-      s.new_from_index(0, n)
+    # @return [Expr]
+    def spearman_rank_corr(a, b, ddof: 1, propagate_nans: false)
+      corr(a, b, method: "spearman", ddof: ddof, propagate_nans: propagate_nans)
     end

-    #
-    #
-    # @param n [Integer]
-    # Number of elements in the `Series`
-    # @param dtype [Symbol]
-    # DataType of the elements, defaults to `:f64`
+    # Compute the pearson's correlation between two columns.
     #
-    # @
+    # @param a [Object]
+    # Column name or Expression.
+    # @param b [Object]
+    # Column name or Expression.
+    # @param ddof [Integer]
+    # Delta degrees of freedom
     #
-    # @
-
-
-    def zeros(n, dtype: nil)
-      s = Series.new([0.0])
-      if dtype
-        s = s.cast(dtype)
-      end
-      s.new_from_index(0, n)
-    end
-
-    private
-
-    def _ensure_datetime(value)
-      is_date_type = false
-      if !value.is_a?(::DateTime)
-        value = ::DateTime.new(value.year, value.month, value.day)
-        is_date_type = true
-      end
-      [value, is_date_type]
-    end
-
-    # TODO
-    def _interval_granularity(interval)
-      interval
+    # @return [Expr]
+    def pearson_corr(a, b, ddof: 1)
+      corr(a, b, method: "pearson", ddof: ddof)
     end
   end
 end
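Note: the hunk above removes `concat`, `date_range`, the commented-out `cut`, `align_frames`, `ones`, and `zeros` from `functions.rb` (they move into the new `data/lib/polars/functions/` files listed at the top of this diff) and reduces `spearman_rank_corr` and `pearson_corr` to thin wrappers around `corr`. A minimal usage sketch of the 0.9.0 wrappers as they appear in this hunk; the frame and column names are made up for illustration:

  require "polars-df"

  df = Polars::DataFrame.new({"a" => [1.0, 8.0, 3.0], "b" => [4.0, 5.0, 2.0]})

  # Both helpers now build the same expression as calling Polars.corr directly.
  df.select(Polars.spearman_rank_corr("a", "b"))
  df.select(Polars.corr("a", "b", method: "spearman", ddof: 1))

  df.select(Polars.pearson_corr("a", "b"))
  df.select(Polars.corr("a", "b", method: "pearson", ddof: 1))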
data/lib/polars/group_by.rb
CHANGED
@@ -38,7 +38,7 @@ module Polars
       temp_col = "__POLARS_GB_GROUP_INDICES"
       groups_df =
         @df.lazy
-          .
+          .with_row_index(name: temp_col)
           .group_by(@by, maintain_order: @maintain_order)
           .agg(Polars.col(temp_col))
           .collect(no_optimization: true)
@@ -47,7 +47,7 @@ module Polars

       # When grouping by a single column, group name is a single value
       # When grouping by multiple columns, group name is a tuple of values
-      if @by.is_a?(String) || @by.is_a?(Expr)
+      if @by.is_a?(::String) || @by.is_a?(Expr)
         _group_names = group_names.to_series.each
       else
         _group_names = group_names.iter_rows
@@ -415,7 +415,7 @@ module Polars
     # # │ Banana ┆ 2 │
     # # └────────┴───────┘
     def count
-      agg(Polars.count)
+      agg(Polars.len.alias("count"))
     end

     # Reduce the groups to the mean values.
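Note: the `GroupBy#count` change above swaps the removed `Polars.count` aggregation for `Polars.len.alias("count")`, so the result keeps its `count` column. A small sketch of what a caller sees, assuming the 0.9.0 API shown in this diff (the data is illustrative):

  df = Polars::DataFrame.new({"fruit" => ["Apple", "Apple", "Banana"]})

  # Still yields a "count" column; only the internal aggregation changed.
  df.group_by("fruit", maintain_order: true).count
  # Equivalent explicit spelling:
  df.group_by("fruit", maintain_order: true).agg(Polars.len.alias("count"))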
data/lib/polars/io.rb
CHANGED
@@ -115,10 +115,10 @@ module Polars
       sample_size: 1024,
       eol_char: "\n"
     )
-      _check_arg_is_1byte("sep", sep, false)
-      _check_arg_is_1byte("comment_char", comment_char, false)
-      _check_arg_is_1byte("quote_char", quote_char, true)
-      _check_arg_is_1byte("eol_char", eol_char, false)
+      Utils._check_arg_is_1byte("sep", sep, false)
+      Utils._check_arg_is_1byte("comment_char", comment_char, false)
+      Utils._check_arg_is_1byte("quote_char", quote_char, true)
+      Utils._check_arg_is_1byte("eol_char", eol_char, false)

       projection, columns = Utils.handle_projection_columns(columns)

@@ -264,9 +264,9 @@ module Polars
       parse_dates: false,
       eol_char: "\n"
     )
-      _check_arg_is_1byte("sep", sep, false)
-      _check_arg_is_1byte("comment_char", comment_char, false)
-      _check_arg_is_1byte("quote_char", quote_char, true)
+      Utils._check_arg_is_1byte("sep", sep, false)
+      Utils._check_arg_is_1byte("comment_char", comment_char, false)
+      Utils._check_arg_is_1byte("quote_char", quote_char, true)

       if Utils.pathlike?(source)
         source = Utils.normalise_filepath(source)
@@ -604,9 +604,12 @@ module Polars
     #
     # @param query [Object]
     # ActiveRecord::Relation or ActiveRecord::Result.
+    # @param schema_overrides [Hash]
+    # A hash mapping column names to dtypes, used to override the schema
+    # inferred from the query.
     #
     # @return [DataFrame]
-    def read_database(query)
+    def read_database(query, schema_overrides: nil)
       if !defined?(ActiveRecord)
         raise Error, "Active Record not available"
       end
@@ -616,14 +619,14 @@ module Polars
         query
       elsif query.is_a?(ActiveRecord::Relation)
         query.connection.select_all(query.to_sql)
-      elsif query.is_a?(String)
+      elsif query.is_a?(::String)
         ActiveRecord::Base.connection.select_all(query)
       else
         raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
       end

       data = {}
-      schema_overrides = {}
+      schema_overrides = (schema_overrides || {}).transform_keys(&:to_s)

       result.columns.each_with_index do |k, i|
         column_type = result.column_types[i]
@@ -652,12 +655,15 @@ module Polars
           when :integer
             Int64
           when :string, :text
-
+            String
           when :time
             Time
+          # TODO fix issue with null
+          # when :json, :jsonb
+          # Struct
           end

-        schema_overrides[k]
+        schema_overrides[k] ||= polars_type if polars_type
       end

       DataFrame.new(data, schema_overrides: schema_overrides)
@@ -836,7 +842,7 @@ module Polars
         source = Utils.normalise_filepath(source)
       end

-
+      Plr.ipc_schema(source)
     end

     # Get a schema of the Parquet file without reading data.
@@ -850,13 +856,13 @@ module Polars
         source = Utils.normalise_filepath(source)
       end

-
+      Plr.parquet_schema(source)
     end

     private

     def _prepare_file_arg(file)
-      if file.is_a?(String) && file =~ /\Ahttps?:\/\//
+      if file.is_a?(::String) && file =~ /\Ahttps?:\/\//
         raise ArgumentError, "use URI(...) for remote files"
       end

@@ -868,18 +874,5 @@ module Polars

       yield file
     end
-
-    def _check_arg_is_1byte(arg_name, arg, can_be_empty = false)
-      if arg.is_a?(String)
-        arg_byte_length = arg.bytesize
-        if can_be_empty
-          if arg_byte_length > 1
-            raise ArgumentError, "#{arg_name} should be a single byte character or empty, but is #{arg_byte_length} bytes long."
-          end
-        elsif arg_byte_length != 1
-          raise ArgumentError, "#{arg_name} should be a single byte character, but is #{arg_byte_length} bytes long."
-        end
-      end
-    end
   end
 end
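Note: the io.rb changes above delegate the single-byte argument checks to `Utils`, route the schema helpers through `Plr.ipc_schema` and `Plr.parquet_schema`, and add a `schema_overrides` keyword to `read_database`. A hedged sketch of the new keyword, assuming an Active Record connection is already configured; the table and column names are invented for illustration:

  # Force one column to a specific dtype; the remaining columns still use the
  # mapping derived from the Active Record column types.
  df = Polars.read_database(
    "SELECT name, score FROM players",
    schema_overrides: {"score" => Polars::Float32}
  )
  df.schema["score"] # expected to be Polars::Float32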