polars-df 0.8.0 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +30 -1
  3. data/Cargo.lock +107 -59
  4. data/Cargo.toml +0 -3
  5. data/LICENSE.txt +1 -1
  6. data/README.md +2 -2
  7. data/ext/polars/Cargo.toml +15 -7
  8. data/ext/polars/src/batched_csv.rs +4 -4
  9. data/ext/polars/src/conversion/anyvalue.rs +185 -0
  10. data/ext/polars/src/conversion/chunked_array.rs +140 -0
  11. data/ext/polars/src/{conversion.rs → conversion/mod.rs} +260 -340
  12. data/ext/polars/src/dataframe.rs +69 -53
  13. data/ext/polars/src/expr/array.rs +74 -0
  14. data/ext/polars/src/expr/datetime.rs +22 -56
  15. data/ext/polars/src/expr/general.rs +61 -33
  16. data/ext/polars/src/expr/list.rs +52 -4
  17. data/ext/polars/src/expr/meta.rs +48 -0
  18. data/ext/polars/src/expr/rolling.rs +1 -0
  19. data/ext/polars/src/expr/string.rs +59 -8
  20. data/ext/polars/src/expr/struct.rs +8 -4
  21. data/ext/polars/src/functions/aggregation.rs +6 -0
  22. data/ext/polars/src/functions/lazy.rs +103 -48
  23. data/ext/polars/src/functions/meta.rs +45 -1
  24. data/ext/polars/src/functions/string_cache.rs +14 -0
  25. data/ext/polars/src/{lazyframe.rs → lazyframe/mod.rs} +138 -22
  26. data/ext/polars/src/lib.rs +226 -168
  27. data/ext/polars/src/series/aggregation.rs +20 -0
  28. data/ext/polars/src/series/mod.rs +25 -4
  29. data/lib/polars/array_expr.rb +449 -0
  30. data/lib/polars/array_name_space.rb +346 -0
  31. data/lib/polars/cat_expr.rb +24 -0
  32. data/lib/polars/cat_name_space.rb +75 -0
  33. data/lib/polars/config.rb +2 -2
  34. data/lib/polars/data_frame.rb +179 -43
  35. data/lib/polars/data_types.rb +191 -28
  36. data/lib/polars/date_time_expr.rb +31 -14
  37. data/lib/polars/exceptions.rb +12 -1
  38. data/lib/polars/expr.rb +866 -186
  39. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  40. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  41. data/lib/polars/functions/as_datatype.rb +248 -0
  42. data/lib/polars/functions/col.rb +47 -0
  43. data/lib/polars/functions/eager.rb +182 -0
  44. data/lib/polars/functions/lazy.rb +1280 -0
  45. data/lib/polars/functions/len.rb +49 -0
  46. data/lib/polars/functions/lit.rb +35 -0
  47. data/lib/polars/functions/random.rb +16 -0
  48. data/lib/polars/functions/range/date_range.rb +103 -0
  49. data/lib/polars/functions/range/int_range.rb +51 -0
  50. data/lib/polars/functions/repeat.rb +144 -0
  51. data/lib/polars/functions/whenthen.rb +27 -0
  52. data/lib/polars/functions.rb +29 -416
  53. data/lib/polars/group_by.rb +2 -2
  54. data/lib/polars/io.rb +18 -25
  55. data/lib/polars/lazy_frame.rb +367 -53
  56. data/lib/polars/list_expr.rb +152 -6
  57. data/lib/polars/list_name_space.rb +102 -0
  58. data/lib/polars/meta_expr.rb +175 -7
  59. data/lib/polars/series.rb +273 -34
  60. data/lib/polars/string_cache.rb +75 -0
  61. data/lib/polars/string_expr.rb +412 -96
  62. data/lib/polars/string_name_space.rb +4 -4
  63. data/lib/polars/testing.rb +507 -0
  64. data/lib/polars/utils.rb +52 -8
  65. data/lib/polars/version.rb +1 -1
  66. data/lib/polars.rb +15 -2
  67. metadata +35 -5
  68. data/lib/polars/lazy_functions.rb +0 -1181
@@ -13,432 +13,45 @@ module Polars
13
13
  df.to_dummies(columns: columns)
14
14
  end
15
15
 
16
- # Aggregate multiple Dataframes/Series to a single DataFrame/Series.
16
+ # Aggregate to list.
17
17
  #
18
- # @param items [Object]
19
- # DataFrames/Series/LazyFrames to concatenate.
20
- # @param rechunk [Boolean]
21
- # Make sure that all data is in contiguous memory.
22
- # @param how ["vertical", "vertical_relaxed", "diagonal", "horizontal"]
23
- # LazyFrames do not support the `horizontal` strategy.
24
- #
25
- # - Vertical: applies multiple `vstack` operations.
26
- # - Diagonal: finds a union between the column schemas and fills missing column values with null.
27
- # - Horizontal: stacks Series horizontally and fills with nulls if the lengths don't match.
28
- # @param parallel [Boolean]
29
- # Only relevant for LazyFrames. This determines if the concatenated
30
- # lazy computations may be executed in parallel.
31
- #
32
- # @return [Object]
33
- #
34
- # @example
35
- # df1 = Polars::DataFrame.new({"a" => [1], "b" => [3]})
36
- # df2 = Polars::DataFrame.new({"a" => [2], "b" => [4]})
37
- # Polars.concat([df1, df2])
38
- # # =>
39
- # # shape: (2, 2)
40
- # # ┌─────┬─────┐
41
- # # │ a ┆ b │
42
- # # │ --- ┆ --- │
43
- # # │ i64 ┆ i64 │
44
- # # ╞═════╪═════╡
45
- # # │ 1 ┆ 3 │
46
- # # │ 2 ┆ 4 │
47
- # # └─────┴─────┘
48
- def concat(items, rechunk: true, how: "vertical", parallel: true)
49
- if items.empty?
50
- raise ArgumentError, "cannot concat empty list"
51
- end
52
-
53
- first = items[0]
54
- if first.is_a?(DataFrame)
55
- if how == "vertical"
56
- out = Utils.wrap_df(_concat_df(items))
57
- elsif how == "diagonal"
58
- out = Utils.wrap_df(_concat_df_diagonal(items))
59
- elsif how == "horizontal"
60
- out = Utils.wrap_df(_concat_df_horizontal(items))
61
- else
62
- raise ArgumentError, "how must be one of {{'vertical', 'diagonal', 'horizontal'}}, got #{how}"
63
- end
64
- elsif first.is_a?(LazyFrame)
65
- if how == "vertical"
66
- return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, false))
67
- elsif how == "vertical_relaxed"
68
- return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, true))
69
- elsif how == "diagonal"
70
- return Utils.wrap_ldf(_concat_lf_diagonal(items, rechunk, parallel, false))
71
- else
72
- raise ArgumentError, "Lazy only allows 'vertical', 'vertical_relaxed', and 'diagonal' concat strategy."
73
- end
74
- elsif first.is_a?(Series)
75
- # TODO
76
- out = Utils.wrap_s(_concat_series(items))
77
- elsif first.is_a?(Expr)
78
- out = first
79
- items[1..-1].each do |e|
80
- out = out.append(e)
81
- end
82
- else
83
- raise ArgumentError, "did not expect type: #{first.class.name} in 'Polars.concat'."
84
- end
85
-
86
- if rechunk
87
- out.rechunk
88
- else
89
- out
90
- end
91
- end
92
-
93
- # Create a range of type `Datetime` (or `Date`).
94
- #
95
- # @param start [Object]
96
- # Lower bound of the date range.
97
- # @param stop [Object]
98
- # Upper bound of the date range.
99
- # @param interval [Object]
100
- # Interval periods. It can be a polars duration string, such as `3d12h4m25s`
101
- # representing 3 days, 12 hours, 4 minutes, and 25 seconds.
102
- # @param lazy [Boolean]
103
- # Return an expression.
104
- # @param closed ["both", "left", "right", "none"]
105
- # Define whether the temporal window interval is closed or not.
106
- # @param name [String]
107
- # Name of the output Series.
108
- # @param time_unit [nil, "ns", "us", "ms"]
109
- # Set the time unit.
110
- # @param time_zone [String]
111
- # Optional timezone
112
- #
113
- # @return [Object]
114
- #
115
- # @note
116
- # If both `low` and `high` are passed as date types (not datetime), and the
117
- # interval granularity is no finer than 1d, the returned range is also of
118
- # type date. All other permutations return a datetime Series.
119
- #
120
- # @example Using polars duration string to specify the interval
121
- # Polars.date_range(Date.new(2022, 1, 1), Date.new(2022, 3, 1), "1mo", name: "drange")
122
- # # =>
123
- # # shape: (3,)
124
- # # Series: 'drange' [date]
125
- # # [
126
- # # 2022-01-01
127
- # # 2022-02-01
128
- # # 2022-03-01
129
- # # ]
130
- #
131
- # @example Using `timedelta` object to specify the interval:
132
- # Polars.date_range(
133
- # DateTime.new(1985, 1, 1),
134
- # DateTime.new(1985, 1, 10),
135
- # "1d12h",
136
- # time_unit: "ms"
137
- # )
138
- # # =>
139
- # # shape: (7,)
140
- # # Series: '' [datetime[ms]]
141
- # # [
142
- # # 1985-01-01 00:00:00
143
- # # 1985-01-02 12:00:00
144
- # # 1985-01-04 00:00:00
145
- # # 1985-01-05 12:00:00
146
- # # 1985-01-07 00:00:00
147
- # # 1985-01-08 12:00:00
148
- # # 1985-01-10 00:00:00
149
- # # ]
150
- def date_range(
151
- start,
152
- stop,
153
- interval,
154
- lazy: false,
155
- closed: "both",
156
- name: nil,
157
- time_unit: nil,
158
- time_zone: nil
159
- )
160
- if defined?(ActiveSupport::Duration) && interval.is_a?(ActiveSupport::Duration)
161
- raise Todo
162
- else
163
- interval = interval.to_s
164
- if interval.include?(" ")
165
- interval = interval.gsub(" ", "")
166
- end
167
- end
168
-
169
- if time_unit.nil?
170
- if interval.include?("ns")
171
- time_unit = "ns"
172
- else
173
- time_unit = "us"
174
- end
175
- end
176
-
177
- start_rbexpr = Utils.parse_as_expression(start)
178
- stop_rbexpr = Utils.parse_as_expression(stop)
179
-
180
- result = Utils.wrap_expr(
181
- _rb_date_range(start_rbexpr, stop_rbexpr, interval, closed, time_unit, time_zone)
182
- )
183
-
184
- result = result.alias(name.to_s)
185
-
186
- if !lazy
187
- return select(result).to_series
188
- end
189
-
190
- result
191
- end
192
-
193
- # Bin values into discrete values.
194
- #
195
- # @param s [Series]
196
- # Series to bin.
197
- # @param bins [Array]
198
- # Bins to create.
199
- # @param labels [Array]
200
- # Labels to assign to the bins. If given the length of labels must be
201
- # len(bins) + 1.
202
- # @param break_point_label [String]
203
- # Name given to the breakpoint column.
204
- # @param category_label [String]
205
- # Name given to the category column.
206
- #
207
- # @return [DataFrame]
208
- #
209
- # @note
210
- # This functionality is experimental and may change without it being considered a
211
- # breaking change.
212
- #
213
- # @example
214
- # a = Polars::Series.new("a", 13.times.map { |i| (-30 + i * 5) / 10.0 })
215
- # Polars.cut(a, [-1, 1])
216
- # # =>
217
- # # shape: (12, 3)
218
- # # ┌──────┬─────────────┬──────────────┐
219
- # # │ a ┆ break_point ┆ category │
220
- # # │ --- ┆ --- ┆ --- │
221
- # # │ f64 ┆ f64 ┆ cat │
222
- # # ╞══════╪═════════════╪══════════════╡
223
- # # │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │
224
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
225
- # # │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │
226
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
227
- # # │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │
228
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
229
- # # │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │
230
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
231
- # # │ ... ┆ ... ┆ ... │
232
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
233
- # # │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │
234
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
235
- # # │ 1.5 ┆ inf ┆ (1.0, inf] │
236
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
237
- # # │ 2.0 ┆ inf ┆ (1.0, inf] │
238
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
239
- # # │ 2.5 ┆ inf ┆ (1.0, inf] │
240
- # # └──────┴─────────────┴──────────────┘
241
- # def cut(
242
- # s,
243
- # bins,
244
- # labels: nil,
245
- # break_point_label: "break_point",
246
- # category_label: "category"
247
- # )
248
- # var_nm = s.name
249
-
250
- # cuts_df = DataFrame.new(
251
- # [
252
- # Series.new(
253
- # break_point_label, bins, dtype: :f64
254
- # ).extend_constant(Float::INFINITY, 1)
255
- # ]
256
- # )
257
-
258
- # if labels
259
- # if labels.length != bins.length + 1
260
- # raise ArgumentError, "expected more labels"
261
- # end
262
- # cuts_df = cuts_df.with_column(Series.new(category_label, labels))
263
- # else
264
- # cuts_df = cuts_df.with_column(
265
- # Polars.format(
266
- # "({}, {}]",
267
- # Polars.col(break_point_label).shift_and_fill(1, -Float::INFINITY),
268
- # Polars.col(break_point_label)
269
- # ).alias(category_label)
270
- # )
271
- # end
272
-
273
- # cuts_df = cuts_df.with_column(Polars.col(category_label).cast(:cat))
274
-
275
- # s.cast(:f64)
276
- # .sort
277
- # .to_frame
278
- # .join_asof(
279
- # cuts_df,
280
- # left_on: var_nm,
281
- # right_on: break_point_label,
282
- # strategy: "forward"
283
- # )
284
- # end
285
-
286
- # Align a sequence of frames using the unique values from one or more columns as a key.
287
- #
288
- # Frames that do not contain the given key values have rows injected (with nulls
289
- # filling the non-key columns), and each resulting frame is sorted by the key.
290
- #
291
- # The original column order of input frames is not changed unless ``select`` is
292
- # specified (in which case the final column order is determined from that).
293
- #
294
- # Note that this does not result in a joined frame - you receive the same number
295
- # of frames back that you passed in, but each is now aligned by key and has
296
- # the same number of rows.
297
- #
298
- # @param frames [Array]
299
- # Sequence of DataFrames or LazyFrames.
300
- # @param on [Object]
301
- # One or more columns whose unique values will be used to align the frames.
302
- # @param select [Object]
303
- # Optional post-alignment column select to constrain and/or order
304
- # the columns returned from the newly aligned frames.
305
- # @param reverse [Object]
306
- # Sort the alignment column values in descending order; can be a single
307
- # boolean or a list of booleans associated with each column in `on`.
308
- #
309
- # @return [Object]
310
- #
311
- # @example
312
- # df1 = Polars::DataFrame.new(
313
- # {
314
- # "dt" => [Date.new(2022, 9, 1), Date.new(2022, 9, 2), Date.new(2022, 9, 3)],
315
- # "x" => [3.5, 4.0, 1.0],
316
- # "y" => [10.0, 2.5, 1.5]
317
- # }
318
- # )
319
- # df2 = Polars::DataFrame.new(
320
- # {
321
- # "dt" => [Date.new(2022, 9, 2), Date.new(2022, 9, 3), Date.new(2022, 9, 1)],
322
- # "x" => [8.0, 1.0, 3.5],
323
- # "y" => [1.5, 12.0, 5.0]
324
- # }
325
- # )
326
- # df3 = Polars::DataFrame.new(
327
- # {
328
- # "dt" => [Date.new(2022, 9, 3), Date.new(2022, 9, 2)],
329
- # "x" => [2.0, 5.0],
330
- # "y" => [2.5, 2.0]
331
- # }
332
- # )
333
- # af1, af2, af3 = Polars.align_frames(
334
- # df1, df2, df3, on: "dt", select: ["x", "y"]
335
- # )
336
- # (af1 * af2 * af3).fill_null(0).select(Polars.sum(Polars.col("*")).alias("dot"))
337
- # # =>
338
- # # shape: (3, 1)
339
- # # ┌───────┐
340
- # # │ dot │
341
- # # │ --- │
342
- # # │ f64 │
343
- # # ╞═══════╡
344
- # # │ 0.0 │
345
- # # ├╌╌╌╌╌╌╌┤
346
- # # │ 167.5 │
347
- # # ├╌╌╌╌╌╌╌┤
348
- # # │ 47.0 │
349
- # # └───────┘
350
- def align_frames(
351
- *frames,
352
- on:,
353
- select: nil,
354
- reverse: false
355
- )
356
- if frames.empty?
357
- return []
358
- elsif frames.map(&:class).uniq.length != 1
359
- raise TypeError, "Input frames must be of a consistent type (all LazyFrame or all DataFrame)"
360
- end
361
-
362
- # establish the superset of all "on" column values, sort, and cache
363
- eager = frames[0].is_a?(DataFrame)
364
- alignment_frame = (
365
- concat(frames.map { |df| df.lazy.select(on) })
366
- .unique(maintain_order: false)
367
- .sort(on, reverse: reverse)
368
- )
369
- alignment_frame = (
370
- eager ? alignment_frame.collect.lazy : alignment_frame.cache
371
- )
372
- # finally, align all frames
373
- aligned_frames =
374
- frames.map do |df|
375
- alignment_frame.join(
376
- df.lazy,
377
- on: alignment_frame.columns,
378
- how: "left"
379
- ).select(df.columns)
380
- end
381
- if !select.nil?
382
- aligned_frames = aligned_frames.map { |df| df.select(select) }
383
- end
384
-
385
- eager ? aligned_frames.map(&:collect) : aligned_frames
18
+ # @return [Expr]
19
+ def to_list(name)
20
+ col(name).list
386
21
  end
387
22
 
388
- # Return a new Series of given length and type, filled with ones.
23
+ # Compute the Spearman rank correlation between two columns.
389
24
  #
390
- # @param n [Integer]
391
- # Number of elements in the `Series`
392
- # @param dtype [Symbol]
393
- # DataType of the elements, defaults to `:f64`
25
+ # Missing data will be excluded from the computation.
394
26
  #
395
- # @return [Series]
27
+ # @param a [Object]
28
+ # Column name or Expression.
29
+ # @param b [Object]
30
+ # Column name or Expression.
31
+ # @param ddof [Integer]
32
+ # Delta degrees of freedom
33
+ # @param propagate_nans [Boolean]
34
+ # If `True` any `NaN` encountered will lead to `NaN` in the output.
35
+ # Defaults to `False` where `NaN` are regarded as larger than any finite number
36
+ # and thus lead to the highest rank.
396
37
  #
397
- # @note
398
- # In the lazy API you should probably not use this, but use `lit(1)`
399
- # instead.
400
- def ones(n, dtype: nil)
401
- s = Series.new([1.0])
402
- if dtype
403
- s = s.cast(dtype)
404
- end
405
- s.new_from_index(0, n)
38
+ # @return [Expr]
39
+ def spearman_rank_corr(a, b, ddof: 1, propagate_nans: false)
40
+ corr(a, b, method: "spearman", ddof: ddof, propagate_nans: propagate_nans)
406
41
  end
407
42
 
408
- # Return a new Series of given length and type, filled with zeros.
409
- #
410
- # @param n [Integer]
411
- # Number of elements in the `Series`
412
- # @param dtype [Symbol]
413
- # DataType of the elements, defaults to `:f64`
43
+ # Compute the Pearson's correlation between two columns.
414
44
  #
415
- # @return [Series]
45
+ # @param a [Object]
46
+ # Column name or Expression.
47
+ # @param b [Object]
48
+ # Column name or Expression.
49
+ # @param ddof [Integer]
50
+ # Delta degrees of freedom
416
51
  #
417
- # @note
418
- # In the lazy API you should probably not use this, but use `lit(0)`
419
- # instead.
420
- def zeros(n, dtype: nil)
421
- s = Series.new([0.0])
422
- if dtype
423
- s = s.cast(dtype)
424
- end
425
- s.new_from_index(0, n)
426
- end
427
-
428
- private
429
-
430
- def _ensure_datetime(value)
431
- is_date_type = false
432
- if !value.is_a?(::DateTime)
433
- value = ::DateTime.new(value.year, value.month, value.day)
434
- is_date_type = true
435
- end
436
- [value, is_date_type]
437
- end
438
-
439
- # TODO
440
- def _interval_granularity(interval)
441
- interval
52
+ # @return [Expr]
53
+ def pearson_corr(a, b, ddof: 1)
54
+ corr(a, b, method: "pearson", ddof: ddof)
442
55
  end
443
56
  end
444
57
  end
@@ -38,7 +38,7 @@ module Polars
38
38
  temp_col = "__POLARS_GB_GROUP_INDICES"
39
39
  groups_df =
40
40
  @df.lazy
41
- .with_row_count(name: temp_col)
41
+ .with_row_index(name: temp_col)
42
42
  .group_by(@by, maintain_order: @maintain_order)
43
43
  .agg(Polars.col(temp_col))
44
44
  .collect(no_optimization: true)
@@ -415,7 +415,7 @@ module Polars
415
415
  # # │ Banana ┆ 2 │
416
416
  # # └────────┴───────┘
417
417
  def count
418
- agg(Polars.count)
418
+ agg(Polars.len.alias("count"))
419
419
  end
420
420
 
421
421
  # Reduce the groups to the mean values.
data/lib/polars/io.rb CHANGED
@@ -115,10 +115,10 @@ module Polars
115
115
  sample_size: 1024,
116
116
  eol_char: "\n"
117
117
  )
118
- _check_arg_is_1byte("sep", sep, false)
119
- _check_arg_is_1byte("comment_char", comment_char, false)
120
- _check_arg_is_1byte("quote_char", quote_char, true)
121
- _check_arg_is_1byte("eol_char", eol_char, false)
118
+ Utils._check_arg_is_1byte("sep", sep, false)
119
+ Utils._check_arg_is_1byte("comment_char", comment_char, false)
120
+ Utils._check_arg_is_1byte("quote_char", quote_char, true)
121
+ Utils._check_arg_is_1byte("eol_char", eol_char, false)
122
122
 
123
123
  projection, columns = Utils.handle_projection_columns(columns)
124
124
 
@@ -264,9 +264,9 @@ module Polars
264
264
  parse_dates: false,
265
265
  eol_char: "\n"
266
266
  )
267
- _check_arg_is_1byte("sep", sep, false)
268
- _check_arg_is_1byte("comment_char", comment_char, false)
269
- _check_arg_is_1byte("quote_char", quote_char, true)
267
+ Utils._check_arg_is_1byte("sep", sep, false)
268
+ Utils._check_arg_is_1byte("comment_char", comment_char, false)
269
+ Utils._check_arg_is_1byte("quote_char", quote_char, true)
270
270
 
271
271
  if Utils.pathlike?(source)
272
272
  source = Utils.normalise_filepath(source)
@@ -604,9 +604,12 @@ module Polars
604
604
  #
605
605
  # @param query [Object]
606
606
  # ActiveRecord::Relation or ActiveRecord::Result.
607
+ # @param schema_overrides [Hash]
608
+ # A hash mapping column names to dtypes, used to override the schema
609
+ # inferred from the query.
607
610
  #
608
611
  # @return [DataFrame]
609
- def read_database(query)
612
+ def read_database(query, schema_overrides: nil)
610
613
  if !defined?(ActiveRecord)
611
614
  raise Error, "Active Record not available"
612
615
  end
@@ -623,7 +626,7 @@ module Polars
623
626
  end
624
627
 
625
628
  data = {}
626
- schema_overrides = {}
629
+ schema_overrides = (schema_overrides || {}).transform_keys(&:to_s)
627
630
 
628
631
  result.columns.each_with_index do |k, i|
629
632
  column_type = result.column_types[i]
@@ -655,9 +658,12 @@ module Polars
655
658
  String
656
659
  when :time
657
660
  Time
661
+ # TODO fix issue with null
662
+ # when :json, :jsonb
663
+ # Struct
658
664
  end
659
665
 
660
- schema_overrides[k] = polars_type if polars_type
666
+ schema_overrides[k] ||= polars_type if polars_type
661
667
  end
662
668
 
663
669
  DataFrame.new(data, schema_overrides: schema_overrides)
@@ -836,7 +842,7 @@ module Polars
836
842
  source = Utils.normalise_filepath(source)
837
843
  end
838
844
 
839
- _ipc_schema(source)
845
+ Plr.ipc_schema(source)
840
846
  end
841
847
 
842
848
  # Get a schema of the Parquet file without reading data.
@@ -850,7 +856,7 @@ module Polars
850
856
  source = Utils.normalise_filepath(source)
851
857
  end
852
858
 
853
- _parquet_schema(source)
859
+ Plr.parquet_schema(source)
854
860
  end
855
861
 
856
862
  private
@@ -868,18 +874,5 @@ module Polars
868
874
 
869
875
  yield file
870
876
  end
871
-
872
- def _check_arg_is_1byte(arg_name, arg, can_be_empty = false)
873
- if arg.is_a?(::String)
874
- arg_byte_length = arg.bytesize
875
- if can_be_empty
876
- if arg_byte_length > 1
877
- raise ArgumentError, "#{arg_name} should be a single byte character or empty, but is #{arg_byte_length} bytes long."
878
- end
879
- elsif arg_byte_length != 1
880
- raise ArgumentError, "#{arg_name} should be a single byte character, but is #{arg_byte_length} bytes long."
881
- end
882
- end
883
- end
884
877
  end
885
878
  end