polars-df 0.7.0-x86_64-linux → 0.9.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +41 -0
  3. data/Cargo.lock +353 -237
  4. data/Cargo.toml +0 -3
  5. data/LICENSE-THIRD-PARTY.txt +1978 -1459
  6. data/LICENSE.txt +1 -1
  7. data/README.md +2 -2
  8. data/lib/polars/3.1/polars.so +0 -0
  9. data/lib/polars/3.2/polars.so +0 -0
  10. data/lib/polars/{3.0 → 3.3}/polars.so +0 -0
  11. data/lib/polars/array_expr.rb +449 -0
  12. data/lib/polars/array_name_space.rb +346 -0
  13. data/lib/polars/cat_expr.rb +24 -0
  14. data/lib/polars/cat_name_space.rb +75 -0
  15. data/lib/polars/config.rb +2 -2
  16. data/lib/polars/data_frame.rb +248 -108
  17. data/lib/polars/data_types.rb +195 -29
  18. data/lib/polars/date_time_expr.rb +41 -24
  19. data/lib/polars/date_time_name_space.rb +12 -12
  20. data/lib/polars/exceptions.rb +12 -1
  21. data/lib/polars/expr.rb +1080 -195
  22. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  23. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  24. data/lib/polars/functions/as_datatype.rb +248 -0
  25. data/lib/polars/functions/col.rb +47 -0
  26. data/lib/polars/functions/eager.rb +182 -0
  27. data/lib/polars/functions/lazy.rb +1280 -0
  28. data/lib/polars/functions/len.rb +49 -0
  29. data/lib/polars/functions/lit.rb +35 -0
  30. data/lib/polars/functions/random.rb +16 -0
  31. data/lib/polars/functions/range/date_range.rb +103 -0
  32. data/lib/polars/functions/range/int_range.rb +51 -0
  33. data/lib/polars/functions/repeat.rb +144 -0
  34. data/lib/polars/functions/whenthen.rb +27 -0
  35. data/lib/polars/functions.rb +29 -416
  36. data/lib/polars/group_by.rb +3 -3
  37. data/lib/polars/io.rb +21 -28
  38. data/lib/polars/lazy_frame.rb +390 -76
  39. data/lib/polars/list_expr.rb +152 -6
  40. data/lib/polars/list_name_space.rb +102 -0
  41. data/lib/polars/meta_expr.rb +175 -7
  42. data/lib/polars/series.rb +557 -59
  43. data/lib/polars/sql_context.rb +1 -1
  44. data/lib/polars/string_cache.rb +75 -0
  45. data/lib/polars/string_expr.rb +412 -96
  46. data/lib/polars/string_name_space.rb +4 -4
  47. data/lib/polars/struct_expr.rb +1 -1
  48. data/lib/polars/struct_name_space.rb +1 -1
  49. data/lib/polars/testing.rb +507 -0
  50. data/lib/polars/utils.rb +64 -20
  51. data/lib/polars/version.rb +1 -1
  52. data/lib/polars.rb +15 -2
  53. metadata +36 -7
  54. data/lib/polars/lazy_functions.rb +0 -1197
data/lib/polars/functions.rb CHANGED
@@ -13,432 +13,45 @@ module Polars
  df.to_dummies(columns: columns)
  end
 
- # Aggregate multiple Dataframes/Series to a single DataFrame/Series.
+ # Aggregate to list.
  #
- # @param items [Object]
- # DataFrames/Series/LazyFrames to concatenate.
- # @param rechunk [Boolean]
- # Make sure that all data is in contiguous memory.
- # @param how ["vertical", "vertical_relaxed", "diagonal", "horizontal"]
- # LazyFrames do not support the `horizontal` strategy.
- #
- # - Vertical: applies multiple `vstack` operations.
- # - Diagonal: finds a union between the column schemas and fills missing column values with null.
- # - Horizontal: stacks Series horizontally and fills with nulls if the lengths don't match.
- # @param parallel [Boolean]
- # Only relevant for LazyFrames. This determines if the concatenated
- # lazy computations may be executed in parallel.
- #
- # @return [Object]
- #
- # @example
- # df1 = Polars::DataFrame.new({"a" => [1], "b" => [3]})
- # df2 = Polars::DataFrame.new({"a" => [2], "b" => [4]})
- # Polars.concat([df1, df2])
- # # =>
- # # shape: (2, 2)
- # # ┌─────┬─────┐
- # # │ a ┆ b │
- # # │ --- ┆ --- │
- # # │ i64 ┆ i64 │
- # # ╞═════╪═════╡
- # # │ 1 ┆ 3 │
- # # │ 2 ┆ 4 │
- # # └─────┴─────┘
- def concat(items, rechunk: true, how: "vertical", parallel: true)
- if items.empty?
- raise ArgumentError, "cannot concat empty list"
- end
-
- first = items[0]
- if first.is_a?(DataFrame)
- if how == "vertical"
- out = Utils.wrap_df(_concat_df(items))
- elsif how == "diagonal"
- out = Utils.wrap_df(_concat_df_diagonal(items))
- elsif how == "horizontal"
- out = Utils.wrap_df(_concat_df_horizontal(items))
- else
- raise ArgumentError, "how must be one of {{'vertical', 'diagonal', 'horizontal'}}, got #{how}"
- end
- elsif first.is_a?(LazyFrame)
- if how == "vertical"
- return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, false))
- elsif how == "vertical_relaxed"
- return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, true))
- elsif how == "diagonal"
- return Utils.wrap_ldf(_concat_lf_diagonal(items, rechunk, parallel, false))
- else
- raise ArgumentError, "Lazy only allows 'vertical', 'vertical_relaxed', and 'diagonal' concat strategy."
- end
- elsif first.is_a?(Series)
- # TODO
- out = Utils.wrap_s(_concat_series(items))
- elsif first.is_a?(Expr)
- out = first
- items[1..-1].each do |e|
- out = out.append(e)
- end
- else
- raise ArgumentError, "did not expect type: #{first.class.name} in 'Polars.concat'."
- end
-
- if rechunk
- out.rechunk
- else
- out
- end
- end
-
- # Create a range of type `Datetime` (or `Date`).
- #
- # @param start [Object]
- # Lower bound of the date range.
- # @param stop [Object]
- # Upper bound of the date range.
- # @param interval [Object]
- # Interval periods. It can be a polars duration string, such as `3d12h4m25s`
- # representing 3 days, 12 hours, 4 minutes, and 25 seconds.
- # @param lazy [Boolean]
- # Return an expression.
- # @param closed ["both", "left", "right", "none"]
- # Define whether the temporal window interval is closed or not.
- # @param name [String]
- # Name of the output Series.
- # @param time_unit [nil, "ns", "us", "ms"]
- # Set the time unit.
- # @param time_zone [String]
- # Optional timezone
- #
- # @return [Object]
- #
- # @note
- # If both `low` and `high` are passed as date types (not datetime), and the
- # interval granularity is no finer than 1d, the returned range is also of
- # type date. All other permutations return a datetime Series.
- #
- # @example Using polars duration string to specify the interval
- # Polars.date_range(Date.new(2022, 1, 1), Date.new(2022, 3, 1), "1mo", name: "drange")
- # # =>
- # # shape: (3,)
- # # Series: 'drange' [date]
- # # [
- # # 2022-01-01
- # # 2022-02-01
- # # 2022-03-01
- # # ]
- #
- # @example Using `timedelta` object to specify the interval:
- # Polars.date_range(
- # DateTime.new(1985, 1, 1),
- # DateTime.new(1985, 1, 10),
- # "1d12h",
- # time_unit: "ms"
- # )
- # # =>
- # # shape: (7,)
- # # Series: '' [datetime[ms]]
- # # [
- # # 1985-01-01 00:00:00
- # # 1985-01-02 12:00:00
- # # 1985-01-04 00:00:00
- # # 1985-01-05 12:00:00
- # # 1985-01-07 00:00:00
- # # 1985-01-08 12:00:00
- # # 1985-01-10 00:00:00
- # # ]
- def date_range(
- start,
- stop,
- interval,
- lazy: false,
- closed: "both",
- name: nil,
- time_unit: nil,
- time_zone: nil
- )
- if defined?(ActiveSupport::Duration) && interval.is_a?(ActiveSupport::Duration)
- raise Todo
- else
- interval = interval.to_s
- if interval.include?(" ")
- interval = interval.gsub(" ", "")
- end
- end
-
- if time_unit.nil?
- if interval.include?("ns")
- time_unit = "ns"
- else
- time_unit = "us"
- end
- end
-
- start_rbexpr = Utils.parse_as_expression(start)
- stop_rbexpr = Utils.parse_as_expression(stop)
-
- result = Utils.wrap_expr(
- _rb_date_range(start_rbexpr, stop_rbexpr, interval, closed, time_unit, time_zone)
- )
-
- result = result.alias(name.to_s)
-
- if !lazy
- return select(result).to_series
- end
-
- result
- end
-
- # Bin values into discrete values.
- #
- # @param s [Series]
- # Series to bin.
- # @param bins [Array]
- # Bins to create.
- # @param labels [Array]
- # Labels to assign to the bins. If given the length of labels must be
- # len(bins) + 1.
- # @param break_point_label [String]
- # Name given to the breakpoint column.
- # @param category_label [String]
- # Name given to the category column.
- #
- # @return [DataFrame]
- #
- # @note
- # This functionality is experimental and may change without it being considered a
- # breaking change.
- #
- # @example
- # a = Polars::Series.new("a", 13.times.map { |i| (-30 + i * 5) / 10.0 })
- # Polars.cut(a, [-1, 1])
- # # =>
- # # shape: (12, 3)
- # # ┌──────┬─────────────┬──────────────┐
- # # │ a ┆ break_point ┆ category │
- # # │ --- ┆ --- ┆ --- │
- # # │ f64 ┆ f64 ┆ cat │
- # # ╞══════╪═════════════╪══════════════╡
- # # │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
- # # │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
- # # │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
- # # │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
- # # │ ... ┆ ... ┆ ... │
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
- # # │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
- # # │ 1.5 ┆ inf ┆ (1.0, inf] │
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
- # # │ 2.0 ┆ inf ┆ (1.0, inf] │
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
- # # │ 2.5 ┆ inf ┆ (1.0, inf] │
- # # └──────┴─────────────┴──────────────┘
- # def cut(
- # s,
- # bins,
- # labels: nil,
- # break_point_label: "break_point",
- # category_label: "category"
- # )
- # var_nm = s.name
-
- # cuts_df = DataFrame.new(
- # [
- # Series.new(
- # break_point_label, bins, dtype: :f64
- # ).extend_constant(Float::INFINITY, 1)
- # ]
- # )
-
- # if labels
- # if labels.length != bins.length + 1
- # raise ArgumentError, "expected more labels"
- # end
- # cuts_df = cuts_df.with_column(Series.new(category_label, labels))
- # else
- # cuts_df = cuts_df.with_column(
- # Polars.format(
- # "({}, {}]",
- # Polars.col(break_point_label).shift_and_fill(1, -Float::INFINITY),
- # Polars.col(break_point_label)
- # ).alias(category_label)
- # )
- # end
-
- # cuts_df = cuts_df.with_column(Polars.col(category_label).cast(:cat))
-
- # s.cast(:f64)
- # .sort
- # .to_frame
- # .join_asof(
- # cuts_df,
- # left_on: var_nm,
- # right_on: break_point_label,
- # strategy: "forward"
- # )
- # end
-
- # Align a sequence of frames using the uique values from one or more columns as a key.
- #
- # Frames that do not contain the given key values have rows injected (with nulls
- # filling the non-key columns), and each resulting frame is sorted by the key.
- #
- # The original column order of input frames is not changed unless ``select`` is
- # specified (in which case the final column order is determined from that).
- #
- # Note that this does not result in a joined frame - you receive the same number
- # of frames back that you passed in, but each is now aligned by key and has
- # the same number of rows.
- #
- # @param frames [Array]
- # Sequence of DataFrames or LazyFrames.
- # @param on [Object]
- # One or more columns whose unique values will be used to align the frames.
- # @param select [Object]
- # Optional post-alignment column select to constrain and/or order
- # the columns returned from the newly aligned frames.
- # @param reverse [Object]
- # Sort the alignment column values in descending order; can be a single
- # boolean or a list of booleans associated with each column in `on`.
- #
- # @return [Object]
- #
- # @example
- # df1 = Polars::DataFrame.new(
- # {
- # "dt" => [Date.new(2022, 9, 1), Date.new(2022, 9, 2), Date.new(2022, 9, 3)],
- # "x" => [3.5, 4.0, 1.0],
- # "y" => [10.0, 2.5, 1.5]
- # }
- # )
- # df2 = Polars::DataFrame.new(
- # {
- # "dt" => [Date.new(2022, 9, 2), Date.new(2022, 9, 3), Date.new(2022, 9, 1)],
- # "x" => [8.0, 1.0, 3.5],
- # "y" => [1.5, 12.0, 5.0]
- # }
- # )
- # df3 = Polars::DataFrame.new(
- # {
- # "dt" => [Date.new(2022, 9, 3), Date.new(2022, 9, 2)],
- # "x" => [2.0, 5.0],
- # "y" => [2.5, 2.0]
- # }
- # )
- # af1, af2, af3 = Polars.align_frames(
- # df1, df2, df3, on: "dt", select: ["x", "y"]
- # )
- # (af1 * af2 * af3).fill_null(0).select(Polars.sum(Polars.col("*")).alias("dot"))
- # # =>
- # # shape: (3, 1)
- # # ┌───────┐
- # # │ dot │
- # # │ --- │
- # # │ f64 │
- # # ╞═══════╡
- # # │ 0.0 │
- # # ├╌╌╌╌╌╌╌┤
- # # │ 167.5 │
- # # ├╌╌╌╌╌╌╌┤
- # # │ 47.0 │
- # # └───────┘
- def align_frames(
- *frames,
- on:,
- select: nil,
- reverse: false
- )
- if frames.empty?
- return []
- elsif frames.map(&:class).uniq.length != 1
- raise TypeError, "Input frames must be of a consistent type (all LazyFrame or all DataFrame)"
- end
-
- # establish the superset of all "on" column values, sort, and cache
- eager = frames[0].is_a?(DataFrame)
- alignment_frame = (
- concat(frames.map { |df| df.lazy.select(on) })
- .unique(maintain_order: false)
- .sort(on, reverse: reverse)
- )
- alignment_frame = (
- eager ? alignment_frame.collect.lazy : alignment_frame.cache
- )
- # finally, align all frames
- aligned_frames =
- frames.map do |df|
- alignment_frame.join(
- df.lazy,
- on: alignment_frame.columns,
- how: "left"
- ).select(df.columns)
- end
- if !select.nil?
- aligned_frames = aligned_frames.map { |df| df.select(select) }
- end
-
- eager ? aligned_frames.map(&:collect) : aligned_frames
+ # @return [Expr]
+ def to_list(name)
+ col(name).list
  end
 
- # Return a new Series of given length and type, filled with ones.
+ # Compute the spearman rank correlation between two columns.
  #
- # @param n [Integer]
- # Number of elements in the `Series`
- # @param dtype [Symbol]
- # DataType of the elements, defaults to `:f64`
+ # Missing data will be excluded from the computation.
  #
- # @return [Series]
+ # @param a [Object]
+ # Column name or Expression.
+ # @param b [Object]
+ # Column name or Expression.
+ # @param ddof [Integer]
+ # Delta degrees of freedom
+ # @param propagate_nans [Boolean]
+ # If `True` any `NaN` encountered will lead to `NaN` in the output.
+ # Defaults to `False` where `NaN` are regarded as larger than any finite number
+ # and thus lead to the highest rank.
  #
- # @note
- # In the lazy API you should probably not use this, but use `lit(1)`
- # instead.
- def ones(n, dtype: nil)
- s = Series.new([1.0])
- if dtype
- s = s.cast(dtype)
- end
- s.new_from_index(0, n)
+ # @return [Expr]
+ def spearman_rank_corr(a, b, ddof: 1, propagate_nans: false)
+ corr(a, b, method: "spearman", ddof: ddof, propagate_nans: propagate_nans)
  end
 
- # Return a new Series of given length and type, filled with zeros.
- #
- # @param n [Integer]
- # Number of elements in the `Series`
- # @param dtype [Symbol]
- # DataType of the elements, defaults to `:f64`
+ # Compute the pearson's correlation between two columns.
  #
- # @return [Series]
+ # @param a [Object]
+ # Column name or Expression.
+ # @param b [Object]
+ # Column name or Expression.
+ # @param ddof [Integer]
+ # Delta degrees of freedom
  #
- # @note
- # In the lazy API you should probably not use this, but use `lit(0)`
- # instead.
- def zeros(n, dtype: nil)
- s = Series.new([0.0])
- if dtype
- s = s.cast(dtype)
- end
- s.new_from_index(0, n)
- end
-
- private
-
- def _ensure_datetime(value)
- is_date_type = false
- if !value.is_a?(::DateTime)
- value = ::DateTime.new(value.year, value.month, value.day)
- is_date_type = true
- end
- [value, is_date_type]
- end
-
- # TODO
- def _interval_granularity(interval)
- interval
+ # @return [Expr]
+ def pearson_corr(a, b, ddof: 1)
+ corr(a, b, method: "pearson", ddof: ddof)
  end
  end
  end
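
The hunk above strips the old eager helpers out of functions.rb (they move into the new data/lib/polars/functions/ files listed earlier) and keeps only thin wrappers such as spearman_rank_corr and pearson_corr, which delegate to corr. A minimal usage sketch of the new wrappers; the DataFrame contents and aliases are illustrative only, not taken from the package:

  require "polars"

  # Hypothetical data, used only to exercise the wrappers added in this hunk.
  df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [2, 4, 5, 9]})

  # Both helpers return an Expr, so they are used inside select/agg.
  df.select(
    [
      Polars.spearman_rank_corr("a", "b").alias("spearman"),
      Polars.pearson_corr("a", "b").alias("pearson")
    ]
  )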
data/lib/polars/group_by.rb CHANGED
@@ -38,7 +38,7 @@ module Polars
  temp_col = "__POLARS_GB_GROUP_INDICES"
  groups_df =
  @df.lazy
- .with_row_count(name: temp_col)
+ .with_row_index(name: temp_col)
  .group_by(@by, maintain_order: @maintain_order)
  .agg(Polars.col(temp_col))
  .collect(no_optimization: true)
@@ -47,7 +47,7 @@ module Polars
 
  # When grouping by a single column, group name is a single value
  # When grouping by multiple columns, group name is a tuple of values
- if @by.is_a?(String) || @by.is_a?(Expr)
+ if @by.is_a?(::String) || @by.is_a?(Expr)
  _group_names = group_names.to_series.each
  else
  _group_names = group_names.iter_rows
@@ -415,7 +415,7 @@ module Polars
  # # │ Banana ┆ 2 │
  # # └────────┴───────┘
  def count
- agg(Polars.count)
+ agg(Polars.len.alias("count"))
  end
 
  # Reduce the groups to the mean values.
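
The count change above swaps Polars.count for Polars.len.alias("count"), so GroupBy#count keeps producing a column named "count". A short sketch of the equivalent call from user code; the "fruit" data is hypothetical, chosen to mirror the Banana example in the surrounding docs:

  require "polars"

  df = Polars::DataFrame.new({"fruit" => ["Apple", "Banana", "Banana"]})

  # GroupBy#count now expands to agg(Polars.len.alias("count")) internally,
  # so these two calls should produce the same result.
  df.group_by("fruit", maintain_order: true).count
  df.group_by("fruit", maintain_order: true).agg(Polars.len.alias("count"))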
data/lib/polars/io.rb CHANGED
@@ -115,10 +115,10 @@ module Polars
  sample_size: 1024,
  eol_char: "\n"
  )
- _check_arg_is_1byte("sep", sep, false)
- _check_arg_is_1byte("comment_char", comment_char, false)
- _check_arg_is_1byte("quote_char", quote_char, true)
- _check_arg_is_1byte("eol_char", eol_char, false)
+ Utils._check_arg_is_1byte("sep", sep, false)
+ Utils._check_arg_is_1byte("comment_char", comment_char, false)
+ Utils._check_arg_is_1byte("quote_char", quote_char, true)
+ Utils._check_arg_is_1byte("eol_char", eol_char, false)
 
  projection, columns = Utils.handle_projection_columns(columns)
 
@@ -264,9 +264,9 @@ module Polars
  parse_dates: false,
  eol_char: "\n"
  )
- _check_arg_is_1byte("sep", sep, false)
- _check_arg_is_1byte("comment_char", comment_char, false)
- _check_arg_is_1byte("quote_char", quote_char, true)
+ Utils._check_arg_is_1byte("sep", sep, false)
+ Utils._check_arg_is_1byte("comment_char", comment_char, false)
+ Utils._check_arg_is_1byte("quote_char", quote_char, true)
 
  if Utils.pathlike?(source)
  source = Utils.normalise_filepath(source)
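
These two hunks only move the single-byte guard into Utils; the behavior it enforces is unchanged. A rough sketch of what the guard rejects, assuming read_csv still takes a sep: keyword as the local variable names above suggest (the file name and separator are made up):

  require "polars"

  # The guard (shown near the end of this diff before its move to Utils)
  # raises before the file is even opened when a separator is longer than one byte.
  begin
    Polars.read_csv("example.csv", sep: "::")
  rescue ArgumentError => e
    puts e.message
    # expected: "sep should be a single byte character, but is 2 bytes long."
  end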
@@ -604,9 +604,12 @@ module Polars
604
604
  #
605
605
  # @param query [Object]
606
606
  # ActiveRecord::Relation or ActiveRecord::Result.
607
+ # @param schema_overrides [Hash]
608
+ # A hash mapping column names to dtypes, used to override the schema
609
+ # inferred from the query.
607
610
  #
608
611
  # @return [DataFrame]
609
- def read_database(query)
612
+ def read_database(query, schema_overrides: nil)
610
613
  if !defined?(ActiveRecord)
611
614
  raise Error, "Active Record not available"
612
615
  end
@@ -616,14 +619,14 @@ module Polars
616
619
  query
617
620
  elsif query.is_a?(ActiveRecord::Relation)
618
621
  query.connection.select_all(query.to_sql)
619
- elsif query.is_a?(String)
622
+ elsif query.is_a?(::String)
620
623
  ActiveRecord::Base.connection.select_all(query)
621
624
  else
622
625
  raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
623
626
  end
624
627
 
625
628
  data = {}
626
- schema_overrides = {}
629
+ schema_overrides = (schema_overrides || {}).transform_keys(&:to_s)
627
630
 
628
631
  result.columns.each_with_index do |k, i|
629
632
  column_type = result.column_types[i]
@@ -652,12 +655,15 @@ module Polars
652
655
  when :integer
653
656
  Int64
654
657
  when :string, :text
655
- Utf8
658
+ String
656
659
  when :time
657
660
  Time
661
+ # TODO fix issue with null
662
+ # when :json, :jsonb
663
+ # Struct
658
664
  end
659
665
 
660
- schema_overrides[k] = polars_type if polars_type
666
+ schema_overrides[k] ||= polars_type if polars_type
661
667
  end
662
668
 
663
669
  DataFrame.new(data, schema_overrides: schema_overrides)
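
The read_database hunks add a user-facing schema_overrides keyword; because the inferred dtype is now assigned with ||=, an override passed by the caller wins over the type derived from the Active Record column. A minimal sketch, assuming an established Active Record connection; the query and column names are hypothetical:

  require "polars"

  df = Polars.read_database(
    "SELECT id, price FROM products",
    schema_overrides: {"price" => Polars::Float32}
  )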
@@ -836,7 +842,7 @@ module Polars
  source = Utils.normalise_filepath(source)
  end
 
- _ipc_schema(source)
+ Plr.ipc_schema(source)
  end
 
  # Get a schema of the Parquet file without reading data.
@@ -850,13 +856,13 @@ module Polars
  source = Utils.normalise_filepath(source)
  end
 
- _parquet_schema(source)
+ Plr.parquet_schema(source)
  end
 
  private
 
  def _prepare_file_arg(file)
- if file.is_a?(String) && file =~ /\Ahttps?:\/\//
+ if file.is_a?(::String) && file =~ /\Ahttps?:\/\//
  raise ArgumentError, "use URI(...) for remote files"
  end
 
@@ -868,18 +874,5 @@ module Polars
 
  yield file
  end
-
- def _check_arg_is_1byte(arg_name, arg, can_be_empty = false)
- if arg.is_a?(String)
- arg_byte_length = arg.bytesize
- if can_be_empty
- if arg_byte_length > 1
- raise ArgumentError, "#{arg_name} should be a single byte character or empty, but is #{arg_byte_length} bytes long."
- end
- elsif arg_byte_length != 1
- raise ArgumentError, "#{arg_name} should be a single byte character, but is #{arg_byte_length} bytes long."
- end
- end
- end
  end
  end
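
The _prepare_file_arg hunk keeps the existing rule that plain "http(s)://..." strings are rejected with "use URI(...) for remote files". A small sketch of the calling convention that rule implies, assuming read_csv routes its source through _prepare_file_arg; the URL is hypothetical:

  require "polars"
  require "uri"

  # Passing the bare string would raise ArgumentError; wrapping it in URI(...)
  # is the form the private helper expects for remote files.
  df = Polars.read_csv(URI("https://example.com/data.csv"))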