polars-df 0.8.0-arm64-darwin → 0.10.0-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +42 -1
  3. data/Cargo.lock +159 -66
  4. data/Cargo.toml +0 -3
  5. data/LICENSE-THIRD-PARTY.txt +3112 -1613
  6. data/LICENSE.txt +1 -1
  7. data/README.md +3 -2
  8. data/lib/polars/3.1/polars.bundle +0 -0
  9. data/lib/polars/3.2/polars.bundle +0 -0
  10. data/lib/polars/3.3/polars.bundle +0 -0
  11. data/lib/polars/array_expr.rb +453 -0
  12. data/lib/polars/array_name_space.rb +346 -0
  13. data/lib/polars/batched_csv_reader.rb +4 -2
  14. data/lib/polars/cat_expr.rb +24 -0
  15. data/lib/polars/cat_name_space.rb +75 -0
  16. data/lib/polars/config.rb +2 -2
  17. data/lib/polars/data_frame.rb +306 -96
  18. data/lib/polars/data_types.rb +191 -28
  19. data/lib/polars/date_time_expr.rb +41 -18
  20. data/lib/polars/date_time_name_space.rb +9 -3
  21. data/lib/polars/exceptions.rb +12 -1
  22. data/lib/polars/expr.rb +898 -215
  23. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  24. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  25. data/lib/polars/functions/as_datatype.rb +248 -0
  26. data/lib/polars/functions/col.rb +47 -0
  27. data/lib/polars/functions/eager.rb +182 -0
  28. data/lib/polars/functions/lazy.rb +1280 -0
  29. data/lib/polars/functions/len.rb +49 -0
  30. data/lib/polars/functions/lit.rb +35 -0
  31. data/lib/polars/functions/random.rb +16 -0
  32. data/lib/polars/functions/range/date_range.rb +103 -0
  33. data/lib/polars/functions/range/int_range.rb +51 -0
  34. data/lib/polars/functions/repeat.rb +144 -0
  35. data/lib/polars/functions/whenthen.rb +96 -0
  36. data/lib/polars/functions.rb +29 -416
  37. data/lib/polars/group_by.rb +2 -2
  38. data/lib/polars/io.rb +36 -31
  39. data/lib/polars/lazy_frame.rb +405 -88
  40. data/lib/polars/list_expr.rb +158 -8
  41. data/lib/polars/list_name_space.rb +102 -0
  42. data/lib/polars/meta_expr.rb +175 -7
  43. data/lib/polars/series.rb +282 -41
  44. data/lib/polars/string_cache.rb +75 -0
  45. data/lib/polars/string_expr.rb +413 -96
  46. data/lib/polars/string_name_space.rb +4 -4
  47. data/lib/polars/testing.rb +507 -0
  48. data/lib/polars/utils.rb +106 -8
  49. data/lib/polars/version.rb +1 -1
  50. data/lib/polars/whenthen.rb +83 -0
  51. data/lib/polars.rb +16 -4
  52. metadata +34 -6
  53. data/lib/polars/lazy_functions.rb +0 -1181
  54. data/lib/polars/when.rb +0 -16
  55. data/lib/polars/when_then.rb +0 -19
@@ -13,432 +13,45 @@ module Polars
13
13
  df.to_dummies(columns: columns)
14
14
  end
15
15
 
16
- # Aggregate multiple Dataframes/Series to a single DataFrame/Series.
16
+ # Aggregate to list.
17
17
  #
18
- # @param items [Object]
19
- # DataFrames/Series/LazyFrames to concatenate.
20
- # @param rechunk [Boolean]
21
- # Make sure that all data is in contiguous memory.
22
- # @param how ["vertical", "vertical_relaxed", "diagonal", "horizontal"]
23
- # LazyFrames do not support the `horizontal` strategy.
24
- #
25
- # - Vertical: applies multiple `vstack` operations.
26
- # - Diagonal: finds a union between the column schemas and fills missing column values with null.
27
- # - Horizontal: stacks Series horizontally and fills with nulls if the lengths don't match.
28
- # @param parallel [Boolean]
29
- # Only relevant for LazyFrames. This determines if the concatenated
30
- # lazy computations may be executed in parallel.
31
- #
32
- # @return [Object]
33
- #
34
- # @example
35
- # df1 = Polars::DataFrame.new({"a" => [1], "b" => [3]})
36
- # df2 = Polars::DataFrame.new({"a" => [2], "b" => [4]})
37
- # Polars.concat([df1, df2])
38
- # # =>
39
- # # shape: (2, 2)
40
- # # ┌─────┬─────┐
41
- # # │ a ┆ b │
42
- # # │ --- ┆ --- │
43
- # # │ i64 ┆ i64 │
44
- # # ╞═════╪═════╡
45
- # # │ 1 ┆ 3 │
46
- # # │ 2 ┆ 4 │
47
- # # └─────┴─────┘
48
- def concat(items, rechunk: true, how: "vertical", parallel: true)
49
- if items.empty?
50
- raise ArgumentError, "cannot concat empty list"
51
- end
52
-
53
- first = items[0]
54
- if first.is_a?(DataFrame)
55
- if how == "vertical"
56
- out = Utils.wrap_df(_concat_df(items))
57
- elsif how == "diagonal"
58
- out = Utils.wrap_df(_concat_df_diagonal(items))
59
- elsif how == "horizontal"
60
- out = Utils.wrap_df(_concat_df_horizontal(items))
61
- else
62
- raise ArgumentError, "how must be one of {{'vertical', 'diagonal', 'horizontal'}}, got #{how}"
63
- end
64
- elsif first.is_a?(LazyFrame)
65
- if how == "vertical"
66
- return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, false))
67
- elsif how == "vertical_relaxed"
68
- return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, true))
69
- elsif how == "diagonal"
70
- return Utils.wrap_ldf(_concat_lf_diagonal(items, rechunk, parallel, false))
71
- else
72
- raise ArgumentError, "Lazy only allows 'vertical', 'vertical_relaxed', and 'diagonal' concat strategy."
73
- end
74
- elsif first.is_a?(Series)
75
- # TODO
76
- out = Utils.wrap_s(_concat_series(items))
77
- elsif first.is_a?(Expr)
78
- out = first
79
- items[1..-1].each do |e|
80
- out = out.append(e)
81
- end
82
- else
83
- raise ArgumentError, "did not expect type: #{first.class.name} in 'Polars.concat'."
84
- end
85
-
86
- if rechunk
87
- out.rechunk
88
- else
89
- out
90
- end
91
- end
92
-
93
- # Create a range of type `Datetime` (or `Date`).
94
- #
95
- # @param start [Object]
96
- # Lower bound of the date range.
97
- # @param stop [Object]
98
- # Upper bound of the date range.
99
- # @param interval [Object]
100
- # Interval periods. It can be a polars duration string, such as `3d12h4m25s`
101
- # representing 3 days, 12 hours, 4 minutes, and 25 seconds.
102
- # @param lazy [Boolean]
103
- # Return an expression.
104
- # @param closed ["both", "left", "right", "none"]
105
- # Define whether the temporal window interval is closed or not.
106
- # @param name [String]
107
- # Name of the output Series.
108
- # @param time_unit [nil, "ns", "us", "ms"]
109
- # Set the time unit.
110
- # @param time_zone [String]
111
- # Optional timezone
112
- #
113
- # @return [Object]
114
- #
115
- # @note
116
- # If both `low` and `high` are passed as date types (not datetime), and the
117
- # interval granularity is no finer than 1d, the returned range is also of
118
- # type date. All other permutations return a datetime Series.
119
- #
120
- # @example Using polars duration string to specify the interval
121
- # Polars.date_range(Date.new(2022, 1, 1), Date.new(2022, 3, 1), "1mo", name: "drange")
122
- # # =>
123
- # # shape: (3,)
124
- # # Series: 'drange' [date]
125
- # # [
126
- # # 2022-01-01
127
- # # 2022-02-01
128
- # # 2022-03-01
129
- # # ]
130
- #
131
- # @example Using `timedelta` object to specify the interval:
132
- # Polars.date_range(
133
- # DateTime.new(1985, 1, 1),
134
- # DateTime.new(1985, 1, 10),
135
- # "1d12h",
136
- # time_unit: "ms"
137
- # )
138
- # # =>
139
- # # shape: (7,)
140
- # # Series: '' [datetime[ms]]
141
- # # [
142
- # # 1985-01-01 00:00:00
143
- # # 1985-01-02 12:00:00
144
- # # 1985-01-04 00:00:00
145
- # # 1985-01-05 12:00:00
146
- # # 1985-01-07 00:00:00
147
- # # 1985-01-08 12:00:00
148
- # # 1985-01-10 00:00:00
149
- # # ]
150
- def date_range(
151
- start,
152
- stop,
153
- interval,
154
- lazy: false,
155
- closed: "both",
156
- name: nil,
157
- time_unit: nil,
158
- time_zone: nil
159
- )
160
- if defined?(ActiveSupport::Duration) && interval.is_a?(ActiveSupport::Duration)
161
- raise Todo
162
- else
163
- interval = interval.to_s
164
- if interval.include?(" ")
165
- interval = interval.gsub(" ", "")
166
- end
167
- end
168
-
169
- if time_unit.nil?
170
- if interval.include?("ns")
171
- time_unit = "ns"
172
- else
173
- time_unit = "us"
174
- end
175
- end
176
-
177
- start_rbexpr = Utils.parse_as_expression(start)
178
- stop_rbexpr = Utils.parse_as_expression(stop)
179
-
180
- result = Utils.wrap_expr(
181
- _rb_date_range(start_rbexpr, stop_rbexpr, interval, closed, time_unit, time_zone)
182
- )
183
-
184
- result = result.alias(name.to_s)
185
-
186
- if !lazy
187
- return select(result).to_series
188
- end
189
-
190
- result
191
- end
192
-
193
- # Bin values into discrete values.
194
- #
195
- # @param s [Series]
196
- # Series to bin.
197
- # @param bins [Array]
198
- # Bins to create.
199
- # @param labels [Array]
200
- # Labels to assign to the bins. If given the length of labels must be
201
- # len(bins) + 1.
202
- # @param break_point_label [String]
203
- # Name given to the breakpoint column.
204
- # @param category_label [String]
205
- # Name given to the category column.
206
- #
207
- # @return [DataFrame]
208
- #
209
- # @note
210
- # This functionality is experimental and may change without it being considered a
211
- # breaking change.
212
- #
213
- # @example
214
- # a = Polars::Series.new("a", 13.times.map { |i| (-30 + i * 5) / 10.0 })
215
- # Polars.cut(a, [-1, 1])
216
- # # =>
217
- # # shape: (12, 3)
218
- # # ┌──────┬─────────────┬──────────────┐
219
- # # │ a ┆ break_point ┆ category │
220
- # # │ --- ┆ --- ┆ --- │
221
- # # │ f64 ┆ f64 ┆ cat │
222
- # # ╞══════╪═════════════╪══════════════╡
223
- # # │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │
224
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
225
- # # │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │
226
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
227
- # # │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │
228
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
229
- # # │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │
230
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
231
- # # │ ... ┆ ... ┆ ... │
232
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
233
- # # │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │
234
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
235
- # # │ 1.5 ┆ inf ┆ (1.0, inf] │
236
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
237
- # # │ 2.0 ┆ inf ┆ (1.0, inf] │
238
- # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
239
- # # │ 2.5 ┆ inf ┆ (1.0, inf] │
240
- # # └──────┴─────────────┴──────────────┘
241
- # def cut(
242
- # s,
243
- # bins,
244
- # labels: nil,
245
- # break_point_label: "break_point",
246
- # category_label: "category"
247
- # )
248
- # var_nm = s.name
249
-
250
- # cuts_df = DataFrame.new(
251
- # [
252
- # Series.new(
253
- # break_point_label, bins, dtype: :f64
254
- # ).extend_constant(Float::INFINITY, 1)
255
- # ]
256
- # )
257
-
258
- # if labels
259
- # if labels.length != bins.length + 1
260
- # raise ArgumentError, "expected more labels"
261
- # end
262
- # cuts_df = cuts_df.with_column(Series.new(category_label, labels))
263
- # else
264
- # cuts_df = cuts_df.with_column(
265
- # Polars.format(
266
- # "({}, {}]",
267
- # Polars.col(break_point_label).shift_and_fill(1, -Float::INFINITY),
268
- # Polars.col(break_point_label)
269
- # ).alias(category_label)
270
- # )
271
- # end
272
-
273
- # cuts_df = cuts_df.with_column(Polars.col(category_label).cast(:cat))
274
-
275
- # s.cast(:f64)
276
- # .sort
277
- # .to_frame
278
- # .join_asof(
279
- # cuts_df,
280
- # left_on: var_nm,
281
- # right_on: break_point_label,
282
- # strategy: "forward"
283
- # )
284
- # end
285
-
286
- # Align a sequence of frames using the uique values from one or more columns as a key.
287
- #
288
- # Frames that do not contain the given key values have rows injected (with nulls
289
- # filling the non-key columns), and each resulting frame is sorted by the key.
290
- #
291
- # The original column order of input frames is not changed unless ``select`` is
292
- # specified (in which case the final column order is determined from that).
293
- #
294
- # Note that this does not result in a joined frame - you receive the same number
295
- # of frames back that you passed in, but each is now aligned by key and has
296
- # the same number of rows.
297
- #
298
- # @param frames [Array]
299
- # Sequence of DataFrames or LazyFrames.
300
- # @param on [Object]
301
- # One or more columns whose unique values will be used to align the frames.
302
- # @param select [Object]
303
- # Optional post-alignment column select to constrain and/or order
304
- # the columns returned from the newly aligned frames.
305
- # @param reverse [Object]
306
- # Sort the alignment column values in descending order; can be a single
307
- # boolean or a list of booleans associated with each column in `on`.
308
- #
309
- # @return [Object]
310
- #
311
- # @example
312
- # df1 = Polars::DataFrame.new(
313
- # {
314
- # "dt" => [Date.new(2022, 9, 1), Date.new(2022, 9, 2), Date.new(2022, 9, 3)],
315
- # "x" => [3.5, 4.0, 1.0],
316
- # "y" => [10.0, 2.5, 1.5]
317
- # }
318
- # )
319
- # df2 = Polars::DataFrame.new(
320
- # {
321
- # "dt" => [Date.new(2022, 9, 2), Date.new(2022, 9, 3), Date.new(2022, 9, 1)],
322
- # "x" => [8.0, 1.0, 3.5],
323
- # "y" => [1.5, 12.0, 5.0]
324
- # }
325
- # )
326
- # df3 = Polars::DataFrame.new(
327
- # {
328
- # "dt" => [Date.new(2022, 9, 3), Date.new(2022, 9, 2)],
329
- # "x" => [2.0, 5.0],
330
- # "y" => [2.5, 2.0]
331
- # }
332
- # )
333
- # af1, af2, af3 = Polars.align_frames(
334
- # df1, df2, df3, on: "dt", select: ["x", "y"]
335
- # )
336
- # (af1 * af2 * af3).fill_null(0).select(Polars.sum(Polars.col("*")).alias("dot"))
337
- # # =>
338
- # # shape: (3, 1)
339
- # # ┌───────┐
340
- # # │ dot │
341
- # # │ --- │
342
- # # │ f64 │
343
- # # ╞═══════╡
344
- # # │ 0.0 │
345
- # # ├╌╌╌╌╌╌╌┤
346
- # # │ 167.5 │
347
- # # ├╌╌╌╌╌╌╌┤
348
- # # │ 47.0 │
349
- # # └───────┘
350
- def align_frames(
351
- *frames,
352
- on:,
353
- select: nil,
354
- reverse: false
355
- )
356
- if frames.empty?
357
- return []
358
- elsif frames.map(&:class).uniq.length != 1
359
- raise TypeError, "Input frames must be of a consistent type (all LazyFrame or all DataFrame)"
360
- end
361
-
362
- # establish the superset of all "on" column values, sort, and cache
363
- eager = frames[0].is_a?(DataFrame)
364
- alignment_frame = (
365
- concat(frames.map { |df| df.lazy.select(on) })
366
- .unique(maintain_order: false)
367
- .sort(on, reverse: reverse)
368
- )
369
- alignment_frame = (
370
- eager ? alignment_frame.collect.lazy : alignment_frame.cache
371
- )
372
- # finally, align all frames
373
- aligned_frames =
374
- frames.map do |df|
375
- alignment_frame.join(
376
- df.lazy,
377
- on: alignment_frame.columns,
378
- how: "left"
379
- ).select(df.columns)
380
- end
381
- if !select.nil?
382
- aligned_frames = aligned_frames.map { |df| df.select(select) }
383
- end
384
-
385
- eager ? aligned_frames.map(&:collect) : aligned_frames
18
+ # @return [Expr]
19
+ def to_list(name)
20
+ col(name).list
386
21
  end
387
22
 
388
- # Return a new Series of given length and type, filled with ones.
23
+ # Compute the spearman rank correlation between two columns.
389
24
  #
390
- # @param n [Integer]
391
- # Number of elements in the `Series`
392
- # @param dtype [Symbol]
393
- # DataType of the elements, defaults to `:f64`
25
+ # Missing data will be excluded from the computation.
394
26
  #
395
- # @return [Series]
27
+ # @param a [Object]
28
+ # Column name or Expression.
29
+ # @param b [Object]
30
+ # Column name or Expression.
31
+ # @param ddof [Integer]
32
+ # Delta degrees of freedom
33
+ # @param propagate_nans [Boolean]
34
+ # If `True` any `NaN` encountered will lead to `NaN` in the output.
35
+ # Defaults to `False` where `NaN` are regarded as larger than any finite number
36
+ # and thus lead to the highest rank.
396
37
  #
397
- # @note
398
- # In the lazy API you should probably not use this, but use `lit(1)`
399
- # instead.
400
- def ones(n, dtype: nil)
401
- s = Series.new([1.0])
402
- if dtype
403
- s = s.cast(dtype)
404
- end
405
- s.new_from_index(0, n)
38
+ # @return [Expr]
39
+ def spearman_rank_corr(a, b, ddof: 1, propagate_nans: false)
40
+ corr(a, b, method: "spearman", ddof: ddof, propagate_nans: propagate_nans)
406
41
  end
407
42
 
408
- # Return a new Series of given length and type, filled with zeros.
409
- #
410
- # @param n [Integer]
411
- # Number of elements in the `Series`
412
- # @param dtype [Symbol]
413
- # DataType of the elements, defaults to `:f64`
43
+ # Compute the pearson's correlation between two columns.
414
44
  #
415
- # @return [Series]
45
+ # @param a [Object]
46
+ # Column name or Expression.
47
+ # @param b [Object]
48
+ # Column name or Expression.
49
+ # @param ddof [Integer]
50
+ # Delta degrees of freedom
416
51
  #
417
- # @note
418
- # In the lazy API you should probably not use this, but use `lit(0)`
419
- # instead.
420
- def zeros(n, dtype: nil)
421
- s = Series.new([0.0])
422
- if dtype
423
- s = s.cast(dtype)
424
- end
425
- s.new_from_index(0, n)
426
- end
427
-
428
- private
429
-
430
- def _ensure_datetime(value)
431
- is_date_type = false
432
- if !value.is_a?(::DateTime)
433
- value = ::DateTime.new(value.year, value.month, value.day)
434
- is_date_type = true
435
- end
436
- [value, is_date_type]
437
- end
438
-
439
- # TODO
440
- def _interval_granularity(interval)
441
- interval
52
+ # @return [Expr]
53
+ def pearson_corr(a, b, ddof: 1)
54
+ corr(a, b, method: "pearson", ddof: ddof)
442
55
  end
443
56
  end
444
57
  end
@@ -38,7 +38,7 @@ module Polars
38
38
  temp_col = "__POLARS_GB_GROUP_INDICES"
39
39
  groups_df =
40
40
  @df.lazy
41
- .with_row_count(name: temp_col)
41
+ .with_row_index(name: temp_col)
42
42
  .group_by(@by, maintain_order: @maintain_order)
43
43
  .agg(Polars.col(temp_col))
44
44
  .collect(no_optimization: true)
@@ -415,7 +415,7 @@ module Polars
415
415
  # # │ Banana ┆ 2 │
416
416
  # # └────────┴───────┘
417
417
  def count
418
- agg(Polars.count)
418
+ agg(Polars.len.alias("count"))
419
419
  end
420
420
 
421
421
  # Reduce the groups to the mean values.
data/lib/polars/io.rb CHANGED
@@ -80,6 +80,8 @@ module Polars
80
80
  # allocation needed.
81
81
  # @param eol_char [String]
82
82
  # Single byte end of line character.
83
+ # @param truncate_ragged_lines [Boolean]
84
+ # Truncate lines that are longer than the schema.
83
85
  #
84
86
  # @return [DataFrame]
85
87
  #
@@ -113,12 +115,13 @@ module Polars
113
115
  row_count_name: nil,
114
116
  row_count_offset: 0,
115
117
  sample_size: 1024,
116
- eol_char: "\n"
118
+ eol_char: "\n",
119
+ truncate_ragged_lines: false
117
120
  )
118
- _check_arg_is_1byte("sep", sep, false)
119
- _check_arg_is_1byte("comment_char", comment_char, false)
120
- _check_arg_is_1byte("quote_char", quote_char, true)
121
- _check_arg_is_1byte("eol_char", eol_char, false)
121
+ Utils._check_arg_is_1byte("sep", sep, false)
122
+ Utils._check_arg_is_1byte("comment_char", comment_char, false)
123
+ Utils._check_arg_is_1byte("quote_char", quote_char, true)
124
+ Utils._check_arg_is_1byte("eol_char", eol_char, false)
122
125
 
123
126
  projection, columns = Utils.handle_projection_columns(columns)
124
127
 
@@ -161,7 +164,8 @@ module Polars
161
164
  row_count_name: row_count_name,
162
165
  row_count_offset: row_count_offset,
163
166
  sample_size: sample_size,
164
- eol_char: eol_char
167
+ eol_char: eol_char,
168
+ truncate_ragged_lines: truncate_ragged_lines
165
169
  )
166
170
  end
167
171
 
@@ -239,6 +243,8 @@ module Polars
239
243
  # the column remains of data type `:str`.
240
244
  # @param eol_char [String]
241
245
  # Single byte end of line character.
246
+ # @param truncate_ragged_lines [Boolean]
247
+ # Truncate lines that are longer than the schema.
242
248
  #
243
249
  # @return [LazyFrame]
244
250
  def scan_csv(
@@ -262,11 +268,12 @@ module Polars
262
268
  row_count_name: nil,
263
269
  row_count_offset: 0,
264
270
  parse_dates: false,
265
- eol_char: "\n"
271
+ eol_char: "\n",
272
+ truncate_ragged_lines: false
266
273
  )
267
- _check_arg_is_1byte("sep", sep, false)
268
- _check_arg_is_1byte("comment_char", comment_char, false)
269
- _check_arg_is_1byte("quote_char", quote_char, true)
274
+ Utils._check_arg_is_1byte("sep", sep, false)
275
+ Utils._check_arg_is_1byte("comment_char", comment_char, false)
276
+ Utils._check_arg_is_1byte("quote_char", quote_char, true)
270
277
 
271
278
  if Utils.pathlike?(source)
272
279
  source = Utils.normalise_filepath(source)
@@ -294,6 +301,7 @@ module Polars
294
301
  row_count_offset: row_count_offset,
295
302
  parse_dates: parse_dates,
296
303
  eol_char: eol_char,
304
+ truncate_ragged_lines: truncate_ragged_lines
297
305
  )
298
306
  end
299
307
 
@@ -520,7 +528,7 @@ module Polars
520
528
 
521
529
  # Read into a DataFrame from a parquet file.
522
530
  #
523
- # @param source [Object]
531
+ # @param source [String, Pathname, StringIO]
524
532
  # Path to a file or a file-like object.
525
533
  # @param columns [Object]
526
534
  # Columns to select. Accepts a list of column indices (starting at zero) or a list
@@ -604,9 +612,12 @@ module Polars
604
612
  #
605
613
  # @param query [Object]
606
614
  # ActiveRecord::Relation or ActiveRecord::Result.
615
+ # @param schema_overrides [Hash]
616
+ # A hash mapping column names to dtypes, used to override the schema
617
+ # inferred from the query.
607
618
  #
608
619
  # @return [DataFrame]
609
- def read_database(query)
620
+ def read_database(query, schema_overrides: nil)
610
621
  if !defined?(ActiveRecord)
611
622
  raise Error, "Active Record not available"
612
623
  end
@@ -623,7 +634,7 @@ module Polars
623
634
  end
624
635
 
625
636
  data = {}
626
- schema_overrides = {}
637
+ schema_overrides = (schema_overrides || {}).transform_keys(&:to_s)
627
638
 
628
639
  result.columns.each_with_index do |k, i|
629
640
  column_type = result.column_types[i]
@@ -655,9 +666,12 @@ module Polars
655
666
  String
656
667
  when :time
657
668
  Time
669
+ # TODO fix issue with null
670
+ # when :json, :jsonb
671
+ # Struct
658
672
  end
659
673
 
660
- schema_overrides[k] = polars_type if polars_type
674
+ schema_overrides[k] ||= polars_type if polars_type
661
675
  end
662
676
 
663
677
  DataFrame.new(data, schema_overrides: schema_overrides)
@@ -749,6 +763,8 @@ module Polars
749
763
  # allocation needed.
750
764
  # @param eol_char [String]
751
765
  # Single byte end of line character.
766
+ # @param truncate_ragged_lines [Boolean]
767
+ # Truncate lines that are longer than the schema.
752
768
  #
753
769
  # @return [BatchedCsvReader]
754
770
  #
@@ -781,7 +797,8 @@ module Polars
781
797
  row_count_name: nil,
782
798
  row_count_offset: 0,
783
799
  sample_size: 1024,
784
- eol_char: "\n"
800
+ eol_char: "\n",
801
+ truncate_ragged_lines: false
785
802
  )
786
803
  projection, columns = Utils.handle_projection_columns(columns)
787
804
 
@@ -821,7 +838,8 @@ module Polars
821
838
  row_count_offset: row_count_offset,
822
839
  sample_size: sample_size,
823
840
  eol_char: eol_char,
824
- new_columns: new_columns
841
+ new_columns: new_columns,
842
+ truncate_ragged_lines: truncate_ragged_lines
825
843
  )
826
844
  end
827
845
 
@@ -836,7 +854,7 @@ module Polars
836
854
  source = Utils.normalise_filepath(source)
837
855
  end
838
856
 
839
- _ipc_schema(source)
857
+ Plr.ipc_schema(source)
840
858
  end
841
859
 
842
860
  # Get a schema of the Parquet file without reading data.
@@ -850,7 +868,7 @@ module Polars
850
868
  source = Utils.normalise_filepath(source)
851
869
  end
852
870
 
853
- _parquet_schema(source)
871
+ Plr.parquet_schema(source)
854
872
  end
855
873
 
856
874
  private
@@ -868,18 +886,5 @@ module Polars
868
886
 
869
887
  yield file
870
888
  end
871
-
872
- def _check_arg_is_1byte(arg_name, arg, can_be_empty = false)
873
- if arg.is_a?(::String)
874
- arg_byte_length = arg.bytesize
875
- if can_be_empty
876
- if arg_byte_length > 1
877
- raise ArgumentError, "#{arg_name} should be a single byte character or empty, but is #{arg_byte_length} bytes long."
878
- end
879
- elsif arg_byte_length != 1
880
- raise ArgumentError, "#{arg_name} should be a single byte character, but is #{arg_byte_length} bytes long."
881
- end
882
- end
883
- end
884
889
  end
885
890
  end