polars-df 0.7.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +41 -0
  3. data/Cargo.lock +353 -237
  4. data/Cargo.toml +0 -3
  5. data/LICENSE.txt +1 -1
  6. data/README.md +2 -2
  7. data/ext/polars/Cargo.toml +17 -6
  8. data/ext/polars/src/batched_csv.rs +6 -7
  9. data/ext/polars/src/conversion/anyvalue.rs +185 -0
  10. data/ext/polars/src/conversion/chunked_array.rs +140 -0
  11. data/ext/polars/src/{conversion.rs → conversion/mod.rs} +268 -347
  12. data/ext/polars/src/dataframe.rs +96 -116
  13. data/ext/polars/src/expr/array.rs +74 -0
  14. data/ext/polars/src/expr/categorical.rs +8 -1
  15. data/ext/polars/src/expr/datetime.rs +22 -56
  16. data/ext/polars/src/expr/general.rs +124 -37
  17. data/ext/polars/src/expr/list.rs +52 -4
  18. data/ext/polars/src/expr/meta.rs +48 -0
  19. data/ext/polars/src/expr/rolling.rs +16 -10
  20. data/ext/polars/src/expr/string.rs +68 -17
  21. data/ext/polars/src/expr/struct.rs +8 -4
  22. data/ext/polars/src/functions/aggregation.rs +6 -0
  23. data/ext/polars/src/functions/lazy.rs +103 -48
  24. data/ext/polars/src/functions/meta.rs +45 -1
  25. data/ext/polars/src/functions/range.rs +5 -10
  26. data/ext/polars/src/functions/string_cache.rs +14 -0
  27. data/ext/polars/src/{lazyframe.rs → lazyframe/mod.rs} +166 -41
  28. data/ext/polars/src/lib.rs +245 -187
  29. data/ext/polars/src/map/dataframe.rs +1 -1
  30. data/ext/polars/src/map/mod.rs +2 -2
  31. data/ext/polars/src/map/series.rs +6 -6
  32. data/ext/polars/src/object.rs +0 -30
  33. data/ext/polars/src/on_startup.rs +32 -0
  34. data/ext/polars/src/series/aggregation.rs +23 -0
  35. data/ext/polars/src/series/construction.rs +1 -1
  36. data/ext/polars/src/series/export.rs +2 -2
  37. data/ext/polars/src/{series.rs → series/mod.rs} +45 -21
  38. data/ext/polars/src/series/{set_at_idx.rs → scatter.rs} +18 -18
  39. data/ext/polars/src/utils.rs +1 -1
  40. data/lib/polars/array_expr.rb +449 -0
  41. data/lib/polars/array_name_space.rb +346 -0
  42. data/lib/polars/cat_expr.rb +24 -0
  43. data/lib/polars/cat_name_space.rb +75 -0
  44. data/lib/polars/config.rb +2 -2
  45. data/lib/polars/data_frame.rb +248 -108
  46. data/lib/polars/data_types.rb +195 -29
  47. data/lib/polars/date_time_expr.rb +41 -24
  48. data/lib/polars/date_time_name_space.rb +12 -12
  49. data/lib/polars/exceptions.rb +12 -1
  50. data/lib/polars/expr.rb +1080 -195
  51. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  52. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  53. data/lib/polars/functions/as_datatype.rb +248 -0
  54. data/lib/polars/functions/col.rb +47 -0
  55. data/lib/polars/functions/eager.rb +182 -0
  56. data/lib/polars/functions/lazy.rb +1280 -0
  57. data/lib/polars/functions/len.rb +49 -0
  58. data/lib/polars/functions/lit.rb +35 -0
  59. data/lib/polars/functions/random.rb +16 -0
  60. data/lib/polars/functions/range/date_range.rb +103 -0
  61. data/lib/polars/functions/range/int_range.rb +51 -0
  62. data/lib/polars/functions/repeat.rb +144 -0
  63. data/lib/polars/functions/whenthen.rb +27 -0
  64. data/lib/polars/functions.rb +29 -416
  65. data/lib/polars/group_by.rb +3 -3
  66. data/lib/polars/io.rb +21 -28
  67. data/lib/polars/lazy_frame.rb +390 -76
  68. data/lib/polars/list_expr.rb +152 -6
  69. data/lib/polars/list_name_space.rb +102 -0
  70. data/lib/polars/meta_expr.rb +175 -7
  71. data/lib/polars/series.rb +557 -59
  72. data/lib/polars/sql_context.rb +1 -1
  73. data/lib/polars/string_cache.rb +75 -0
  74. data/lib/polars/string_expr.rb +412 -96
  75. data/lib/polars/string_name_space.rb +4 -4
  76. data/lib/polars/struct_expr.rb +1 -1
  77. data/lib/polars/struct_name_space.rb +1 -1
  78. data/lib/polars/testing.rb +507 -0
  79. data/lib/polars/utils.rb +64 -20
  80. data/lib/polars/version.rb +1 -1
  81. data/lib/polars.rb +15 -2
  82. metadata +40 -9
  83. data/lib/polars/lazy_functions.rb +0 -1197
@@ -1,24 +1,28 @@
1
+ pub(crate) mod anyvalue;
2
+ mod chunked_array;
3
+
1
4
  use std::fmt::{Debug, Display, Formatter};
2
5
  use std::hash::{Hash, Hasher};
6
+ use std::num::NonZeroUsize;
3
7
 
4
- use magnus::encoding::{EncodingCapable, Index};
5
8
  use magnus::{
6
- class, exception, prelude::*, r_hash::ForEach, value::Opaque, Float, Integer, IntoValue,
7
- Module, RArray, RHash, RString, Ruby, Symbol, TryConvert, Value,
9
+ class, exception, prelude::*, r_hash::ForEach, value::Opaque, IntoValue, Module, RArray, RHash,
10
+ Ruby, Symbol, TryConvert, Value,
8
11
  };
9
12
  use polars::chunked_array::object::PolarsObjectSafe;
10
13
  use polars::chunked_array::ops::{FillNullLimit, FillNullStrategy};
11
14
  use polars::datatypes::AnyValue;
12
- use polars::frame::row::{any_values_to_dtype, Row};
15
+ use polars::frame::row::Row;
13
16
  use polars::frame::NullStrategy;
14
17
  use polars::io::avro::AvroCompression;
15
18
  use polars::prelude::*;
16
19
  use polars::series::ops::NullBehavior;
17
- use polars_core::utils::arrow::util::total_ord::TotalEq;
20
+ use polars_core::utils::arrow::array::Array;
21
+ use polars_utils::total_ord::{TotalEq, TotalHash};
18
22
  use smartstring::alias::String as SmartString;
19
23
 
20
24
  use crate::object::OBJECT_NAME;
21
- use crate::rb_modules::utils;
25
+ use crate::rb_modules::series;
22
26
  use crate::{RbDataFrame, RbLazyFrame, RbPolarsErr, RbResult, RbSeries, RbTypeError, RbValueError};
23
27
 
24
28
  pub(crate) fn slice_to_wrapped<T>(slice: &[T]) -> &[Wrap<T>] {
@@ -78,36 +82,11 @@ pub(crate) fn get_series(obj: Value) -> RbResult<Series> {
78
82
  Ok(rbs.series.borrow().clone())
79
83
  }
80
84
 
81
- impl TryConvert for Wrap<Utf8Chunked> {
82
- fn try_convert(obj: Value) -> RbResult<Self> {
83
- let (seq, len) = get_rbseq(obj)?;
84
- let mut builder = Utf8ChunkedBuilder::new("", len, len * 25);
85
-
86
- for res in seq.each() {
87
- let item = res?;
88
- match String::try_convert(item) {
89
- Ok(val) => builder.append_value(&val),
90
- Err(_) => builder.append_null(),
91
- }
92
- }
93
- Ok(Wrap(builder.finish()))
94
- }
95
- }
96
-
97
- impl TryConvert for Wrap<BinaryChunked> {
98
- fn try_convert(obj: Value) -> RbResult<Self> {
99
- let (seq, len) = get_rbseq(obj)?;
100
- let mut builder = BinaryChunkedBuilder::new("", len, len * 25);
101
-
102
- for res in seq.each() {
103
- let item = res?;
104
- match RString::try_convert(item) {
105
- Ok(val) => builder.append_value(unsafe { val.as_slice() }),
106
- Err(_) => builder.append_null(),
107
- }
108
- }
109
- Ok(Wrap(builder.finish()))
110
- }
85
+ pub(crate) fn to_series(s: RbSeries) -> Value {
86
+ let series = series();
87
+ series
88
+ .funcall::<_, _, Value>("_from_rbseries", (s,))
89
+ .unwrap()
111
90
  }
112
91
 
113
92
  impl TryConvert for Wrap<NullValues> {
@@ -134,102 +113,84 @@ fn struct_dict<'a>(vals: impl Iterator<Item = AnyValue<'a>>, flds: &[Field]) ->
134
113
  dict.into_value()
135
114
  }
136
115
 
137
- impl IntoValue for Wrap<AnyValue<'_>> {
138
- fn into_value_with(self, ruby: &Ruby) -> Value {
116
+ impl IntoValue for Wrap<DataType> {
117
+ fn into_value_with(self, _: &Ruby) -> Value {
118
+ let pl = crate::rb_modules::polars();
119
+
139
120
  match self.0 {
140
- AnyValue::UInt8(v) => ruby.into_value(v),
141
- AnyValue::UInt16(v) => ruby.into_value(v),
142
- AnyValue::UInt32(v) => ruby.into_value(v),
143
- AnyValue::UInt64(v) => ruby.into_value(v),
144
- AnyValue::Int8(v) => ruby.into_value(v),
145
- AnyValue::Int16(v) => ruby.into_value(v),
146
- AnyValue::Int32(v) => ruby.into_value(v),
147
- AnyValue::Int64(v) => ruby.into_value(v),
148
- AnyValue::Float32(v) => ruby.into_value(v),
149
- AnyValue::Float64(v) => ruby.into_value(v),
150
- AnyValue::Null => ruby.qnil().as_value(),
151
- AnyValue::Boolean(v) => ruby.into_value(v),
152
- AnyValue::Utf8(v) => ruby.into_value(v),
153
- AnyValue::Utf8Owned(v) => ruby.into_value(v.as_str()),
154
- AnyValue::Categorical(idx, rev, arr) => {
155
- let s = if arr.is_null() {
156
- rev.get(idx)
157
- } else {
158
- unsafe { arr.deref_unchecked().value(idx as usize) }
159
- };
160
- s.into_value()
121
+ DataType::Int8 => {
122
+ let class = pl.const_get::<_, Value>("Int8").unwrap();
123
+ class.funcall("new", ()).unwrap()
161
124
  }
162
- AnyValue::Date(v) => utils().funcall("_to_ruby_date", (v,)).unwrap(),
163
- AnyValue::Datetime(v, time_unit, time_zone) => {
164
- let time_unit = time_unit.to_ascii();
165
- utils()
166
- .funcall("_to_ruby_datetime", (v, time_unit, time_zone.clone()))
167
- .unwrap()
125
+ DataType::Int16 => {
126
+ let class = pl.const_get::<_, Value>("Int16").unwrap();
127
+ class.funcall("new", ()).unwrap()
168
128
  }
169
- AnyValue::Duration(v, time_unit) => {
170
- let time_unit = time_unit.to_ascii();
171
- utils()
172
- .funcall("_to_ruby_duration", (v, time_unit))
173
- .unwrap()
129
+ DataType::Int32 => {
130
+ let class = pl.const_get::<_, Value>("Int32").unwrap();
131
+ class.funcall("new", ()).unwrap()
174
132
  }
175
- AnyValue::Time(v) => utils().funcall("_to_ruby_time", (v,)).unwrap(),
176
- AnyValue::Array(v, _) | AnyValue::List(v) => RbSeries::new(v).to_a().into_value(),
177
- ref av @ AnyValue::Struct(_, _, flds) => struct_dict(av._iter_struct_av(), flds),
178
- AnyValue::StructOwned(payload) => struct_dict(payload.0.into_iter(), &payload.1),
179
- AnyValue::Object(v) => {
180
- let object = v.as_any().downcast_ref::<ObjectValue>().unwrap();
181
- object.to_object()
133
+ DataType::Int64 => {
134
+ let class = pl.const_get::<_, Value>("Int64").unwrap();
135
+ class.funcall("new", ()).unwrap()
182
136
  }
183
- AnyValue::ObjectOwned(v) => {
184
- let object = v.0.as_any().downcast_ref::<ObjectValue>().unwrap();
185
- object.to_object()
137
+ DataType::UInt8 => {
138
+ let class = pl.const_get::<_, Value>("UInt8").unwrap();
139
+ class.funcall("new", ()).unwrap()
140
+ }
141
+ DataType::UInt16 => {
142
+ let class = pl.const_get::<_, Value>("UInt16").unwrap();
143
+ class.funcall("new", ()).unwrap()
144
+ }
145
+ DataType::UInt32 => {
146
+ let class = pl.const_get::<_, Value>("UInt32").unwrap();
147
+ class.funcall("new", ()).unwrap()
148
+ }
149
+ DataType::UInt64 => {
150
+ let class = pl.const_get::<_, Value>("UInt64").unwrap();
151
+ class.funcall("new", ()).unwrap()
152
+ }
153
+ DataType::Float32 => {
154
+ let class = pl.const_get::<_, Value>("Float32").unwrap();
155
+ class.funcall("new", ()).unwrap()
156
+ }
157
+ DataType::Float64 => {
158
+ let class = pl.const_get::<_, Value>("Float64").unwrap();
159
+ class.funcall("new", ()).unwrap()
186
160
  }
187
- AnyValue::Binary(v) => RString::from_slice(v).into_value(),
188
- AnyValue::BinaryOwned(v) => RString::from_slice(&v).into_value(),
189
- AnyValue::Decimal(v, scale) => utils()
190
- .funcall("_to_ruby_decimal", (v.to_string(), -(scale as i32)))
191
- .unwrap(),
192
- }
193
- }
194
- }
195
-
196
- impl IntoValue for Wrap<DataType> {
197
- fn into_value_with(self, _: &Ruby) -> Value {
198
- let pl = crate::rb_modules::polars();
199
-
200
- match self.0 {
201
- DataType::Int8 => pl.const_get::<_, Value>("Int8").unwrap(),
202
- DataType::Int16 => pl.const_get::<_, Value>("Int16").unwrap(),
203
- DataType::Int32 => pl.const_get::<_, Value>("Int32").unwrap(),
204
- DataType::Int64 => pl.const_get::<_, Value>("Int64").unwrap(),
205
- DataType::UInt8 => pl.const_get::<_, Value>("UInt8").unwrap(),
206
- DataType::UInt16 => pl.const_get::<_, Value>("UInt16").unwrap(),
207
- DataType::UInt32 => pl.const_get::<_, Value>("UInt32").unwrap(),
208
- DataType::UInt64 => pl.const_get::<_, Value>("UInt64").unwrap(),
209
- DataType::Float32 => pl.const_get::<_, Value>("Float32").unwrap(),
210
- DataType::Float64 => pl.const_get::<_, Value>("Float64").unwrap(),
211
161
  DataType::Decimal(precision, scale) => {
212
- let decimal_class = pl.const_get::<_, Value>("Decimal").unwrap();
213
- decimal_class
162
+ let class = pl.const_get::<_, Value>("Decimal").unwrap();
163
+ class
214
164
  .funcall::<_, _, Value>("new", (precision, scale))
215
165
  .unwrap()
216
166
  }
217
- DataType::Boolean => pl.const_get::<_, Value>("Boolean").unwrap(),
218
- DataType::Utf8 => pl.const_get::<_, Value>("Utf8").unwrap(),
219
- DataType::Binary => pl.const_get::<_, Value>("Binary").unwrap(),
167
+ DataType::Boolean => {
168
+ let class = pl.const_get::<_, Value>("Boolean").unwrap();
169
+ class.funcall("new", ()).unwrap()
170
+ }
171
+ DataType::String => {
172
+ let class = pl.const_get::<_, Value>("String").unwrap();
173
+ class.funcall("new", ()).unwrap()
174
+ }
175
+ DataType::Binary => {
176
+ let class = pl.const_get::<_, Value>("Binary").unwrap();
177
+ class.funcall("new", ()).unwrap()
178
+ }
220
179
  DataType::Array(inner, size) => {
180
+ let class = pl.const_get::<_, Value>("Array").unwrap();
221
181
  let inner = Wrap(*inner);
222
- let list_class = pl.const_get::<_, Value>("Array").unwrap();
223
- list_class
224
- .funcall::<_, _, Value>("new", (size, inner))
225
- .unwrap()
182
+ let args = (inner, size);
183
+ class.funcall::<_, _, Value>("new", args).unwrap()
226
184
  }
227
185
  DataType::List(inner) => {
186
+ let class = pl.const_get::<_, Value>("List").unwrap();
228
187
  let inner = Wrap(*inner);
229
- let list_class = pl.const_get::<_, Value>("List").unwrap();
230
- list_class.funcall::<_, _, Value>("new", (inner,)).unwrap()
188
+ class.funcall::<_, _, Value>("new", (inner,)).unwrap()
189
+ }
190
+ DataType::Date => {
191
+ let class = pl.const_get::<_, Value>("Date").unwrap();
192
+ class.funcall("new", ()).unwrap()
231
193
  }
232
- DataType::Date => pl.const_get::<_, Value>("Date").unwrap(),
233
194
  DataType::Datetime(tu, tz) => {
234
195
  let datetime_class = pl.const_get::<_, Value>("Datetime").unwrap();
235
196
  datetime_class
@@ -242,9 +203,29 @@ impl IntoValue for Wrap<DataType> {
242
203
  .funcall::<_, _, Value>("new", (tu.to_ascii(),))
243
204
  .unwrap()
244
205
  }
245
- DataType::Object(_) => pl.const_get::<_, Value>("Object").unwrap(),
246
- DataType::Categorical(_) => pl.const_get::<_, Value>("Categorical").unwrap(),
247
- DataType::Time => pl.const_get::<_, Value>("Time").unwrap(),
206
+ DataType::Object(_, _) => {
207
+ let class = pl.const_get::<_, Value>("Object").unwrap();
208
+ class.funcall("new", ()).unwrap()
209
+ }
210
+ DataType::Categorical(_, ordering) => {
211
+ let class = pl.const_get::<_, Value>("Categorical").unwrap();
212
+ class.funcall("new", (Wrap(ordering),)).unwrap()
213
+ }
214
+ DataType::Enum(rev_map, _) => {
215
+ // we should always have an initialized rev_map coming from rust
216
+ let categories = rev_map.as_ref().unwrap().get_categories();
217
+ let class = pl.const_get::<_, Value>("Enum").unwrap();
218
+ let s = Series::from_arrow("category", categories.to_boxed()).unwrap();
219
+ let series = to_series(s.into());
220
+ class
221
+ .funcall::<_, _, Value>("new", (series,))
222
+ .unwrap()
223
+ .into()
224
+ }
225
+ DataType::Time => {
226
+ let class = pl.const_get::<_, Value>("Time").unwrap();
227
+ class.funcall("new", ()).unwrap()
228
+ }
248
229
  DataType::Struct(fields) => {
249
230
  let field_class = pl.const_get::<_, Value>("Field").unwrap();
250
231
  let iter = fields.iter().map(|fld| {
@@ -260,12 +241,31 @@ impl IntoValue for Wrap<DataType> {
260
241
  .funcall::<_, _, Value>("new", (fields,))
261
242
  .unwrap()
262
243
  }
263
- DataType::Null => pl.const_get::<_, Value>("Null").unwrap(),
264
- DataType::Unknown => pl.const_get::<_, Value>("Unknown").unwrap(),
244
+ DataType::Null => {
245
+ let class = pl.const_get::<_, Value>("Null").unwrap();
246
+ class.funcall("new", ()).unwrap()
247
+ }
248
+ DataType::Unknown => {
249
+ let class = pl.const_get::<_, Value>("Unknown").unwrap();
250
+ class.funcall("new", ()).unwrap()
251
+ }
252
+ DataType::BinaryOffset => {
253
+ unimplemented!()
254
+ }
265
255
  }
266
256
  }
267
257
  }
268
258
 
259
+ impl IntoValue for Wrap<CategoricalOrdering> {
260
+ fn into_value_with(self, _: &Ruby) -> Value {
261
+ let ordering = match self.0 {
262
+ CategoricalOrdering::Physical => "physical",
263
+ CategoricalOrdering::Lexical => "lexical",
264
+ };
265
+ ordering.into_value()
266
+ }
267
+ }
268
+
269
269
  impl IntoValue for Wrap<TimeUnit> {
270
270
  fn into_value_with(self, _: &Ruby) -> Value {
271
271
  let tu = match self.0 {
@@ -277,114 +277,6 @@ impl IntoValue for Wrap<TimeUnit> {
277
277
  }
278
278
  }
279
279
 
280
- impl IntoValue for Wrap<&Utf8Chunked> {
281
- fn into_value_with(self, _: &Ruby) -> Value {
282
- let iter = self.0.into_iter();
283
- RArray::from_iter(iter).into_value()
284
- }
285
- }
286
-
287
- impl IntoValue for Wrap<&BinaryChunked> {
288
- fn into_value_with(self, _: &Ruby) -> Value {
289
- let iter = self
290
- .0
291
- .into_iter()
292
- .map(|opt_bytes| opt_bytes.map(RString::from_slice));
293
- RArray::from_iter(iter).into_value()
294
- }
295
- }
296
-
297
- impl IntoValue for Wrap<&StructChunked> {
298
- fn into_value_with(self, _: &Ruby) -> Value {
299
- let s = self.0.clone().into_series();
300
- // todo! iterate its chunks and flatten.
301
- // make series::iter() accept a chunk index.
302
- let s = s.rechunk();
303
- let iter = s.iter().map(|av| {
304
- if let AnyValue::Struct(_, _, flds) = av {
305
- struct_dict(av._iter_struct_av(), flds)
306
- } else {
307
- unreachable!()
308
- }
309
- });
310
-
311
- RArray::from_iter(iter).into_value()
312
- }
313
- }
314
-
315
- impl IntoValue for Wrap<&DurationChunked> {
316
- fn into_value_with(self, _: &Ruby) -> Value {
317
- let utils = utils();
318
- let time_unit = Wrap(self.0.time_unit()).into_value();
319
- let iter = self.0.into_iter().map(|opt_v| {
320
- opt_v.map(|v| {
321
- utils
322
- .funcall::<_, _, Value>("_to_ruby_duration", (v, time_unit))
323
- .unwrap()
324
- })
325
- });
326
- RArray::from_iter(iter).into_value()
327
- }
328
- }
329
-
330
- impl IntoValue for Wrap<&DatetimeChunked> {
331
- fn into_value_with(self, _: &Ruby) -> Value {
332
- let utils = utils();
333
- let time_unit = Wrap(self.0.time_unit()).into_value();
334
- let time_zone = self.0.time_zone().clone().into_value();
335
- let iter = self.0.into_iter().map(|opt_v| {
336
- opt_v.map(|v| {
337
- utils
338
- .funcall::<_, _, Value>("_to_ruby_datetime", (v, time_unit, time_zone))
339
- .unwrap()
340
- })
341
- });
342
- RArray::from_iter(iter).into_value()
343
- }
344
- }
345
-
346
- impl IntoValue for Wrap<&TimeChunked> {
347
- fn into_value_with(self, _: &Ruby) -> Value {
348
- let utils = utils();
349
- let iter = self.0.into_iter().map(|opt_v| {
350
- opt_v.map(|v| utils.funcall::<_, _, Value>("_to_ruby_time", (v,)).unwrap())
351
- });
352
- RArray::from_iter(iter).into_value()
353
- }
354
- }
355
-
356
- impl IntoValue for Wrap<&DateChunked> {
357
- fn into_value_with(self, _: &Ruby) -> Value {
358
- let utils = utils();
359
- let iter = self.0.into_iter().map(|opt_v| {
360
- opt_v.map(|v| utils.funcall::<_, _, Value>("_to_ruby_date", (v,)).unwrap())
361
- });
362
- RArray::from_iter(iter).into_value()
363
- }
364
- }
365
-
366
- impl IntoValue for Wrap<&DecimalChunked> {
367
- fn into_value_with(self, _: &Ruby) -> Value {
368
- let utils = utils();
369
- let rb_scale = (-(self.0.scale() as i32)).into_value();
370
- let iter = self.0.into_iter().map(|opt_v| {
371
- opt_v.map(|v| {
372
- utils
373
- .funcall::<_, _, Value>("_to_ruby_decimal", (v.to_string(), rb_scale))
374
- .unwrap()
375
- })
376
- });
377
- RArray::from_iter(iter).into_value()
378
- }
379
- }
380
-
381
- fn abs_decimal_from_digits(digits: String, exp: i32) -> Option<(i128, usize)> {
382
- match digits.parse::<i128>() {
383
- Ok(v) => Some((v, ((digits.len() as i32) - exp) as usize)),
384
- Err(_) => None,
385
- }
386
- }
387
-
388
280
  impl TryConvert for Wrap<Field> {
389
281
  fn try_convert(ob: Value) -> RbResult<Self> {
390
282
  let name: String = ob.funcall("name", ())?;
@@ -406,10 +298,11 @@ impl TryConvert for Wrap<DataType> {
406
298
  "Polars::Int16" => DataType::Int16,
407
299
  "Polars::Int32" => DataType::Int32,
408
300
  "Polars::Int64" => DataType::Int64,
409
- "Polars::Utf8" => DataType::Utf8,
301
+ "Polars::String" => DataType::String,
410
302
  "Polars::Binary" => DataType::Binary,
411
303
  "Polars::Boolean" => DataType::Boolean,
412
- "Polars::Categorical" => DataType::Categorical(None),
304
+ "Polars::Categorical" => DataType::Categorical(None, Default::default()),
305
+ "Polars::Enum" => DataType::Enum(None, Default::default()),
413
306
  "Polars::Date" => DataType::Date,
414
307
  "Polars::Datetime" => DataType::Datetime(TimeUnit::Microseconds, None),
415
308
  "Polars::Time" => DataType::Time,
@@ -417,7 +310,7 @@ impl TryConvert for Wrap<DataType> {
417
310
  "Polars::Decimal" => DataType::Decimal(None, None),
418
311
  "Polars::Float32" => DataType::Float32,
419
312
  "Polars::Float64" => DataType::Float64,
420
- "Polars::Object" => DataType::Object(OBJECT_NAME),
313
+ "Polars::Object" => DataType::Object(OBJECT_NAME, None),
421
314
  "Polars::List" => DataType::List(Box::new(DataType::Null)),
422
315
  "Polars::Null" => DataType::Null,
423
316
  "Polars::Unknown" => DataType::Unknown,
@@ -431,6 +324,36 @@ impl TryConvert for Wrap<DataType> {
431
324
  } else if String::try_convert(ob).is_err() {
432
325
  let name = unsafe { ob.class().name() }.into_owned();
433
326
  match name.as_str() {
327
+ "Polars::Int8" => DataType::Int8,
328
+ "Polars::Int16" => DataType::Int16,
329
+ "Polars::Int32" => DataType::Int32,
330
+ "Polars::Int64" => DataType::Int64,
331
+ "Polars::UInt8" => DataType::UInt8,
332
+ "Polars::UInt16" => DataType::UInt16,
333
+ "Polars::UInt32" => DataType::UInt32,
334
+ "Polars::UInt64" => DataType::UInt64,
335
+ "Polars::String" => DataType::String,
336
+ "Polars::Binary" => DataType::Binary,
337
+ "Polars::Boolean" => DataType::Boolean,
338
+ "Polars::Categorical" => {
339
+ let ordering = ob
340
+ .funcall::<_, _, Wrap<CategoricalOrdering>>("ordering", ())?
341
+ .0;
342
+ DataType::Categorical(None, ordering)
343
+ }
344
+ "Polars::Enum" => {
345
+ let categories = ob.funcall("categories", ()).unwrap();
346
+ let s = get_series(categories)?;
347
+ let ca = s.str().map_err(RbPolarsErr::from)?;
348
+ let categories = ca.downcast_iter().next().unwrap().clone();
349
+ create_enum_data_type(categories)
350
+ }
351
+ "Polars::Date" => DataType::Date,
352
+ "Polars::Time" => DataType::Time,
353
+ "Polars::Float32" => DataType::Float32,
354
+ "Polars::Float64" => DataType::Float64,
355
+ "Polars::Null" => DataType::Null,
356
+ "Polars::Unknown" => DataType::Unknown,
434
357
  "Polars::Duration" => {
435
358
  let time_unit: Value = ob.funcall("time_unit", ()).unwrap();
436
359
  let time_unit = Wrap::<TimeUnit>::try_convert(time_unit)?.0;
@@ -477,17 +400,17 @@ impl TryConvert for Wrap<DataType> {
477
400
  "i16" => DataType::Int16,
478
401
  "i32" => DataType::Int32,
479
402
  "i64" => DataType::Int64,
480
- "str" => DataType::Utf8,
403
+ "str" => DataType::String,
481
404
  "bin" => DataType::Binary,
482
405
  "bool" => DataType::Boolean,
483
- "cat" => DataType::Categorical(None),
406
+ "cat" => DataType::Categorical(None, Default::default()),
484
407
  "date" => DataType::Date,
485
408
  "datetime" => DataType::Datetime(TimeUnit::Microseconds, None),
486
409
  "f32" => DataType::Float32,
487
410
  "time" => DataType::Time,
488
411
  "dur" => DataType::Duration(TimeUnit::Microseconds),
489
412
  "f64" => DataType::Float64,
490
- "obj" => DataType::Object(OBJECT_NAME),
413
+ "obj" => DataType::Object(OBJECT_NAME, None),
491
414
  "list" => DataType::List(Box::new(DataType::Boolean)),
492
415
  "null" => DataType::Null,
493
416
  "unk" => DataType::Unknown,
@@ -503,102 +426,6 @@ impl TryConvert for Wrap<DataType> {
503
426
  }
504
427
  }
505
428
 
506
- impl<'s> TryConvert for Wrap<AnyValue<'s>> {
507
- fn try_convert(ob: Value) -> RbResult<Self> {
508
- if ob.is_kind_of(class::true_class()) || ob.is_kind_of(class::false_class()) {
509
- Ok(AnyValue::Boolean(bool::try_convert(ob)?).into())
510
- } else if let Some(v) = Integer::from_value(ob) {
511
- Ok(AnyValue::Int64(v.to_i64()?).into())
512
- } else if let Some(v) = Float::from_value(ob) {
513
- Ok(AnyValue::Float64(v.to_f64()).into())
514
- } else if let Some(v) = RString::from_value(ob) {
515
- if v.enc_get() == Index::utf8() {
516
- Ok(AnyValue::Utf8Owned(v.to_string()?.into()).into())
517
- } else {
518
- Ok(AnyValue::BinaryOwned(unsafe { v.as_slice() }.to_vec()).into())
519
- }
520
- // call is_a? for ActiveSupport::TimeWithZone
521
- } else if ob.funcall::<_, _, bool>("is_a?", (class::time(),))? {
522
- let sec = ob.funcall::<_, _, i64>("to_i", ())?;
523
- let nsec = ob.funcall::<_, _, i64>("nsec", ())?;
524
- let v = sec * 1_000_000_000 + nsec;
525
- // TODO support time zone when possible
526
- // https://github.com/pola-rs/polars/issues/9103
527
- Ok(AnyValue::Datetime(v, TimeUnit::Nanoseconds, &None).into())
528
- } else if ob.is_nil() {
529
- Ok(AnyValue::Null.into())
530
- } else if let Some(dict) = RHash::from_value(ob) {
531
- let len = dict.len();
532
- let mut keys = Vec::with_capacity(len);
533
- let mut vals = Vec::with_capacity(len);
534
- dict.foreach(|k: Value, v: Value| {
535
- let key = String::try_convert(k)?;
536
- let val = Wrap::<AnyValue>::try_convert(v)?.0;
537
- let dtype = DataType::from(&val);
538
- keys.push(Field::new(&key, dtype));
539
- vals.push(val);
540
- Ok(ForEach::Continue)
541
- })?;
542
- Ok(Wrap(AnyValue::StructOwned(Box::new((vals, keys)))))
543
- } else if let Some(v) = RArray::from_value(ob) {
544
- if v.is_empty() {
545
- Ok(Wrap(AnyValue::List(Series::new_empty("", &DataType::Null))))
546
- } else {
547
- let list = v;
548
-
549
- let mut avs = Vec::with_capacity(25);
550
- let mut iter = list.each();
551
-
552
- for item in (&mut iter).take(25) {
553
- avs.push(Wrap::<AnyValue>::try_convert(item?)?.0)
554
- }
555
-
556
- let (dtype, _n_types) = any_values_to_dtype(&avs).map_err(RbPolarsErr::from)?;
557
-
558
- // push the rest
559
- avs.reserve(list.len());
560
- for item in iter {
561
- avs.push(Wrap::<AnyValue>::try_convert(item?)?.0)
562
- }
563
-
564
- let s = Series::from_any_values_and_dtype("", &avs, &dtype, true)
565
- .map_err(RbPolarsErr::from)?;
566
- Ok(Wrap(AnyValue::List(s)))
567
- }
568
- } else if ob.is_kind_of(crate::rb_modules::datetime()) {
569
- let sec: i64 = ob.funcall("to_i", ())?;
570
- let nsec: i64 = ob.funcall("nsec", ())?;
571
- Ok(Wrap(AnyValue::Datetime(
572
- sec * 1_000_000_000 + nsec,
573
- TimeUnit::Nanoseconds,
574
- &None,
575
- )))
576
- } else if ob.is_kind_of(crate::rb_modules::date()) {
577
- // convert to DateTime for UTC
578
- let v = ob
579
- .funcall::<_, _, Value>("to_datetime", ())?
580
- .funcall::<_, _, Value>("to_time", ())?
581
- .funcall::<_, _, i64>("to_i", ())?;
582
- Ok(Wrap(AnyValue::Date((v / 86400) as i32)))
583
- } else if ob.is_kind_of(crate::rb_modules::bigdecimal()) {
584
- let (sign, digits, _, exp): (i8, String, i32, i32) = ob.funcall("split", ()).unwrap();
585
- let (mut v, scale) = abs_decimal_from_digits(digits, exp).ok_or_else(|| {
586
- RbPolarsErr::other("BigDecimal is too large to fit in Decimal128".into())
587
- })?;
588
- if sign < 0 {
589
- // TODO better error
590
- v = v.checked_neg().unwrap();
591
- }
592
- Ok(Wrap(AnyValue::Decimal(v, scale)))
593
- } else {
594
- Err(RbPolarsErr::other(format!(
595
- "object type not supported {:?}",
596
- ob
597
- )))
598
- }
599
- }
600
- }
601
-
602
429
  impl<'s> TryConvert for Wrap<Row<'s>> {
603
430
  fn try_convert(ob: Value) -> RbResult<Self> {
604
431
  let mut vals: Vec<Wrap<AnyValue<'s>>> = Vec::new();
@@ -662,6 +489,15 @@ impl TotalEq for ObjectValue {
662
489
  }
663
490
  }
664
491
 
492
+ impl TotalHash for ObjectValue {
493
+ fn tot_hash<H>(&self, state: &mut H)
494
+ where
495
+ H: Hasher,
496
+ {
497
+ self.hash(state);
498
+ }
499
+ }
500
+
665
501
  impl Display for ObjectValue {
666
502
  fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
667
503
  write!(f, "{}", self.to_object())
@@ -715,24 +551,33 @@ impl Default for ObjectValue {
715
551
 
716
552
  pub(crate) fn dicts_to_rows(
717
553
  records: &Value,
718
- infer_schema_len: usize,
554
+ infer_schema_len: Option<usize>,
555
+ schema_columns: PlIndexSet<String>,
719
556
  ) -> RbResult<(Vec<Row>, Vec<String>)> {
557
+ let infer_schema_len = infer_schema_len.map(|n| std::cmp::max(1, n));
720
558
  let (dicts, len) = get_rbseq(*records)?;
721
559
 
722
- let mut key_names = PlIndexSet::new();
723
- for d in dicts.each().take(infer_schema_len) {
724
- let d = d?;
725
- let d = RHash::try_convert(d)?;
726
-
727
- d.foreach(|name: Value, _value: Value| {
728
- if let Some(v) = Symbol::from_value(name) {
729
- key_names.insert(v.name()?.into());
730
- } else {
731
- key_names.insert(String::try_convert(name)?);
732
- };
733
- Ok(ForEach::Continue)
734
- })?;
735
- }
560
+ let key_names = {
561
+ if !schema_columns.is_empty() {
562
+ schema_columns
563
+ } else {
564
+ let mut inferred_keys = PlIndexSet::new();
565
+ for d in dicts.each().take(infer_schema_len.unwrap_or(usize::MAX)) {
566
+ let d = d?;
567
+ let d = RHash::try_convert(d)?;
568
+
569
+ d.foreach(|name: Value, _value: Value| {
570
+ if let Some(v) = Symbol::from_value(name) {
571
+ inferred_keys.insert(v.name()?.into());
572
+ } else {
573
+ inferred_keys.insert(String::try_convert(name)?);
574
+ };
575
+ Ok(ForEach::Continue)
576
+ })?;
577
+ }
578
+ inferred_keys
579
+ }
580
+ };
736
581
 
737
582
  let mut rows = Vec::with_capacity(len);
738
583
 
@@ -891,7 +736,8 @@ impl TryConvert for Wrap<JoinType> {
891
736
  let parsed = match String::try_convert(ob)?.as_str() {
892
737
  "inner" => JoinType::Inner,
893
738
  "left" => JoinType::Left,
894
- "outer" => JoinType::Outer,
739
+ "outer" => JoinType::Outer { coalesce: false },
740
+ "outer_coalesce" => JoinType::Outer { coalesce: true },
895
741
  "semi" => JoinType::Semi,
896
742
  "anti" => JoinType::Anti,
897
743
  // #[cfg(feature = "cross_join")]
@@ -1061,6 +907,22 @@ impl TryConvert for Wrap<UniqueKeepStrategy> {
1061
907
  }
1062
908
  }
1063
909
 
910
+ impl TryConvert for Wrap<IpcCompression> {
911
+ fn try_convert(ob: Value) -> RbResult<Self> {
912
+ let parsed = match String::try_convert(ob)?.as_str() {
913
+ "lz4" => IpcCompression::LZ4,
914
+ "zstd" => IpcCompression::ZSTD,
915
+ v => {
916
+ return Err(RbValueError::new_err(format!(
917
+ "compression must be one of {{'lz4', 'zstd'}}, got {}",
918
+ v
919
+ )))
920
+ }
921
+ };
922
+ Ok(Wrap(parsed))
923
+ }
924
+ }
925
+
1064
926
  impl TryConvert for Wrap<SearchSortedSide> {
1065
927
  fn try_convert(ob: Value) -> RbResult<Self> {
1066
928
  let parsed = match String::try_convert(ob)?.as_str() {
@@ -1077,6 +939,56 @@ impl TryConvert for Wrap<SearchSortedSide> {
1077
939
  }
1078
940
  }
1079
941
 
942
+ impl TryConvert for Wrap<WindowMapping> {
943
+ fn try_convert(ob: Value) -> RbResult<Self> {
944
+ let parsed = match String::try_convert(ob)?.as_str() {
945
+ "group_to_rows" => WindowMapping::GroupsToRows,
946
+ "join" => WindowMapping::Join,
947
+ "explode" => WindowMapping::Explode,
948
+ v => {
949
+ return Err(RbValueError::new_err(format!(
950
+ "`mapping_strategy` must be one of {{'group_to_rows', 'join', 'explode'}}, got {v}",
951
+ )))
952
+ }
953
+ };
954
+ Ok(Wrap(parsed))
955
+ }
956
+ }
957
+
958
+ impl TryConvert for Wrap<JoinValidation> {
959
+ fn try_convert(ob: Value) -> RbResult<Self> {
960
+ let parsed = match String::try_convert(ob)?.as_str() {
961
+ "1:1" => JoinValidation::OneToOne,
962
+ "1:m" => JoinValidation::OneToMany,
963
+ "m:m" => JoinValidation::ManyToMany,
964
+ "m:1" => JoinValidation::ManyToOne,
965
+ v => {
966
+ return Err(RbValueError::new_err(format!(
967
+ "`validate` must be one of {{'m:m', 'm:1', '1:m', '1:1'}}, got {v}",
968
+ )))
969
+ }
970
+ };
971
+ Ok(Wrap(parsed))
972
+ }
973
+ }
974
+
975
+ impl TryConvert for Wrap<QuoteStyle> {
976
+ fn try_convert(ob: Value) -> RbResult<Self> {
977
+ let parsed = match String::try_convert(ob)?.as_str() {
978
+ "always" => QuoteStyle::Always,
979
+ "necessary" => QuoteStyle::Necessary,
980
+ "non_numeric" => QuoteStyle::NonNumeric,
981
+ "never" => QuoteStyle::Never,
982
+ v => {
983
+ return Err(RbValueError::new_err(format!(
984
+ "`quote_style` must be one of {{'always', 'necessary', 'non_numeric', 'never'}}, got {v}",
985
+ )))
986
+ },
987
+ };
988
+ Ok(Wrap(parsed))
989
+ }
990
+ }
991
+
1080
992
  pub fn parse_fill_null_strategy(
1081
993
  strategy: &str,
1082
994
  limit: FillNullLimit,
@@ -1149,3 +1061,12 @@ where
1149
1061
  {
1150
1062
  container.into_iter().map(|s| s.as_ref().into()).collect()
1151
1063
  }
1064
+
1065
+ impl TryConvert for Wrap<NonZeroUsize> {
1066
+ fn try_convert(ob: Value) -> RbResult<Self> {
1067
+ let v = usize::try_convert(ob)?;
1068
+ NonZeroUsize::new(v)
1069
+ .map(|v| Wrap(v))
1070
+ .ok_or(RbValueError::new_err("must be non-zero".into()))
1071
+ }
1072
+ }