polars-df 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Cargo.lock +1 -1
- data/README.md +4 -2
- data/ext/polars/Cargo.toml +1 -1
- data/ext/polars/src/conversion.rs +152 -99
- data/ext/polars/src/dataframe.rs +75 -6
- data/ext/polars/src/lib.rs +6 -13
- data/ext/polars/src/object.rs +30 -0
- data/ext/polars/src/prelude.rs +3 -0
- data/ext/polars/src/rb_modules.rs +9 -0
- data/lib/polars/data_frame.rb +57 -2
- data/lib/polars/data_types.rb +67 -29
- data/lib/polars/series.rb +5 -0
- data/lib/polars/utils.rb +5 -5
- data/lib/polars/version.rb +1 -1
- data/lib/polars-df.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a07e6dd4bee3bef4020d7818a060c6f28aaedb7264b206e35a485e575cd8a695
|
4
|
+
data.tar.gz: c586e0ec898aab7642f49d49b54c614121b9cb1f748eb7c98d76611ae2ad56a2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ff035a9b60966342ca16dc5eea3b0abd0c4a08f5db8a0fc3c4d6ef206dd20ad56becbcd3a7ecdb5c328de6f9a52d53531eeb44c75078170d451bc39197553570
|
7
|
+
data.tar.gz: 48c7334a56339fb0feda046c415839ece39e2c92cf807ed422026d00787d687bbda2ba1198d612434379a2de92830841f0798109920f76d730205e612ab72cb1
|
data/CHANGELOG.md
CHANGED
data/Cargo.lock
CHANGED
data/README.md
CHANGED
@@ -56,6 +56,8 @@ From Active Record
|
|
56
56
|
|
57
57
|
```ruby
|
58
58
|
Polars.read_sql(User.all)
|
59
|
+
# or
|
60
|
+
Polars.read_sql("SELECT * FROM users")
|
59
61
|
```
|
60
62
|
|
61
63
|
From a hash
|
@@ -287,13 +289,13 @@ CSV
|
|
287
289
|
```ruby
|
288
290
|
df.to_csv
|
289
291
|
# or
|
290
|
-
df.write_csv("
|
292
|
+
df.write_csv("file.csv")
|
291
293
|
```
|
292
294
|
|
293
295
|
Parquet
|
294
296
|
|
295
297
|
```ruby
|
296
|
-
df.write_parquet("
|
298
|
+
df.write_parquet("file.parquet")
|
297
299
|
```
|
298
300
|
|
299
301
|
## Types
|
data/ext/polars/Cargo.toml
CHANGED
data/ext/polars/src/conversion.rs
CHANGED
@@ -144,7 +144,7 @@ impl From<Wrap<AnyValue<'_>>> for Value {
|
|
144
144
|
|
145
145
|
impl From<Wrap<DataType>> for Value {
|
146
146
|
fn from(w: Wrap<DataType>) -> Self {
|
147
|
-
let pl = crate::
|
147
|
+
let pl = crate::rb_modules::polars();
|
148
148
|
|
149
149
|
match &w.0 {
|
150
150
|
DataType::Int8 => pl.const_get::<_, Value>("Int8").unwrap(),
|
@@ -278,6 +278,22 @@ impl<'s> TryConvert for Wrap<AnyValue<'s>> {
|
|
278
278
|
Ok(AnyValue::Int64(v).into())
|
279
279
|
} else if let Ok(v) = ob.try_convert::<f64>() {
|
280
280
|
Ok(AnyValue::Float64(v).into())
|
281
|
+
} else if ob.is_nil() {
|
282
|
+
Ok(AnyValue::Null.into())
|
283
|
+
} else if ob.is_kind_of(class::hash()) {
|
284
|
+
let dict = ob.try_convert::<RHash>().unwrap();
|
285
|
+
let len = dict.len();
|
286
|
+
let mut keys = Vec::with_capacity(len);
|
287
|
+
let mut vals = Vec::with_capacity(len);
|
288
|
+
dict.foreach(|k: Value, v: Value| {
|
289
|
+
let key = k.try_convert::<String>()?;
|
290
|
+
let val = v.try_convert::<Wrap<AnyValue>>()?.0;
|
291
|
+
let dtype = DataType::from(&val);
|
292
|
+
keys.push(Field::new(&key, dtype));
|
293
|
+
vals.push(val);
|
294
|
+
Ok(ForEach::Continue)
|
295
|
+
})?;
|
296
|
+
Ok(Wrap(AnyValue::StructOwned(Box::new((vals, keys)))))
|
281
297
|
} else {
|
282
298
|
Err(RbPolarsErr::other(format!(
|
283
299
|
"object type not supported {:?}",
|
@@ -287,6 +303,141 @@ impl<'s> TryConvert for Wrap<AnyValue<'s>> {
|
|
287
303
|
}
|
288
304
|
}
|
289
305
|
|
306
|
+
impl<'s> TryConvert for Wrap<Row<'s>> {
|
307
|
+
fn try_convert(ob: Value) -> RbResult<Self> {
|
308
|
+
let mut vals: Vec<Wrap<AnyValue<'s>>> = Vec::new();
|
309
|
+
for item in ob.try_convert::<RArray>()?.each() {
|
310
|
+
vals.push(item?.try_convert::<Wrap<AnyValue<'s>>>()?);
|
311
|
+
}
|
312
|
+
let vals: Vec<AnyValue> = unsafe { std::mem::transmute(vals) };
|
313
|
+
Ok(Wrap(Row(vals)))
|
314
|
+
}
|
315
|
+
}
|
316
|
+
|
317
|
+
impl TryConvert for Wrap<Schema> {
|
318
|
+
fn try_convert(ob: Value) -> RbResult<Self> {
|
319
|
+
let dict = ob.try_convert::<RHash>()?;
|
320
|
+
|
321
|
+
let mut schema = Vec::new();
|
322
|
+
dict.foreach(|key: String, val: Wrap<DataType>| {
|
323
|
+
schema.push(Field::new(&key, val.0));
|
324
|
+
Ok(ForEach::Continue)
|
325
|
+
})
|
326
|
+
.unwrap();
|
327
|
+
|
328
|
+
Ok(Wrap(schema.into_iter().into()))
|
329
|
+
}
|
330
|
+
}
|
331
|
+
|
332
|
+
#[derive(Clone, Debug)]
|
333
|
+
pub struct ObjectValue {
|
334
|
+
pub inner: Value,
|
335
|
+
}
|
336
|
+
|
337
|
+
impl Hash for ObjectValue {
|
338
|
+
fn hash<H: Hasher>(&self, state: &mut H) {
|
339
|
+
let h = self
|
340
|
+
.inner
|
341
|
+
.funcall::<_, _, isize>("hash", ())
|
342
|
+
.expect("should be hashable");
|
343
|
+
state.write_isize(h)
|
344
|
+
}
|
345
|
+
}
|
346
|
+
|
347
|
+
impl Eq for ObjectValue {}
|
348
|
+
|
349
|
+
impl PartialEq for ObjectValue {
|
350
|
+
fn eq(&self, other: &Self) -> bool {
|
351
|
+
self.inner.eql(&other.inner).unwrap_or(false)
|
352
|
+
}
|
353
|
+
}
|
354
|
+
|
355
|
+
impl Display for ObjectValue {
|
356
|
+
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
357
|
+
write!(f, "{}", self.inner)
|
358
|
+
}
|
359
|
+
}
|
360
|
+
|
361
|
+
impl PolarsObject for ObjectValue {
|
362
|
+
fn type_name() -> &'static str {
|
363
|
+
"object"
|
364
|
+
}
|
365
|
+
}
|
366
|
+
|
367
|
+
impl From<Value> for ObjectValue {
|
368
|
+
fn from(v: Value) -> Self {
|
369
|
+
Self { inner: v }
|
370
|
+
}
|
371
|
+
}
|
372
|
+
|
373
|
+
impl TryConvert for ObjectValue {
|
374
|
+
fn try_convert(ob: Value) -> RbResult<Self> {
|
375
|
+
Ok(ObjectValue { inner: ob })
|
376
|
+
}
|
377
|
+
}
|
378
|
+
|
379
|
+
impl From<&dyn PolarsObjectSafe> for &ObjectValue {
|
380
|
+
fn from(val: &dyn PolarsObjectSafe) -> Self {
|
381
|
+
unsafe { &*(val as *const dyn PolarsObjectSafe as *const ObjectValue) }
|
382
|
+
}
|
383
|
+
}
|
384
|
+
|
385
|
+
// TODO remove
|
386
|
+
impl ObjectValue {
|
387
|
+
pub fn to_object(&self) -> Value {
|
388
|
+
self.inner
|
389
|
+
}
|
390
|
+
}
|
391
|
+
|
392
|
+
impl From<ObjectValue> for Value {
|
393
|
+
fn from(val: ObjectValue) -> Self {
|
394
|
+
val.inner
|
395
|
+
}
|
396
|
+
}
|
397
|
+
|
398
|
+
impl Default for ObjectValue {
|
399
|
+
fn default() -> Self {
|
400
|
+
ObjectValue { inner: *QNIL }
|
401
|
+
}
|
402
|
+
}
|
403
|
+
|
404
|
+
pub(crate) fn dicts_to_rows(
|
405
|
+
records: &Value,
|
406
|
+
infer_schema_len: usize,
|
407
|
+
) -> RbResult<(Vec<Row>, Vec<String>)> {
|
408
|
+
let (dicts, len) = get_rbseq(*records)?;
|
409
|
+
|
410
|
+
let mut key_names = PlIndexSet::new();
|
411
|
+
for d in dicts.each().take(infer_schema_len) {
|
412
|
+
let d = d?;
|
413
|
+
let d = d.try_convert::<RHash>()?;
|
414
|
+
|
415
|
+
d.foreach(|name: String, _value: Value| {
|
416
|
+
key_names.insert(name);
|
417
|
+
Ok(ForEach::Continue)
|
418
|
+
})?;
|
419
|
+
}
|
420
|
+
|
421
|
+
let mut rows = Vec::with_capacity(len);
|
422
|
+
|
423
|
+
for d in dicts.each() {
|
424
|
+
let d = d?;
|
425
|
+
let d = d.try_convert::<RHash>()?;
|
426
|
+
|
427
|
+
let mut row = Vec::with_capacity(key_names.len());
|
428
|
+
|
429
|
+
for k in key_names.iter() {
|
430
|
+
let val = match d.get(k.clone()) {
|
431
|
+
None => AnyValue::Null,
|
432
|
+
Some(val) => val.try_convert::<Wrap<AnyValue>>()?.0,
|
433
|
+
};
|
434
|
+
row.push(val)
|
435
|
+
}
|
436
|
+
rows.push(Row(row))
|
437
|
+
}
|
438
|
+
Ok((rows, key_names.into_iter().collect()))
|
439
|
+
}
|
440
|
+
|
290
441
|
impl TryConvert for Wrap<AsofStrategy> {
|
291
442
|
fn try_convert(ob: Value) -> RbResult<Self> {
|
292
443
|
let parsed = match ob.try_convert::<String>()?.as_str() {
|
@@ -641,101 +792,3 @@ pub fn parse_parquet_compression(
|
|
641
792
|
};
|
642
793
|
Ok(parsed)
|
643
794
|
}
|
644
|
-
|
645
|
-
impl<'s> TryConvert for Wrap<Row<'s>> {
|
646
|
-
fn try_convert(ob: Value) -> RbResult<Self> {
|
647
|
-
let mut vals: Vec<Wrap<AnyValue<'s>>> = Vec::new();
|
648
|
-
for item in ob.try_convert::<RArray>()?.each() {
|
649
|
-
vals.push(item?.try_convert::<Wrap<AnyValue<'s>>>()?);
|
650
|
-
}
|
651
|
-
let vals: Vec<AnyValue> = unsafe { std::mem::transmute(vals) };
|
652
|
-
Ok(Wrap(Row(vals)))
|
653
|
-
}
|
654
|
-
}
|
655
|
-
|
656
|
-
impl TryConvert for Wrap<Schema> {
|
657
|
-
fn try_convert(ob: Value) -> RbResult<Self> {
|
658
|
-
let dict = ob.try_convert::<RHash>()?;
|
659
|
-
|
660
|
-
let mut schema = Vec::new();
|
661
|
-
dict.foreach(|key: String, val: Wrap<DataType>| {
|
662
|
-
schema.push(Field::new(&key, val.0));
|
663
|
-
Ok(ForEach::Continue)
|
664
|
-
})
|
665
|
-
.unwrap();
|
666
|
-
|
667
|
-
Ok(Wrap(schema.into_iter().into()))
|
668
|
-
}
|
669
|
-
}
|
670
|
-
|
671
|
-
#[derive(Clone, Debug)]
|
672
|
-
pub struct ObjectValue {
|
673
|
-
pub inner: Value,
|
674
|
-
}
|
675
|
-
|
676
|
-
impl Hash for ObjectValue {
|
677
|
-
fn hash<H: Hasher>(&self, state: &mut H) {
|
678
|
-
let h = self
|
679
|
-
.inner
|
680
|
-
.funcall::<_, _, isize>("hash", ())
|
681
|
-
.expect("should be hashable");
|
682
|
-
state.write_isize(h)
|
683
|
-
}
|
684
|
-
}
|
685
|
-
|
686
|
-
impl Eq for ObjectValue {}
|
687
|
-
|
688
|
-
impl PartialEq for ObjectValue {
|
689
|
-
fn eq(&self, other: &Self) -> bool {
|
690
|
-
self.inner.eql(&other.inner).unwrap_or(false)
|
691
|
-
}
|
692
|
-
}
|
693
|
-
|
694
|
-
impl Display for ObjectValue {
|
695
|
-
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
696
|
-
write!(f, "{}", self.inner)
|
697
|
-
}
|
698
|
-
}
|
699
|
-
|
700
|
-
impl PolarsObject for ObjectValue {
|
701
|
-
fn type_name() -> &'static str {
|
702
|
-
"object"
|
703
|
-
}
|
704
|
-
}
|
705
|
-
|
706
|
-
impl From<Value> for ObjectValue {
|
707
|
-
fn from(v: Value) -> Self {
|
708
|
-
Self { inner: v }
|
709
|
-
}
|
710
|
-
}
|
711
|
-
|
712
|
-
impl TryConvert for ObjectValue {
|
713
|
-
fn try_convert(ob: Value) -> RbResult<Self> {
|
714
|
-
Ok(ObjectValue { inner: ob })
|
715
|
-
}
|
716
|
-
}
|
717
|
-
|
718
|
-
impl From<&dyn PolarsObjectSafe> for &ObjectValue {
|
719
|
-
fn from(val: &dyn PolarsObjectSafe) -> Self {
|
720
|
-
unsafe { &*(val as *const dyn PolarsObjectSafe as *const ObjectValue) }
|
721
|
-
}
|
722
|
-
}
|
723
|
-
|
724
|
-
// TODO remove
|
725
|
-
impl ObjectValue {
|
726
|
-
pub fn to_object(&self) -> Value {
|
727
|
-
self.inner
|
728
|
-
}
|
729
|
-
}
|
730
|
-
|
731
|
-
impl From<ObjectValue> for Value {
|
732
|
-
fn from(val: ObjectValue) -> Self {
|
733
|
-
val.inner
|
734
|
-
}
|
735
|
-
}
|
736
|
-
|
737
|
-
impl Default for ObjectValue {
|
738
|
-
fn default() -> Self {
|
739
|
-
ObjectValue { inner: *QNIL }
|
740
|
-
}
|
741
|
-
}
|
data/ext/polars/src/dataframe.rs
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
use magnus::{r_hash::ForEach, RArray, RHash, RString, Value};
|
2
|
+
use polars::frame::row::{rows_to_schema_supertypes, Row};
|
2
3
|
use polars::frame::NullStrategy;
|
3
4
|
use polars::io::avro::AvroCompression;
|
4
5
|
use polars::io::mmap::ReaderBytes;
|
@@ -15,8 +16,9 @@ use crate::apply::dataframe::{
|
|
15
16
|
};
|
16
17
|
use crate::conversion::*;
|
17
18
|
use crate::file::{get_file_like, get_mmap_bytes_reader};
|
19
|
+
use crate::rb_modules;
|
18
20
|
use crate::series::{to_rbseries_collection, to_series_collection};
|
19
|
-
use crate::{
|
21
|
+
use crate::{RbExpr, RbLazyFrame, RbPolarsErr, RbResult, RbSeries};
|
20
22
|
|
21
23
|
#[magnus::wrap(class = "Polars::RbDataFrame")]
|
22
24
|
pub struct RbDataFrame {
|
@@ -36,6 +38,45 @@ impl RbDataFrame {
|
|
36
38
|
}
|
37
39
|
}
|
38
40
|
|
41
|
+
fn finish_from_rows(
|
42
|
+
rows: Vec<Row>,
|
43
|
+
infer_schema_length: Option<usize>,
|
44
|
+
schema_overwrite: Option<Schema>,
|
45
|
+
) -> RbResult<Self> {
|
46
|
+
// object builder must be registered.
|
47
|
+
crate::object::register_object_builder();
|
48
|
+
|
49
|
+
let schema =
|
50
|
+
rows_to_schema_supertypes(&rows, infer_schema_length).map_err(RbPolarsErr::from)?;
|
51
|
+
// replace inferred nulls with boolean
|
52
|
+
let fields = schema.iter_fields().map(|mut fld| match fld.data_type() {
|
53
|
+
DataType::Null => {
|
54
|
+
fld.coerce(DataType::Boolean);
|
55
|
+
fld
|
56
|
+
}
|
57
|
+
_ => fld,
|
58
|
+
});
|
59
|
+
let mut schema = Schema::from(fields);
|
60
|
+
|
61
|
+
if let Some(schema_overwrite) = schema_overwrite {
|
62
|
+
for (i, (name, dtype)) in schema_overwrite.into_iter().enumerate() {
|
63
|
+
if let Some((name_, dtype_)) = schema.get_index_mut(i) {
|
64
|
+
*name_ = name;
|
65
|
+
|
66
|
+
// if user sets dtype unknown, we use the inferred datatype
|
67
|
+
if !matches!(dtype, DataType::Unknown) {
|
68
|
+
*dtype_ = dtype;
|
69
|
+
}
|
70
|
+
} else {
|
71
|
+
schema.with_column(name, dtype)
|
72
|
+
}
|
73
|
+
}
|
74
|
+
}
|
75
|
+
|
76
|
+
let df = DataFrame::from_rows_and_schema(&rows, &schema).map_err(RbPolarsErr::from)?;
|
77
|
+
Ok(df.into())
|
78
|
+
}
|
79
|
+
|
39
80
|
pub fn init(columns: RArray) -> RbResult<Self> {
|
40
81
|
let mut cols = Vec::new();
|
41
82
|
for i in columns.each() {
|
@@ -288,17 +329,45 @@ impl RbDataFrame {
|
|
288
329
|
}
|
289
330
|
|
290
331
|
pub fn read_hashes(
|
291
|
-
|
292
|
-
|
293
|
-
|
332
|
+
dicts: Value,
|
333
|
+
infer_schema_length: Option<usize>,
|
334
|
+
schema_overwrite: Option<Wrap<Schema>>,
|
294
335
|
) -> RbResult<Self> {
|
295
|
-
|
336
|
+
let (rows, mut names) = dicts_to_rows(&dicts, infer_schema_length.unwrap_or(50))?;
|
337
|
+
|
338
|
+
// ensure the new names are used
|
339
|
+
if let Some(schema) = &schema_overwrite {
|
340
|
+
for (new_name, name) in schema.0.iter_names().zip(names.iter_mut()) {
|
341
|
+
*name = new_name.clone();
|
342
|
+
}
|
343
|
+
}
|
344
|
+
let rbdf = Self::finish_from_rows(
|
345
|
+
rows,
|
346
|
+
infer_schema_length,
|
347
|
+
schema_overwrite.map(|wrap| wrap.0),
|
348
|
+
)?;
|
349
|
+
|
350
|
+
rbdf.df
|
351
|
+
.borrow_mut()
|
352
|
+
.get_columns_mut()
|
353
|
+
.iter_mut()
|
354
|
+
.zip(&names)
|
355
|
+
.for_each(|(s, name)| {
|
356
|
+
s.rename(name);
|
357
|
+
});
|
358
|
+
let length = names.len();
|
359
|
+
if names.into_iter().collect::<PlHashSet<_>>().len() != length {
|
360
|
+
let err = PolarsError::SchemaMisMatch("duplicate column names found".into());
|
361
|
+
Err(RbPolarsErr::from(err))?;
|
362
|
+
}
|
363
|
+
|
364
|
+
Ok(rbdf)
|
296
365
|
}
|
297
366
|
|
298
367
|
pub fn read_hash(data: RHash) -> RbResult<Self> {
|
299
368
|
let mut cols: Vec<Series> = Vec::new();
|
300
369
|
data.foreach(|name: String, values: Value| {
|
301
|
-
let obj: Value = series().funcall("new", (name, values))?;
|
370
|
+
let obj: Value = rb_modules::series().funcall("new", (name, values))?;
|
302
371
|
let rbseries = obj.funcall::<_, _, &RbSeries>("_s", ())?;
|
303
372
|
cols.push(rbseries.series.borrow().clone());
|
304
373
|
Ok(ForEach::Continue)
|
data/ext/polars/src/lib.rs
CHANGED
@@ -6,6 +6,9 @@ mod error;
|
|
6
6
|
mod file;
|
7
7
|
mod lazy;
|
8
8
|
mod list_construction;
|
9
|
+
mod object;
|
10
|
+
mod prelude;
|
11
|
+
pub(crate) mod rb_modules;
|
9
12
|
mod series;
|
10
13
|
mod set;
|
11
14
|
mod utils;
|
@@ -18,15 +21,13 @@ use file::get_file_like;
|
|
18
21
|
use lazy::dataframe::{RbLazyFrame, RbLazyGroupBy};
|
19
22
|
use lazy::dsl::{RbExpr, RbWhen, RbWhenThen};
|
20
23
|
use lazy::utils::rb_exprs_to_exprs;
|
21
|
-
use magnus::{
|
22
|
-
define_module, function, memoize, method, prelude::*, Error, RArray, RClass, RHash, RModule,
|
23
|
-
Value,
|
24
|
-
};
|
24
|
+
use magnus::{function, method, prelude::*, Error, RArray, RHash, Value};
|
25
25
|
use polars::datatypes::{DataType, TimeUnit, IDX_DTYPE};
|
26
26
|
use polars::error::PolarsResult;
|
27
27
|
use polars::frame::DataFrame;
|
28
28
|
use polars::functions::{diag_concat_df, hor_concat_df};
|
29
29
|
use polars::prelude::{ClosedWindow, Duration, DurationArgs, IntoSeries, TimeZone};
|
30
|
+
use rb_modules::polars;
|
30
31
|
use series::RbSeries;
|
31
32
|
|
32
33
|
#[cfg(target_os = "linux")]
|
@@ -45,17 +46,9 @@ static GLOBAL: MiMalloc = MiMalloc;
|
|
45
46
|
|
46
47
|
type RbResult<T> = Result<T, Error>;
|
47
48
|
|
48
|
-
fn module() -> RModule {
|
49
|
-
*memoize!(RModule: define_module("Polars").unwrap())
|
50
|
-
}
|
51
|
-
|
52
|
-
fn series() -> RClass {
|
53
|
-
*memoize!(RClass: module().define_class("Series", Default::default()).unwrap())
|
54
|
-
}
|
55
|
-
|
56
49
|
#[magnus::init]
|
57
50
|
fn init() -> RbResult<()> {
|
58
|
-
let module =
|
51
|
+
let module = polars();
|
59
52
|
module.define_singleton_method("_dtype_cols", function!(dtype_cols, 1))?;
|
60
53
|
module.define_singleton_method("_rb_duration", function!(rb_duration, 8))?;
|
61
54
|
module.define_singleton_method("_concat_df", function!(concat_df, 1))?;
|
data/ext/polars/src/object.rs
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
use std::any::Any;
|
2
|
+
use std::sync::Arc;
|
3
|
+
|
4
|
+
use polars_core::chunked_array::object::builder::ObjectChunkedBuilder;
|
5
|
+
use polars_core::chunked_array::object::registry;
|
6
|
+
use polars_core::chunked_array::object::registry::AnonymousObjectBuilder;
|
7
|
+
use polars_core::prelude::AnyValue;
|
8
|
+
|
9
|
+
use crate::prelude::ObjectValue;
|
10
|
+
use crate::Wrap;
|
11
|
+
|
12
|
+
// pub(crate) const OBJECT_NAME: &str = "object";
|
13
|
+
|
14
|
+
pub(crate) fn register_object_builder() {
|
15
|
+
if !registry::is_object_builder_registered() {
|
16
|
+
let object_builder = Box::new(|name: &str, capacity: usize| {
|
17
|
+
Box::new(ObjectChunkedBuilder::<ObjectValue>::new(name, capacity))
|
18
|
+
as Box<dyn AnonymousObjectBuilder>
|
19
|
+
});
|
20
|
+
|
21
|
+
let object_converter = Arc::new(|av: AnyValue| {
|
22
|
+
let object = ObjectValue {
|
23
|
+
inner: Wrap(av).into(),
|
24
|
+
};
|
25
|
+
Box::new(object) as Box<dyn Any>
|
26
|
+
});
|
27
|
+
|
28
|
+
registry::register_object_builder(object_builder, object_converter)
|
29
|
+
}
|
30
|
+
}
|
data/ext/polars/src/rb_modules.rs
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
use magnus::{define_module, memoize, Module, RClass, RModule};
|
2
|
+
|
3
|
+
pub(crate) fn polars() -> RModule {
|
4
|
+
*memoize!(RModule: define_module("Polars").unwrap())
|
5
|
+
}
|
6
|
+
|
7
|
+
pub(crate) fn series() -> RClass {
|
8
|
+
*memoize!(RClass: polars().define_class("Series", Default::default()).unwrap())
|
9
|
+
}
|
data/lib/polars/data_frame.rb
CHANGED
@@ -4746,7 +4746,14 @@ module Polars
|
|
4746
4746
|
end
|
4747
4747
|
|
4748
4748
|
# @private
|
4749
|
-
def self.
|
4749
|
+
def self.include_unknowns(schema, cols)
|
4750
|
+
cols.to_h { |col| [col, schema.fetch(col, Unknown)] }
|
4751
|
+
end
|
4752
|
+
|
4753
|
+
# @private
|
4754
|
+
def self._unpack_columns(columns, schema_overrides: nil, lookup_names: nil, n_expected: nil)
|
4755
|
+
raise Todo if schema_overrides
|
4756
|
+
|
4750
4757
|
if columns.is_a?(Hash)
|
4751
4758
|
columns = columns.to_a
|
4752
4759
|
end
|
@@ -4790,8 +4797,48 @@ module Polars
|
|
4790
4797
|
end
|
4791
4798
|
end
|
4792
4799
|
|
4800
|
+
def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil)
|
4801
|
+
rbdf_columns = rbdf.columns
|
4802
|
+
rbdf_dtypes = rbdf.dtypes
|
4803
|
+
columns, dtypes = _unpack_columns(
|
4804
|
+
(columns || rbdf_columns), schema_overrides: schema_overrides
|
4805
|
+
)
|
4806
|
+
column_subset = []
|
4807
|
+
if columns != rbdf_columns
|
4808
|
+
if columns.length < rbdf_columns.length && columns == rbdf_columns.first(columns.length)
|
4809
|
+
column_subset = columns
|
4810
|
+
else
|
4811
|
+
rbdf.set_column_names(columns)
|
4812
|
+
end
|
4813
|
+
end
|
4814
|
+
|
4815
|
+
column_casts = []
|
4816
|
+
columns.each do |col, i|
|
4817
|
+
if dtypes[col] == Categorical # != rbdf_dtypes[i]
|
4818
|
+
column_casts << Polars.col(col).cast(Categorical)._rbexpr
|
4819
|
+
elsif structs.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
|
4820
|
+
column_casts << Polars.col(col).cast(structs[col])._rbexpr
|
4821
|
+
elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
|
4822
|
+
column_casts << Polars.col(col).cast(dtypes[col])._rbexpr
|
4823
|
+
end
|
4824
|
+
end
|
4825
|
+
|
4826
|
+
if column_casts.any? || column_subset.any?
|
4827
|
+
rbdf = rbdf.lazy
|
4828
|
+
if column_casts.any?
|
4829
|
+
rbdf = rbdf.with_columns(column_casts)
|
4830
|
+
end
|
4831
|
+
if column_subset.any?
|
4832
|
+
rbdf = rbdf.select(column_subset.map { |col| Polars.col(col)._rbexpr })
|
4833
|
+
end
|
4834
|
+
rbdf = rbdf.collect
|
4835
|
+
end
|
4836
|
+
|
4837
|
+
rbdf
|
4838
|
+
end
|
4839
|
+
|
4793
4840
|
# @private
|
4794
|
-
def self.sequence_to_rbdf(data, columns: nil, orient: nil)
|
4841
|
+
def self.sequence_to_rbdf(data, columns: nil, orient: nil, infer_schema_length: 50)
|
4795
4842
|
if data.length == 0
|
4796
4843
|
return hash_to_rbdf({}, columns: columns)
|
4797
4844
|
end
|
@@ -4803,6 +4850,14 @@ module Polars
|
|
4803
4850
|
data.each do |s|
|
4804
4851
|
data_series << s._s
|
4805
4852
|
end
|
4853
|
+
elsif data[0].is_a?(Hash)
|
4854
|
+
column_names, dtypes = _unpack_columns(columns)
|
4855
|
+
schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
|
4856
|
+
rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema_overrides)
|
4857
|
+
if column_names
|
4858
|
+
rbdf = _post_apply_columns(rbdf, column_names)
|
4859
|
+
end
|
4860
|
+
return rbdf
|
4806
4861
|
elsif data[0].is_a?(Array)
|
4807
4862
|
if orient.nil? && !columns.nil?
|
4808
4863
|
orient = columns.length == data.length ? "col" : "row"
|
data/lib/polars/data_types.rb
CHANGED
@@ -3,44 +3,64 @@ module Polars
|
|
3
3
|
class DataType
|
4
4
|
end
|
5
5
|
|
6
|
+
# Base class for numeric data types.
|
7
|
+
class NumericType < DataType
|
8
|
+
end
|
9
|
+
|
10
|
+
# Base class for integral data types.
|
11
|
+
class IntegralType < NumericType
|
12
|
+
end
|
13
|
+
|
14
|
+
# Base class for fractional data types.
|
15
|
+
class FractionalType < NumericType
|
16
|
+
end
|
17
|
+
|
18
|
+
# Base class for temporal data types.
|
19
|
+
class TemporalType < DataType
|
20
|
+
end
|
21
|
+
|
22
|
+
# Base class for nested data types.
|
23
|
+
class NestedType < DataType
|
24
|
+
end
|
25
|
+
|
6
26
|
# 8-bit signed integer type.
|
7
|
-
class Int8 <
|
27
|
+
class Int8 < IntegralType
|
8
28
|
end
|
9
29
|
|
10
30
|
# 16-bit signed integer type.
|
11
|
-
class Int16 <
|
31
|
+
class Int16 < IntegralType
|
12
32
|
end
|
13
33
|
|
14
34
|
# 32-bit signed integer type.
|
15
|
-
class Int32 <
|
35
|
+
class Int32 < IntegralType
|
16
36
|
end
|
17
37
|
|
18
38
|
# 64-bit signed integer type.
|
19
|
-
class Int64 <
|
39
|
+
class Int64 < IntegralType
|
20
40
|
end
|
21
41
|
|
22
42
|
# 8-bit unsigned integer type.
|
23
|
-
class UInt8 <
|
43
|
+
class UInt8 < IntegralType
|
24
44
|
end
|
25
45
|
|
26
46
|
# 16-bit unsigned integer type.
|
27
|
-
class UInt16 <
|
47
|
+
class UInt16 < IntegralType
|
28
48
|
end
|
29
49
|
|
30
50
|
# 32-bit unsigned integer type.
|
31
|
-
class UInt32 <
|
51
|
+
class UInt32 < IntegralType
|
32
52
|
end
|
33
53
|
|
34
54
|
# 64-bit unsigned integer type.
|
35
|
-
class UInt64 <
|
55
|
+
class UInt64 < IntegralType
|
36
56
|
end
|
37
57
|
|
38
58
|
# 32-bit floating point type.
|
39
|
-
class Float32 <
|
59
|
+
class Float32 < FractionalType
|
40
60
|
end
|
41
61
|
|
42
62
|
# 64-bit floating point type.
|
43
|
-
class Float64 <
|
63
|
+
class Float64 < FractionalType
|
44
64
|
end
|
45
65
|
|
46
66
|
# Boolean type.
|
@@ -51,31 +71,19 @@ module Polars
|
|
51
71
|
class Utf8 < DataType
|
52
72
|
end
|
53
73
|
|
54
|
-
# Binary type.
|
55
|
-
class Binary < DataType
|
56
|
-
end
|
57
|
-
|
58
|
-
# Type representing Null / None values.
|
59
|
-
class Null < DataType
|
60
|
-
end
|
61
|
-
|
62
|
-
# Type representing Datatype values that could not be determined statically.
|
63
|
-
class Unknown < DataType
|
64
|
-
end
|
65
|
-
|
66
74
|
# Nested list/array type.
|
67
|
-
class List <
|
75
|
+
class List < NestedType
|
68
76
|
def initialize(inner)
|
69
77
|
@inner = Utils.rb_type_to_dtype(inner)
|
70
78
|
end
|
71
79
|
end
|
72
80
|
|
73
81
|
# Calendar date type.
|
74
|
-
class Date <
|
82
|
+
class Date < TemporalType
|
75
83
|
end
|
76
84
|
|
77
85
|
# Calendar date and time type.
|
78
|
-
class Datetime <
|
86
|
+
class Datetime < TemporalType
|
79
87
|
def initialize(time_unit = "us", time_zone = nil)
|
80
88
|
@tu = time_unit || "us"
|
81
89
|
@time_zone = time_zone
|
@@ -83,14 +91,14 @@ module Polars
|
|
83
91
|
end
|
84
92
|
|
85
93
|
# Time duration/delta type.
|
86
|
-
class Duration <
|
94
|
+
class Duration < TemporalType
|
87
95
|
def initialize(time_unit = "us")
|
88
96
|
@tu = time_unit
|
89
97
|
end
|
90
98
|
end
|
91
99
|
|
92
100
|
# Time of day type.
|
93
|
-
class Time <
|
101
|
+
class Time < TemporalType
|
94
102
|
end
|
95
103
|
|
96
104
|
# Type for wrapping arbitrary Ruby objects.
|
@@ -102,15 +110,24 @@ module Polars
|
|
102
110
|
end
|
103
111
|
|
104
112
|
# Definition of a single field within a `Struct` DataType.
|
105
|
-
class Field
|
113
|
+
class Field
|
114
|
+
attr_reader :name, :dtype
|
115
|
+
|
106
116
|
def initialize(name, dtype)
|
107
117
|
@name = name
|
108
118
|
@dtype = Utils.rb_type_to_dtype(dtype)
|
109
119
|
end
|
120
|
+
|
121
|
+
def inspect
|
122
|
+
class_name = self.class.name
|
123
|
+
"#{class_name}(#{@name}: #{@dtype})"
|
124
|
+
end
|
110
125
|
end
|
111
126
|
|
112
127
|
# Struct composite type.
|
113
|
-
class Struct <
|
128
|
+
class Struct < NestedType
|
129
|
+
attr_reader :fields
|
130
|
+
|
114
131
|
def initialize(fields)
|
115
132
|
if fields.is_a?(Hash)
|
116
133
|
@fields = fields.map { |n, d| Field.new(n, d) }
|
@@ -118,5 +135,26 @@ module Polars
|
|
118
135
|
@fields = fields
|
119
136
|
end
|
120
137
|
end
|
138
|
+
|
139
|
+
def inspect
|
140
|
+
class_name = self.class.name
|
141
|
+
"#{class_name}(#{@fields})"
|
142
|
+
end
|
143
|
+
|
144
|
+
def to_schema
|
145
|
+
@fields.to_h { |f| [f.name, f.dtype] }
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
# Binary type.
|
150
|
+
class Binary < DataType
|
151
|
+
end
|
152
|
+
|
153
|
+
# Type representing Null / None values.
|
154
|
+
class Null < DataType
|
155
|
+
end
|
156
|
+
|
157
|
+
# Type representing Datatype values that could not be determined statically.
|
158
|
+
class Unknown < DataType
|
121
159
|
end
|
122
160
|
end
|
data/lib/polars/series.rb
CHANGED
@@ -3667,6 +3667,11 @@ module Polars
|
|
3667
3667
|
rb_temporal_types << ::Time if defined?(::Time)
|
3668
3668
|
|
3669
3669
|
value = _get_first_non_none(values)
|
3670
|
+
if !value.nil?
|
3671
|
+
if value.is_a?(Hash)
|
3672
|
+
return DataFrame.new(values).to_struct(name)._s
|
3673
|
+
end
|
3674
|
+
end
|
3670
3675
|
|
3671
3676
|
if !dtype.nil? && Utils.is_polars_dtype(dtype) && ruby_dtype.nil?
|
3672
3677
|
constructor = polars_type_to_constructor(dtype)
|
data/lib/polars/utils.rb
CHANGED
@@ -160,11 +160,11 @@ module Polars
|
|
160
160
|
|
161
161
|
def self.scale_bytes(sz, to:)
|
162
162
|
scaling_factor = {
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
163
|
+
"b" => 1,
|
164
|
+
"k" => 1024,
|
165
|
+
"m" => 1024 ** 2,
|
166
|
+
"g" => 1024 ** 3,
|
167
|
+
"t" => 1024 ** 4
|
168
168
|
}[to[0]]
|
169
169
|
if scaling_factor > 1
|
170
170
|
sz / scaling_factor.to_f
|
data/lib/polars/version.rb
CHANGED
data/lib/polars-df.rb
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
require_relative "polars"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: polars-df
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-01-
|
11
|
+
date: 2023-01-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -55,6 +55,9 @@ files:
|
|
55
55
|
- ext/polars/src/lazy/utils.rs
|
56
56
|
- ext/polars/src/lib.rs
|
57
57
|
- ext/polars/src/list_construction.rs
|
58
|
+
- ext/polars/src/object.rs
|
59
|
+
- ext/polars/src/prelude.rs
|
60
|
+
- ext/polars/src/rb_modules.rs
|
58
61
|
- ext/polars/src/series.rs
|
59
62
|
- ext/polars/src/set.rs
|
60
63
|
- ext/polars/src/utils.rs
|