polars-df 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Cargo.lock +1 -1
- data/README.md +4 -2
- data/ext/polars/Cargo.toml +1 -1
- data/ext/polars/src/conversion.rs +152 -99
- data/ext/polars/src/dataframe.rs +75 -6
- data/ext/polars/src/lib.rs +6 -13
- data/ext/polars/src/object.rs +30 -0
- data/ext/polars/src/prelude.rs +3 -0
- data/ext/polars/src/rb_modules.rs +9 -0
- data/lib/polars/data_frame.rb +57 -2
- data/lib/polars/data_types.rb +67 -29
- data/lib/polars/series.rb +5 -0
- data/lib/polars/utils.rb +5 -5
- data/lib/polars/version.rb +1 -1
- data/lib/polars-df.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a07e6dd4bee3bef4020d7818a060c6f28aaedb7264b206e35a485e575cd8a695
|
4
|
+
data.tar.gz: c586e0ec898aab7642f49d49b54c614121b9cb1f748eb7c98d76611ae2ad56a2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ff035a9b60966342ca16dc5eea3b0abd0c4a08f5db8a0fc3c4d6ef206dd20ad56becbcd3a7ecdb5c328de6f9a52d53531eeb44c75078170d451bc39197553570
|
7
|
+
data.tar.gz: 48c7334a56339fb0feda046c415839ece39e2c92cf807ed422026d00787d687bbda2ba1198d612434379a2de92830841f0798109920f76d730205e612ab72cb1
|
data/CHANGELOG.md
CHANGED
data/Cargo.lock
CHANGED
data/README.md
CHANGED
@@ -56,6 +56,8 @@ From Active Record
|
|
56
56
|
|
57
57
|
```ruby
|
58
58
|
Polars.read_sql(User.all)
|
59
|
+
# or
|
60
|
+
Polars.read_sql("SELECT * FROM users")
|
59
61
|
```
|
60
62
|
|
61
63
|
From a hash
|
@@ -287,13 +289,13 @@ CSV
|
|
287
289
|
```ruby
|
288
290
|
df.to_csv
|
289
291
|
# or
|
290
|
-
df.write_csv("
|
292
|
+
df.write_csv("file.csv")
|
291
293
|
```
|
292
294
|
|
293
295
|
Parquet
|
294
296
|
|
295
297
|
```ruby
|
296
|
-
df.write_parquet("
|
298
|
+
df.write_parquet("file.parquet")
|
297
299
|
```
|
298
300
|
|
299
301
|
## Types
|
data/ext/polars/Cargo.toml
CHANGED
@@ -144,7 +144,7 @@ impl From<Wrap<AnyValue<'_>>> for Value {
|
|
144
144
|
|
145
145
|
impl From<Wrap<DataType>> for Value {
|
146
146
|
fn from(w: Wrap<DataType>) -> Self {
|
147
|
-
let pl = crate::
|
147
|
+
let pl = crate::rb_modules::polars();
|
148
148
|
|
149
149
|
match &w.0 {
|
150
150
|
DataType::Int8 => pl.const_get::<_, Value>("Int8").unwrap(),
|
@@ -278,6 +278,22 @@ impl<'s> TryConvert for Wrap<AnyValue<'s>> {
|
|
278
278
|
Ok(AnyValue::Int64(v).into())
|
279
279
|
} else if let Ok(v) = ob.try_convert::<f64>() {
|
280
280
|
Ok(AnyValue::Float64(v).into())
|
281
|
+
} else if ob.is_nil() {
|
282
|
+
Ok(AnyValue::Null.into())
|
283
|
+
} else if ob.is_kind_of(class::hash()) {
|
284
|
+
let dict = ob.try_convert::<RHash>().unwrap();
|
285
|
+
let len = dict.len();
|
286
|
+
let mut keys = Vec::with_capacity(len);
|
287
|
+
let mut vals = Vec::with_capacity(len);
|
288
|
+
dict.foreach(|k: Value, v: Value| {
|
289
|
+
let key = k.try_convert::<String>()?;
|
290
|
+
let val = v.try_convert::<Wrap<AnyValue>>()?.0;
|
291
|
+
let dtype = DataType::from(&val);
|
292
|
+
keys.push(Field::new(&key, dtype));
|
293
|
+
vals.push(val);
|
294
|
+
Ok(ForEach::Continue)
|
295
|
+
})?;
|
296
|
+
Ok(Wrap(AnyValue::StructOwned(Box::new((vals, keys)))))
|
281
297
|
} else {
|
282
298
|
Err(RbPolarsErr::other(format!(
|
283
299
|
"object type not supported {:?}",
|
@@ -287,6 +303,141 @@ impl<'s> TryConvert for Wrap<AnyValue<'s>> {
|
|
287
303
|
}
|
288
304
|
}
|
289
305
|
|
306
|
+
impl<'s> TryConvert for Wrap<Row<'s>> {
|
307
|
+
fn try_convert(ob: Value) -> RbResult<Self> {
|
308
|
+
let mut vals: Vec<Wrap<AnyValue<'s>>> = Vec::new();
|
309
|
+
for item in ob.try_convert::<RArray>()?.each() {
|
310
|
+
vals.push(item?.try_convert::<Wrap<AnyValue<'s>>>()?);
|
311
|
+
}
|
312
|
+
let vals: Vec<AnyValue> = unsafe { std::mem::transmute(vals) };
|
313
|
+
Ok(Wrap(Row(vals)))
|
314
|
+
}
|
315
|
+
}
|
316
|
+
|
317
|
+
impl TryConvert for Wrap<Schema> {
|
318
|
+
fn try_convert(ob: Value) -> RbResult<Self> {
|
319
|
+
let dict = ob.try_convert::<RHash>()?;
|
320
|
+
|
321
|
+
let mut schema = Vec::new();
|
322
|
+
dict.foreach(|key: String, val: Wrap<DataType>| {
|
323
|
+
schema.push(Field::new(&key, val.0));
|
324
|
+
Ok(ForEach::Continue)
|
325
|
+
})
|
326
|
+
.unwrap();
|
327
|
+
|
328
|
+
Ok(Wrap(schema.into_iter().into()))
|
329
|
+
}
|
330
|
+
}
|
331
|
+
|
332
|
+
#[derive(Clone, Debug)]
|
333
|
+
pub struct ObjectValue {
|
334
|
+
pub inner: Value,
|
335
|
+
}
|
336
|
+
|
337
|
+
impl Hash for ObjectValue {
|
338
|
+
fn hash<H: Hasher>(&self, state: &mut H) {
|
339
|
+
let h = self
|
340
|
+
.inner
|
341
|
+
.funcall::<_, _, isize>("hash", ())
|
342
|
+
.expect("should be hashable");
|
343
|
+
state.write_isize(h)
|
344
|
+
}
|
345
|
+
}
|
346
|
+
|
347
|
+
impl Eq for ObjectValue {}
|
348
|
+
|
349
|
+
impl PartialEq for ObjectValue {
|
350
|
+
fn eq(&self, other: &Self) -> bool {
|
351
|
+
self.inner.eql(&other.inner).unwrap_or(false)
|
352
|
+
}
|
353
|
+
}
|
354
|
+
|
355
|
+
impl Display for ObjectValue {
|
356
|
+
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
357
|
+
write!(f, "{}", self.inner)
|
358
|
+
}
|
359
|
+
}
|
360
|
+
|
361
|
+
impl PolarsObject for ObjectValue {
|
362
|
+
fn type_name() -> &'static str {
|
363
|
+
"object"
|
364
|
+
}
|
365
|
+
}
|
366
|
+
|
367
|
+
impl From<Value> for ObjectValue {
|
368
|
+
fn from(v: Value) -> Self {
|
369
|
+
Self { inner: v }
|
370
|
+
}
|
371
|
+
}
|
372
|
+
|
373
|
+
impl TryConvert for ObjectValue {
|
374
|
+
fn try_convert(ob: Value) -> RbResult<Self> {
|
375
|
+
Ok(ObjectValue { inner: ob })
|
376
|
+
}
|
377
|
+
}
|
378
|
+
|
379
|
+
impl From<&dyn PolarsObjectSafe> for &ObjectValue {
|
380
|
+
fn from(val: &dyn PolarsObjectSafe) -> Self {
|
381
|
+
unsafe { &*(val as *const dyn PolarsObjectSafe as *const ObjectValue) }
|
382
|
+
}
|
383
|
+
}
|
384
|
+
|
385
|
+
// TODO remove
|
386
|
+
impl ObjectValue {
|
387
|
+
pub fn to_object(&self) -> Value {
|
388
|
+
self.inner
|
389
|
+
}
|
390
|
+
}
|
391
|
+
|
392
|
+
impl From<ObjectValue> for Value {
|
393
|
+
fn from(val: ObjectValue) -> Self {
|
394
|
+
val.inner
|
395
|
+
}
|
396
|
+
}
|
397
|
+
|
398
|
+
impl Default for ObjectValue {
|
399
|
+
fn default() -> Self {
|
400
|
+
ObjectValue { inner: *QNIL }
|
401
|
+
}
|
402
|
+
}
|
403
|
+
|
404
|
+
pub(crate) fn dicts_to_rows(
|
405
|
+
records: &Value,
|
406
|
+
infer_schema_len: usize,
|
407
|
+
) -> RbResult<(Vec<Row>, Vec<String>)> {
|
408
|
+
let (dicts, len) = get_rbseq(*records)?;
|
409
|
+
|
410
|
+
let mut key_names = PlIndexSet::new();
|
411
|
+
for d in dicts.each().take(infer_schema_len) {
|
412
|
+
let d = d?;
|
413
|
+
let d = d.try_convert::<RHash>()?;
|
414
|
+
|
415
|
+
d.foreach(|name: String, _value: Value| {
|
416
|
+
key_names.insert(name);
|
417
|
+
Ok(ForEach::Continue)
|
418
|
+
})?;
|
419
|
+
}
|
420
|
+
|
421
|
+
let mut rows = Vec::with_capacity(len);
|
422
|
+
|
423
|
+
for d in dicts.each() {
|
424
|
+
let d = d?;
|
425
|
+
let d = d.try_convert::<RHash>()?;
|
426
|
+
|
427
|
+
let mut row = Vec::with_capacity(key_names.len());
|
428
|
+
|
429
|
+
for k in key_names.iter() {
|
430
|
+
let val = match d.get(k.clone()) {
|
431
|
+
None => AnyValue::Null,
|
432
|
+
Some(val) => val.try_convert::<Wrap<AnyValue>>()?.0,
|
433
|
+
};
|
434
|
+
row.push(val)
|
435
|
+
}
|
436
|
+
rows.push(Row(row))
|
437
|
+
}
|
438
|
+
Ok((rows, key_names.into_iter().collect()))
|
439
|
+
}
|
440
|
+
|
290
441
|
impl TryConvert for Wrap<AsofStrategy> {
|
291
442
|
fn try_convert(ob: Value) -> RbResult<Self> {
|
292
443
|
let parsed = match ob.try_convert::<String>()?.as_str() {
|
@@ -641,101 +792,3 @@ pub fn parse_parquet_compression(
|
|
641
792
|
};
|
642
793
|
Ok(parsed)
|
643
794
|
}
|
644
|
-
|
645
|
-
impl<'s> TryConvert for Wrap<Row<'s>> {
|
646
|
-
fn try_convert(ob: Value) -> RbResult<Self> {
|
647
|
-
let mut vals: Vec<Wrap<AnyValue<'s>>> = Vec::new();
|
648
|
-
for item in ob.try_convert::<RArray>()?.each() {
|
649
|
-
vals.push(item?.try_convert::<Wrap<AnyValue<'s>>>()?);
|
650
|
-
}
|
651
|
-
let vals: Vec<AnyValue> = unsafe { std::mem::transmute(vals) };
|
652
|
-
Ok(Wrap(Row(vals)))
|
653
|
-
}
|
654
|
-
}
|
655
|
-
|
656
|
-
impl TryConvert for Wrap<Schema> {
|
657
|
-
fn try_convert(ob: Value) -> RbResult<Self> {
|
658
|
-
let dict = ob.try_convert::<RHash>()?;
|
659
|
-
|
660
|
-
let mut schema = Vec::new();
|
661
|
-
dict.foreach(|key: String, val: Wrap<DataType>| {
|
662
|
-
schema.push(Field::new(&key, val.0));
|
663
|
-
Ok(ForEach::Continue)
|
664
|
-
})
|
665
|
-
.unwrap();
|
666
|
-
|
667
|
-
Ok(Wrap(schema.into_iter().into()))
|
668
|
-
}
|
669
|
-
}
|
670
|
-
|
671
|
-
#[derive(Clone, Debug)]
|
672
|
-
pub struct ObjectValue {
|
673
|
-
pub inner: Value,
|
674
|
-
}
|
675
|
-
|
676
|
-
impl Hash for ObjectValue {
|
677
|
-
fn hash<H: Hasher>(&self, state: &mut H) {
|
678
|
-
let h = self
|
679
|
-
.inner
|
680
|
-
.funcall::<_, _, isize>("hash", ())
|
681
|
-
.expect("should be hashable");
|
682
|
-
state.write_isize(h)
|
683
|
-
}
|
684
|
-
}
|
685
|
-
|
686
|
-
impl Eq for ObjectValue {}
|
687
|
-
|
688
|
-
impl PartialEq for ObjectValue {
|
689
|
-
fn eq(&self, other: &Self) -> bool {
|
690
|
-
self.inner.eql(&other.inner).unwrap_or(false)
|
691
|
-
}
|
692
|
-
}
|
693
|
-
|
694
|
-
impl Display for ObjectValue {
|
695
|
-
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
696
|
-
write!(f, "{}", self.inner)
|
697
|
-
}
|
698
|
-
}
|
699
|
-
|
700
|
-
impl PolarsObject for ObjectValue {
|
701
|
-
fn type_name() -> &'static str {
|
702
|
-
"object"
|
703
|
-
}
|
704
|
-
}
|
705
|
-
|
706
|
-
impl From<Value> for ObjectValue {
|
707
|
-
fn from(v: Value) -> Self {
|
708
|
-
Self { inner: v }
|
709
|
-
}
|
710
|
-
}
|
711
|
-
|
712
|
-
impl TryConvert for ObjectValue {
|
713
|
-
fn try_convert(ob: Value) -> RbResult<Self> {
|
714
|
-
Ok(ObjectValue { inner: ob })
|
715
|
-
}
|
716
|
-
}
|
717
|
-
|
718
|
-
impl From<&dyn PolarsObjectSafe> for &ObjectValue {
|
719
|
-
fn from(val: &dyn PolarsObjectSafe) -> Self {
|
720
|
-
unsafe { &*(val as *const dyn PolarsObjectSafe as *const ObjectValue) }
|
721
|
-
}
|
722
|
-
}
|
723
|
-
|
724
|
-
// TODO remove
|
725
|
-
impl ObjectValue {
|
726
|
-
pub fn to_object(&self) -> Value {
|
727
|
-
self.inner
|
728
|
-
}
|
729
|
-
}
|
730
|
-
|
731
|
-
impl From<ObjectValue> for Value {
|
732
|
-
fn from(val: ObjectValue) -> Self {
|
733
|
-
val.inner
|
734
|
-
}
|
735
|
-
}
|
736
|
-
|
737
|
-
impl Default for ObjectValue {
|
738
|
-
fn default() -> Self {
|
739
|
-
ObjectValue { inner: *QNIL }
|
740
|
-
}
|
741
|
-
}
|
data/ext/polars/src/dataframe.rs
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
use magnus::{r_hash::ForEach, RArray, RHash, RString, Value};
|
2
|
+
use polars::frame::row::{rows_to_schema_supertypes, Row};
|
2
3
|
use polars::frame::NullStrategy;
|
3
4
|
use polars::io::avro::AvroCompression;
|
4
5
|
use polars::io::mmap::ReaderBytes;
|
@@ -15,8 +16,9 @@ use crate::apply::dataframe::{
|
|
15
16
|
};
|
16
17
|
use crate::conversion::*;
|
17
18
|
use crate::file::{get_file_like, get_mmap_bytes_reader};
|
19
|
+
use crate::rb_modules;
|
18
20
|
use crate::series::{to_rbseries_collection, to_series_collection};
|
19
|
-
use crate::{
|
21
|
+
use crate::{RbExpr, RbLazyFrame, RbPolarsErr, RbResult, RbSeries};
|
20
22
|
|
21
23
|
#[magnus::wrap(class = "Polars::RbDataFrame")]
|
22
24
|
pub struct RbDataFrame {
|
@@ -36,6 +38,45 @@ impl RbDataFrame {
|
|
36
38
|
}
|
37
39
|
}
|
38
40
|
|
41
|
+
fn finish_from_rows(
|
42
|
+
rows: Vec<Row>,
|
43
|
+
infer_schema_length: Option<usize>,
|
44
|
+
schema_overwrite: Option<Schema>,
|
45
|
+
) -> RbResult<Self> {
|
46
|
+
// object builder must be registered.
|
47
|
+
crate::object::register_object_builder();
|
48
|
+
|
49
|
+
let schema =
|
50
|
+
rows_to_schema_supertypes(&rows, infer_schema_length).map_err(RbPolarsErr::from)?;
|
51
|
+
// replace inferred nulls with boolean
|
52
|
+
let fields = schema.iter_fields().map(|mut fld| match fld.data_type() {
|
53
|
+
DataType::Null => {
|
54
|
+
fld.coerce(DataType::Boolean);
|
55
|
+
fld
|
56
|
+
}
|
57
|
+
_ => fld,
|
58
|
+
});
|
59
|
+
let mut schema = Schema::from(fields);
|
60
|
+
|
61
|
+
if let Some(schema_overwrite) = schema_overwrite {
|
62
|
+
for (i, (name, dtype)) in schema_overwrite.into_iter().enumerate() {
|
63
|
+
if let Some((name_, dtype_)) = schema.get_index_mut(i) {
|
64
|
+
*name_ = name;
|
65
|
+
|
66
|
+
// if user sets dtype unknown, we use the inferred datatype
|
67
|
+
if !matches!(dtype, DataType::Unknown) {
|
68
|
+
*dtype_ = dtype;
|
69
|
+
}
|
70
|
+
} else {
|
71
|
+
schema.with_column(name, dtype)
|
72
|
+
}
|
73
|
+
}
|
74
|
+
}
|
75
|
+
|
76
|
+
let df = DataFrame::from_rows_and_schema(&rows, &schema).map_err(RbPolarsErr::from)?;
|
77
|
+
Ok(df.into())
|
78
|
+
}
|
79
|
+
|
39
80
|
pub fn init(columns: RArray) -> RbResult<Self> {
|
40
81
|
let mut cols = Vec::new();
|
41
82
|
for i in columns.each() {
|
@@ -288,17 +329,45 @@ impl RbDataFrame {
|
|
288
329
|
}
|
289
330
|
|
290
331
|
pub fn read_hashes(
|
291
|
-
|
292
|
-
|
293
|
-
|
332
|
+
dicts: Value,
|
333
|
+
infer_schema_length: Option<usize>,
|
334
|
+
schema_overwrite: Option<Wrap<Schema>>,
|
294
335
|
) -> RbResult<Self> {
|
295
|
-
|
336
|
+
let (rows, mut names) = dicts_to_rows(&dicts, infer_schema_length.unwrap_or(50))?;
|
337
|
+
|
338
|
+
// ensure the new names are used
|
339
|
+
if let Some(schema) = &schema_overwrite {
|
340
|
+
for (new_name, name) in schema.0.iter_names().zip(names.iter_mut()) {
|
341
|
+
*name = new_name.clone();
|
342
|
+
}
|
343
|
+
}
|
344
|
+
let rbdf = Self::finish_from_rows(
|
345
|
+
rows,
|
346
|
+
infer_schema_length,
|
347
|
+
schema_overwrite.map(|wrap| wrap.0),
|
348
|
+
)?;
|
349
|
+
|
350
|
+
rbdf.df
|
351
|
+
.borrow_mut()
|
352
|
+
.get_columns_mut()
|
353
|
+
.iter_mut()
|
354
|
+
.zip(&names)
|
355
|
+
.for_each(|(s, name)| {
|
356
|
+
s.rename(name);
|
357
|
+
});
|
358
|
+
let length = names.len();
|
359
|
+
if names.into_iter().collect::<PlHashSet<_>>().len() != length {
|
360
|
+
let err = PolarsError::SchemaMisMatch("duplicate column names found".into());
|
361
|
+
Err(RbPolarsErr::from(err))?;
|
362
|
+
}
|
363
|
+
|
364
|
+
Ok(rbdf)
|
296
365
|
}
|
297
366
|
|
298
367
|
pub fn read_hash(data: RHash) -> RbResult<Self> {
|
299
368
|
let mut cols: Vec<Series> = Vec::new();
|
300
369
|
data.foreach(|name: String, values: Value| {
|
301
|
-
let obj: Value = series().funcall("new", (name, values))?;
|
370
|
+
let obj: Value = rb_modules::series().funcall("new", (name, values))?;
|
302
371
|
let rbseries = obj.funcall::<_, _, &RbSeries>("_s", ())?;
|
303
372
|
cols.push(rbseries.series.borrow().clone());
|
304
373
|
Ok(ForEach::Continue)
|
data/ext/polars/src/lib.rs
CHANGED
@@ -6,6 +6,9 @@ mod error;
|
|
6
6
|
mod file;
|
7
7
|
mod lazy;
|
8
8
|
mod list_construction;
|
9
|
+
mod object;
|
10
|
+
mod prelude;
|
11
|
+
pub(crate) mod rb_modules;
|
9
12
|
mod series;
|
10
13
|
mod set;
|
11
14
|
mod utils;
|
@@ -18,15 +21,13 @@ use file::get_file_like;
|
|
18
21
|
use lazy::dataframe::{RbLazyFrame, RbLazyGroupBy};
|
19
22
|
use lazy::dsl::{RbExpr, RbWhen, RbWhenThen};
|
20
23
|
use lazy::utils::rb_exprs_to_exprs;
|
21
|
-
use magnus::{
|
22
|
-
define_module, function, memoize, method, prelude::*, Error, RArray, RClass, RHash, RModule,
|
23
|
-
Value,
|
24
|
-
};
|
24
|
+
use magnus::{function, method, prelude::*, Error, RArray, RHash, Value};
|
25
25
|
use polars::datatypes::{DataType, TimeUnit, IDX_DTYPE};
|
26
26
|
use polars::error::PolarsResult;
|
27
27
|
use polars::frame::DataFrame;
|
28
28
|
use polars::functions::{diag_concat_df, hor_concat_df};
|
29
29
|
use polars::prelude::{ClosedWindow, Duration, DurationArgs, IntoSeries, TimeZone};
|
30
|
+
use rb_modules::polars;
|
30
31
|
use series::RbSeries;
|
31
32
|
|
32
33
|
#[cfg(target_os = "linux")]
|
@@ -45,17 +46,9 @@ static GLOBAL: MiMalloc = MiMalloc;
|
|
45
46
|
|
46
47
|
type RbResult<T> = Result<T, Error>;
|
47
48
|
|
48
|
-
fn module() -> RModule {
|
49
|
-
*memoize!(RModule: define_module("Polars").unwrap())
|
50
|
-
}
|
51
|
-
|
52
|
-
fn series() -> RClass {
|
53
|
-
*memoize!(RClass: module().define_class("Series", Default::default()).unwrap())
|
54
|
-
}
|
55
|
-
|
56
49
|
#[magnus::init]
|
57
50
|
fn init() -> RbResult<()> {
|
58
|
-
let module =
|
51
|
+
let module = polars();
|
59
52
|
module.define_singleton_method("_dtype_cols", function!(dtype_cols, 1))?;
|
60
53
|
module.define_singleton_method("_rb_duration", function!(rb_duration, 8))?;
|
61
54
|
module.define_singleton_method("_concat_df", function!(concat_df, 1))?;
|
@@ -0,0 +1,30 @@
|
|
1
|
+
use std::any::Any;
|
2
|
+
use std::sync::Arc;
|
3
|
+
|
4
|
+
use polars_core::chunked_array::object::builder::ObjectChunkedBuilder;
|
5
|
+
use polars_core::chunked_array::object::registry;
|
6
|
+
use polars_core::chunked_array::object::registry::AnonymousObjectBuilder;
|
7
|
+
use polars_core::prelude::AnyValue;
|
8
|
+
|
9
|
+
use crate::prelude::ObjectValue;
|
10
|
+
use crate::Wrap;
|
11
|
+
|
12
|
+
// pub(crate) const OBJECT_NAME: &str = "object";
|
13
|
+
|
14
|
+
pub(crate) fn register_object_builder() {
|
15
|
+
if !registry::is_object_builder_registered() {
|
16
|
+
let object_builder = Box::new(|name: &str, capacity: usize| {
|
17
|
+
Box::new(ObjectChunkedBuilder::<ObjectValue>::new(name, capacity))
|
18
|
+
as Box<dyn AnonymousObjectBuilder>
|
19
|
+
});
|
20
|
+
|
21
|
+
let object_converter = Arc::new(|av: AnyValue| {
|
22
|
+
let object = ObjectValue {
|
23
|
+
inner: Wrap(av).into(),
|
24
|
+
};
|
25
|
+
Box::new(object) as Box<dyn Any>
|
26
|
+
});
|
27
|
+
|
28
|
+
registry::register_object_builder(object_builder, object_converter)
|
29
|
+
}
|
30
|
+
}
|
@@ -0,0 +1,9 @@
|
|
1
|
+
use magnus::{define_module, memoize, Module, RClass, RModule};
|
2
|
+
|
3
|
+
pub(crate) fn polars() -> RModule {
|
4
|
+
*memoize!(RModule: define_module("Polars").unwrap())
|
5
|
+
}
|
6
|
+
|
7
|
+
pub(crate) fn series() -> RClass {
|
8
|
+
*memoize!(RClass: polars().define_class("Series", Default::default()).unwrap())
|
9
|
+
}
|
data/lib/polars/data_frame.rb
CHANGED
@@ -4746,7 +4746,14 @@ module Polars
|
|
4746
4746
|
end
|
4747
4747
|
|
4748
4748
|
# @private
|
4749
|
-
def self.
|
4749
|
+
def self.include_unknowns(schema, cols)
|
4750
|
+
cols.to_h { |col| [col, schema.fetch(col, Unknown)] }
|
4751
|
+
end
|
4752
|
+
|
4753
|
+
# @private
|
4754
|
+
def self._unpack_columns(columns, schema_overrides: nil, lookup_names: nil, n_expected: nil)
|
4755
|
+
raise Todo if schema_overrides
|
4756
|
+
|
4750
4757
|
if columns.is_a?(Hash)
|
4751
4758
|
columns = columns.to_a
|
4752
4759
|
end
|
@@ -4790,8 +4797,48 @@ module Polars
|
|
4790
4797
|
end
|
4791
4798
|
end
|
4792
4799
|
|
4800
|
+
def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil)
|
4801
|
+
rbdf_columns = rbdf.columns
|
4802
|
+
rbdf_dtypes = rbdf.dtypes
|
4803
|
+
columns, dtypes = _unpack_columns(
|
4804
|
+
(columns || rbdf_columns), schema_overrides: schema_overrides
|
4805
|
+
)
|
4806
|
+
column_subset = []
|
4807
|
+
if columns != rbdf_columns
|
4808
|
+
if columns.length < rbdf_columns.length && columns == rbdf_columns.first(columns.length)
|
4809
|
+
column_subset = columns
|
4810
|
+
else
|
4811
|
+
rbdf.set_column_names(columns)
|
4812
|
+
end
|
4813
|
+
end
|
4814
|
+
|
4815
|
+
column_casts = []
|
4816
|
+
columns.each do |col, i|
|
4817
|
+
if dtypes[col] == Categorical # != rbdf_dtypes[i]
|
4818
|
+
column_casts << Polars.col(col).cast(Categorical)._rbexpr
|
4819
|
+
elsif structs.any? && structs.include?(col) && structs[col] != rbdf_dtypes[i]
|
4820
|
+
column_casts << Polars.col(col).cast(structs[col])._rbexpr
|
4821
|
+
elsif dtypes.include?(col) && dtypes[col] != rbdf_dtypes[i]
|
4822
|
+
column_casts << Polars.col(col).cast(dtypes[col])._rbexpr
|
4823
|
+
end
|
4824
|
+
end
|
4825
|
+
|
4826
|
+
if column_casts.any? || column_subset.any?
|
4827
|
+
rbdf = rbdf.lazy
|
4828
|
+
if column_casts.any?
|
4829
|
+
rbdf = rbdf.with_columns(column_casts)
|
4830
|
+
end
|
4831
|
+
if column_subset.any?
|
4832
|
+
rbdf = rbdf.select(column_subset.map { |col| Polars.col(col)._rbexpr })
|
4833
|
+
end
|
4834
|
+
rbdf = rbdf.collect
|
4835
|
+
end
|
4836
|
+
|
4837
|
+
rbdf
|
4838
|
+
end
|
4839
|
+
|
4793
4840
|
# @private
|
4794
|
-
def self.sequence_to_rbdf(data, columns: nil, orient: nil)
|
4841
|
+
def self.sequence_to_rbdf(data, columns: nil, orient: nil, infer_schema_length: 50)
|
4795
4842
|
if data.length == 0
|
4796
4843
|
return hash_to_rbdf({}, columns: columns)
|
4797
4844
|
end
|
@@ -4803,6 +4850,14 @@ module Polars
|
|
4803
4850
|
data.each do |s|
|
4804
4851
|
data_series << s._s
|
4805
4852
|
end
|
4853
|
+
elsif data[0].is_a?(Hash)
|
4854
|
+
column_names, dtypes = _unpack_columns(columns)
|
4855
|
+
schema_overrides = dtypes ? include_unknowns(dtypes, column_names) : nil
|
4856
|
+
rbdf = RbDataFrame.read_hashes(data, infer_schema_length, schema_overrides)
|
4857
|
+
if column_names
|
4858
|
+
rbdf = _post_apply_columns(rbdf, column_names)
|
4859
|
+
end
|
4860
|
+
return rbdf
|
4806
4861
|
elsif data[0].is_a?(Array)
|
4807
4862
|
if orient.nil? && !columns.nil?
|
4808
4863
|
orient = columns.length == data.length ? "col" : "row"
|
data/lib/polars/data_types.rb
CHANGED
@@ -3,44 +3,64 @@ module Polars
|
|
3
3
|
class DataType
|
4
4
|
end
|
5
5
|
|
6
|
+
# Base class for numeric data types.
|
7
|
+
class NumericType < DataType
|
8
|
+
end
|
9
|
+
|
10
|
+
# Base class for integral data types.
|
11
|
+
class IntegralType < NumericType
|
12
|
+
end
|
13
|
+
|
14
|
+
# Base class for fractional data types.
|
15
|
+
class FractionalType < NumericType
|
16
|
+
end
|
17
|
+
|
18
|
+
# Base class for temporal data types.
|
19
|
+
class TemporalType < DataType
|
20
|
+
end
|
21
|
+
|
22
|
+
# Base class for nested data types.
|
23
|
+
class NestedType < DataType
|
24
|
+
end
|
25
|
+
|
6
26
|
# 8-bit signed integer type.
|
7
|
-
class Int8 <
|
27
|
+
class Int8 < IntegralType
|
8
28
|
end
|
9
29
|
|
10
30
|
# 16-bit signed integer type.
|
11
|
-
class Int16 <
|
31
|
+
class Int16 < IntegralType
|
12
32
|
end
|
13
33
|
|
14
34
|
# 32-bit signed integer type.
|
15
|
-
class Int32 <
|
35
|
+
class Int32 < IntegralType
|
16
36
|
end
|
17
37
|
|
18
38
|
# 64-bit signed integer type.
|
19
|
-
class Int64 <
|
39
|
+
class Int64 < IntegralType
|
20
40
|
end
|
21
41
|
|
22
42
|
# 8-bit unsigned integer type.
|
23
|
-
class UInt8 <
|
43
|
+
class UInt8 < IntegralType
|
24
44
|
end
|
25
45
|
|
26
46
|
# 16-bit unsigned integer type.
|
27
|
-
class UInt16 <
|
47
|
+
class UInt16 < IntegralType
|
28
48
|
end
|
29
49
|
|
30
50
|
# 32-bit unsigned integer type.
|
31
|
-
class UInt32 <
|
51
|
+
class UInt32 < IntegralType
|
32
52
|
end
|
33
53
|
|
34
54
|
# 64-bit unsigned integer type.
|
35
|
-
class UInt64 <
|
55
|
+
class UInt64 < IntegralType
|
36
56
|
end
|
37
57
|
|
38
58
|
# 32-bit floating point type.
|
39
|
-
class Float32 <
|
59
|
+
class Float32 < FractionalType
|
40
60
|
end
|
41
61
|
|
42
62
|
# 64-bit floating point type.
|
43
|
-
class Float64 <
|
63
|
+
class Float64 < FractionalType
|
44
64
|
end
|
45
65
|
|
46
66
|
# Boolean type.
|
@@ -51,31 +71,19 @@ module Polars
|
|
51
71
|
class Utf8 < DataType
|
52
72
|
end
|
53
73
|
|
54
|
-
# Binary type.
|
55
|
-
class Binary < DataType
|
56
|
-
end
|
57
|
-
|
58
|
-
# Type representing Null / None values.
|
59
|
-
class Null < DataType
|
60
|
-
end
|
61
|
-
|
62
|
-
# Type representing Datatype values that could not be determined statically.
|
63
|
-
class Unknown < DataType
|
64
|
-
end
|
65
|
-
|
66
74
|
# Nested list/array type.
|
67
|
-
class List <
|
75
|
+
class List < NestedType
|
68
76
|
def initialize(inner)
|
69
77
|
@inner = Utils.rb_type_to_dtype(inner)
|
70
78
|
end
|
71
79
|
end
|
72
80
|
|
73
81
|
# Calendar date type.
|
74
|
-
class Date <
|
82
|
+
class Date < TemporalType
|
75
83
|
end
|
76
84
|
|
77
85
|
# Calendar date and time type.
|
78
|
-
class Datetime <
|
86
|
+
class Datetime < TemporalType
|
79
87
|
def initialize(time_unit = "us", time_zone = nil)
|
80
88
|
@tu = time_unit || "us"
|
81
89
|
@time_zone = time_zone
|
@@ -83,14 +91,14 @@ module Polars
|
|
83
91
|
end
|
84
92
|
|
85
93
|
# Time duration/delta type.
|
86
|
-
class Duration <
|
94
|
+
class Duration < TemporalType
|
87
95
|
def initialize(time_unit = "us")
|
88
96
|
@tu = time_unit
|
89
97
|
end
|
90
98
|
end
|
91
99
|
|
92
100
|
# Time of day type.
|
93
|
-
class Time <
|
101
|
+
class Time < TemporalType
|
94
102
|
end
|
95
103
|
|
96
104
|
# Type for wrapping arbitrary Ruby objects.
|
@@ -102,15 +110,24 @@ module Polars
|
|
102
110
|
end
|
103
111
|
|
104
112
|
# Definition of a single field within a `Struct` DataType.
|
105
|
-
class Field
|
113
|
+
class Field
|
114
|
+
attr_reader :name, :dtype
|
115
|
+
|
106
116
|
def initialize(name, dtype)
|
107
117
|
@name = name
|
108
118
|
@dtype = Utils.rb_type_to_dtype(dtype)
|
109
119
|
end
|
120
|
+
|
121
|
+
def inspect
|
122
|
+
class_name = self.class.name
|
123
|
+
"#{class_name}(#{@name}: #{@dtype})"
|
124
|
+
end
|
110
125
|
end
|
111
126
|
|
112
127
|
# Struct composite type.
|
113
|
-
class Struct <
|
128
|
+
class Struct < NestedType
|
129
|
+
attr_reader :fields
|
130
|
+
|
114
131
|
def initialize(fields)
|
115
132
|
if fields.is_a?(Hash)
|
116
133
|
@fields = fields.map { |n, d| Field.new(n, d) }
|
@@ -118,5 +135,26 @@ module Polars
|
|
118
135
|
@fields = fields
|
119
136
|
end
|
120
137
|
end
|
138
|
+
|
139
|
+
def inspect
|
140
|
+
class_name = self.class.name
|
141
|
+
"#{class_name}(#{@fields})"
|
142
|
+
end
|
143
|
+
|
144
|
+
def to_schema
|
145
|
+
@fields.to_h { |f| [f.name, f.dtype] }
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
# Binary type.
|
150
|
+
class Binary < DataType
|
151
|
+
end
|
152
|
+
|
153
|
+
# Type representing Null / None values.
|
154
|
+
class Null < DataType
|
155
|
+
end
|
156
|
+
|
157
|
+
# Type representing Datatype values that could not be determined statically.
|
158
|
+
class Unknown < DataType
|
121
159
|
end
|
122
160
|
end
|
data/lib/polars/series.rb
CHANGED
@@ -3667,6 +3667,11 @@ module Polars
|
|
3667
3667
|
rb_temporal_types << ::Time if defined?(::Time)
|
3668
3668
|
|
3669
3669
|
value = _get_first_non_none(values)
|
3670
|
+
if !value.nil?
|
3671
|
+
if value.is_a?(Hash)
|
3672
|
+
return DataFrame.new(values).to_struct(name)._s
|
3673
|
+
end
|
3674
|
+
end
|
3670
3675
|
|
3671
3676
|
if !dtype.nil? && Utils.is_polars_dtype(dtype) && ruby_dtype.nil?
|
3672
3677
|
constructor = polars_type_to_constructor(dtype)
|
data/lib/polars/utils.rb
CHANGED
@@ -160,11 +160,11 @@ module Polars
|
|
160
160
|
|
161
161
|
def self.scale_bytes(sz, to:)
|
162
162
|
scaling_factor = {
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
163
|
+
"b" => 1,
|
164
|
+
"k" => 1024,
|
165
|
+
"m" => 1024 ** 2,
|
166
|
+
"g" => 1024 ** 3,
|
167
|
+
"t" => 1024 ** 4
|
168
168
|
}[to[0]]
|
169
169
|
if scaling_factor > 1
|
170
170
|
sz / scaling_factor.to_f
|
data/lib/polars/version.rb
CHANGED
data/lib/polars-df.rb
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
require_relative "polars"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: polars-df
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-01-
|
11
|
+
date: 2023-01-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -55,6 +55,9 @@ files:
|
|
55
55
|
- ext/polars/src/lazy/utils.rs
|
56
56
|
- ext/polars/src/lib.rs
|
57
57
|
- ext/polars/src/list_construction.rs
|
58
|
+
- ext/polars/src/object.rs
|
59
|
+
- ext/polars/src/prelude.rs
|
60
|
+
- ext/polars/src/rb_modules.rs
|
58
61
|
- ext/polars/src/series.rs
|
59
62
|
- ext/polars/src/set.rs
|
60
63
|
- ext/polars/src/utils.rs
|