RubyGems - polars-df - Versions diffs - 0.1.4 → 0.2.0 - Mend

polars-df 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +13 -0
data/Cargo.lock +430 -217
data/Cargo.toml +2 -0
data/LICENSE.txt +1 -1
data/README.md +0 -2
data/ext/polars/Cargo.toml +9 -3
data/ext/polars/src/apply/dataframe.rs +303 -0
data/ext/polars/src/apply/mod.rs +253 -0
data/ext/polars/src/apply/series.rs +1173 -0
data/ext/polars/src/conversion.rs +254 -35
data/ext/polars/src/dataframe.rs +151 -6
data/ext/polars/src/error.rs +8 -0
data/ext/polars/src/lazy/apply.rs +34 -2
data/ext/polars/src/lazy/dataframe.rs +80 -3
data/ext/polars/src/lazy/dsl.rs +84 -10
data/ext/polars/src/lib.rs +180 -8
data/ext/polars/src/series.rs +328 -10
data/ext/polars/src/utils.rs +25 -0
data/lib/polars/convert.rb +100 -0
data/lib/polars/data_frame.rb +1480 -77
data/lib/polars/data_types.rb +122 -0
data/lib/polars/date_time_expr.rb +10 -10
data/lib/polars/date_time_name_space.rb +8 -8
data/lib/polars/dynamic_group_by.rb +52 -0
data/lib/polars/expr.rb +262 -12
data/lib/polars/functions.rb +194 -5
data/lib/polars/group_by.rb +76 -36
data/lib/polars/io.rb +19 -3
data/lib/polars/lazy_frame.rb +798 -25
data/lib/polars/lazy_functions.rb +569 -30
data/lib/polars/list_expr.rb +1 -1
data/lib/polars/rolling_group_by.rb +35 -0
data/lib/polars/series.rb +192 -27
data/lib/polars/string_expr.rb +6 -5
data/lib/polars/string_name_space.rb +1 -1
data/lib/polars/utils.rb +25 -8
data/lib/polars/version.rb +1 -1
data/lib/polars.rb +38 -29
metadata +11 -4

data/Cargo.toml CHANGED Viewed

@@ -3,6 +3,8 @@ members = ["ext/polars"]
 [patch.crates-io]
 jsonpath_lib = { git = "https://github.com/ritchie46/jsonpath", rev = "24eaf0b4416edff38a4d1b6b17bc4b9f3f047b4b" }
+halfbrown = { git = "https://github.com/Licenser/halfbrown", rev = "952023c5dd6461b009bb5ba66b9aa979bd75949f" }
+arrow2 = { git = "https://github.com/ankane/arrow2", rev = "9f36b2b97446e6dd495473e4361a70d863ac8027" }
 [profile.release]
 strip = true

data/LICENSE.txt CHANGED Viewed

@@ -1,5 +1,5 @@
 Copyright (c) 2020 Ritchie Vink
-Copyright (c) 2022 Andrew Kane
+Copyright (c) 2022-2023 Andrew Kane
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

data/README.md CHANGED Viewed

@@ -12,8 +12,6 @@ Add this line to your application’s Gemfile:
 gem "polars-df"
 ```
-Note: Rust is currently required for installation, and it can take 15-20 minutes to compile the extension.
 ## Getting Started
 This library follows the [Polars Python API](https://pola-rs.github.io/polars/py-polars/html/reference/index.html).

data/ext/polars/Cargo.toml CHANGED Viewed

@@ -1,6 +1,7 @@
 [package]
 name = "polars"
-version = "0.1.4"
+version = "0.2.0"
+license = "MIT"
 authors = ["Andrew Kane <andrew@ankane.org>"]
 edition = "2021"
 publish = false
@@ -9,16 +10,19 @@ publish = false
 crate-type = ["cdylib"]
 [dependencies]
+ahash = "0.8"
 magnus = "0.4"
-polars-core = "0.25.1"
+polars-core = "0.26.1"
 serde_json = "1"
 [dependencies.polars]
-version = "0.25.1"
+version = "0.26.1"
 features = [
     "abs",
     "arange",
     "arg_where",
+    "asof_join",
+    "avro",
     "concat_str",
     "cse",
     "csv-file",
@@ -53,6 +57,7 @@ features = [
     "partition_by",
     "pct_change",
     "performant",
+    "pivot",
     "product",
     "propagate_nans",
     "random",
@@ -61,6 +66,7 @@ features = [
     "repeat_by",
     "rolling_window",
     "round_series",
+    "row_hash",
     "search_sorted",
     "semi_anti_join",
     "serde-lazy",

data/ext/polars/src/apply/dataframe.rs ADDED Viewed

@@ -0,0 +1,303 @@
+use magnus::{class, RArray, TryConvert, Value};
+use polars::prelude::*;
+use polars_core::frame::row::{rows_to_schema_first_non_null, Row};
+use polars_core::series::SeriesIter;
+use super::*;
+use crate::{RbDataFrame, RbPolarsErr, RbSeries, Wrap};
+/// Builds one value iterator per column of `df`, in column order.
+fn get_iters(df: &DataFrame) -> Vec<SeriesIter> {
+    let columns = df.get_columns();
+    let mut column_iters = Vec::with_capacity(columns.len());
+    for column in columns {
+        column_iters.push(column.iter());
+    }
+    column_iters
+}
+/// Builds one value iterator per column of `df`, each already advanced past the
+/// first `skip` values.
+fn get_iters_skip(df: &DataFrame, skip: usize) -> Vec<std::iter::Skip<SeriesIter>> {
+    let columns = df.get_columns();
+    let mut column_iters = Vec::with_capacity(columns.len());
+    for column in columns {
+        column_iters.push(column.iter().skip(skip));
+    }
+    column_iters
+}
+/// Applies `lambda` to the rows of `df` when the output type is not known up front.
+///
+/// Rows are fed to the lambda one at a time until it returns a non-nil value. The Ruby
+/// class of that value selects a typed apply routine, which receives the number of
+/// leading nil results (`null_count`) and the first value so it can continue from the
+/// following row.
+///
+/// Returns the resulting Ruby object and a flag: `true` when the result wraps a data
+/// frame (row output), `false` when it wraps a series.
+///
+/// # Errors
+///
+/// Fails when calling the lambda (or `_s` on its result) raises, when the first non-nil
+/// output has an unsupported type, or when every row produced nil.
+pub fn apply_lambda_unknown<'a>(
+    df: &'a DataFrame,
+    lambda: Value,
+    inference_size: usize,
+) -> RbResult<(Value, bool)> {
+    let mut null_count = 0;
+    let mut iters = get_iters(df);
+    for _ in 0..df.height() {
+        // One value per column makes up the row handed to the lambda.
+        let iter = iters.iter_mut().map(|it| Wrap(it.next().unwrap()));
+        let arg = (iter.collect::<Vec<Wrap<AnyValue>>>(),);
+        let out: Value = lambda.funcall("call", arg)?;
+        if out.is_nil() {
+            // Keep probing: the output type is only known at the first non-nil result.
+            null_count += 1;
+            continue;
+        } else if out.is_kind_of(class::true_class()) || out.is_kind_of(class::false_class()) {
+            let first_value = out.try_convert::<bool>().ok();
+            return Ok((
+                RbSeries::new(
+                    apply_lambda_with_bool_out_type(df, lambda, null_count, first_value)
+                        .into_series(),
+                )
+                .into(),
+                false,
+            ));
+        } else if out.is_kind_of(class::float()) {
+            let first_value = out.try_convert::<f64>().ok();
+            return Ok((
+                RbSeries::new(
+                    apply_lambda_with_primitive_out_type::<Float64Type>(
+                        df,
+                        lambda,
+                        null_count,
+                        first_value,
+                    )
+                    .into_series(),
+                )
+                .into(),
+                false,
+            ));
+        } else if out.is_kind_of(class::integer()) {
+            let first_value = out.try_convert::<i64>().ok();
+            return Ok((
+                RbSeries::new(
+                    apply_lambda_with_primitive_out_type::<Int64Type>(
+                        df,
+                        lambda,
+                        null_count,
+                        first_value,
+                    )
+                    .into_series(),
+                )
+                .into(),
+                false,
+            ));
+        // NOTE: no String branch is wired up here; `apply_lambda_with_utf8_out_type`
+        // exists but is not dispatched to from this function.
+        } else if out.respond_to("_s", true)? {
+            // Objects exposing `_s` are treated as Series wrappers; propagate Ruby-side
+            // failures as errors instead of panicking.
+            let rb_rbseries: Value = out.funcall("_s", ())?;
+            let rb_series = rb_rbseries.try_convert::<&RbSeries>()?;
+            let series = rb_series.series.borrow();
+            let dt = series.dtype();
+            return Ok((
+                RbSeries::new(
+                    apply_lambda_with_list_out_type(df, lambda, null_count, Some(&series), dt)?
+                        .into_series(),
+                )
+                .into(),
+                false,
+            ));
+        } else if let Ok(row) = out.try_convert::<Wrap<Row<'a>>>() {
+            // Convert once and reuse the result rather than converting a second time.
+            let first_value = row.0;
+            return Ok((
+                RbDataFrame::from(
+                    apply_lambda_with_rows_output(
+                        df,
+                        lambda,
+                        null_count,
+                        first_value,
+                        inference_size,
+                    )
+                    .map_err(RbPolarsErr::from)?,
+                )
+                .into(),
+                true,
+            ));
+        } else if out.is_kind_of(class::array()) {
+            // The space before the line continuation keeps the two sentences separated;
+            // the continuation itself strips the next line's leading whitespace.
+            return Err(RbPolarsErr::other(
+                "A list output type is invalid. Do you mean to create polars List Series? \
+                 Then return a Series object."
+                    .into(),
+            ));
+        } else {
+            return Err(RbPolarsErr::other("Could not determine output type".into()));
+        }
+    }
+    // Reached only when no row produced a non-nil value (including an empty frame).
+    Err(RbPolarsErr::other("Could not determine output type".into()))
+}
+/// Lazily calls `lambda` on each row of `df` from index `init_null_count + skip`
+/// onwards, yielding the result converted to `T` (`None` when conversion fails).
+///
+/// # Panics
+///
+/// Panics when the Ruby call itself fails.
+fn apply_iter<T>(
+    df: &DataFrame,
+    lambda: Value,
+    init_null_count: usize,
+    skip: usize,
+) -> impl Iterator<Item = Option<T>> + '_
+where
+    T: TryConvert,
+{
+    let start = init_null_count + skip;
+    let mut column_iters = get_iters_skip(df, start);
+    (start..df.height()).map(move |_| {
+        // Assemble the current row from the next value of every column.
+        let row = column_iters.iter_mut().map(|it| Wrap(it.next().unwrap()));
+        let args = (row.collect::<Vec<Wrap<AnyValue>>>(),);
+        let out = lambda
+            .funcall::<_, _, Value>("call", args)
+            .unwrap_or_else(|e| panic!("ruby function failed {}", e));
+        out.try_convert::<T>().ok()
+    })
+}
+/// Runs `lambda` over the rows of `df` and gathers the results into a numeric
+/// chunked array named "apply".
+///
+/// `init_null_count` is the number of leading rows already known to be null and
+/// `first_value` the already-computed result that follows them, if any.
+pub fn apply_lambda_with_primitive_out_type<D>(
+    df: &DataFrame,
+    lambda: Value,
+    init_null_count: usize,
+    first_value: Option<D::Native>,
+) -> ChunkedArray<D>
+where
+    D: RbArrowPrimitiveType,
+    D::Native: Into<Value> + TryConvert,
+{
+    let height = df.height();
+    // Every row was null: nothing left to evaluate.
+    if init_null_count == height {
+        return ChunkedArray::full_null("apply", height);
+    }
+    // The row that produced `first_value` must not be evaluated again.
+    let skip = usize::from(first_value.is_some());
+    let values = apply_iter(df, lambda, init_null_count, skip);
+    iterator_to_primitive(values, init_null_count, first_value, "apply", height)
+}
+/// Runs `lambda` over the rows of `df` and gathers the results into a boolean
+/// chunked array named "apply".
+///
+/// `init_null_count` is the number of leading rows already known to be null and
+/// `first_value` the already-computed result that follows them, if any.
+pub fn apply_lambda_with_bool_out_type(
+    df: &DataFrame,
+    lambda: Value,
+    init_null_count: usize,
+    first_value: Option<bool>,
+) -> ChunkedArray<BooleanType> {
+    let height = df.height();
+    // Every row was null: nothing left to evaluate.
+    if init_null_count == height {
+        return ChunkedArray::full_null("apply", height);
+    }
+    // The row that produced `first_value` must not be evaluated again.
+    let skip = usize::from(first_value.is_some());
+    let values = apply_iter(df, lambda, init_null_count, skip);
+    iterator_to_bool(values, init_null_count, first_value, "apply", height)
+}
+/// Runs `lambda` over the rows of `df` and gathers the results into a utf8
+/// chunked array named "apply".
+///
+/// `init_null_count` is the number of leading rows already known to be null and
+/// `first_value` the already-computed result that follows them, if any.
+pub fn apply_lambda_with_utf8_out_type(
+    df: &DataFrame,
+    lambda: Value,
+    init_null_count: usize,
+    first_value: Option<&str>,
+) -> Utf8Chunked {
+    let height = df.height();
+    // Every row was null: nothing left to evaluate.
+    if init_null_count == height {
+        return ChunkedArray::full_null("apply", height);
+    }
+    // The row that produced `first_value` must not be evaluated again.
+    let skip = usize::from(first_value.is_some());
+    let values = apply_iter::<String>(df, lambda, init_null_count, skip);
+    iterator_to_utf8(values, init_null_count, first_value, "apply", height)
+}
+/// Runs `lambda` over the rows of `df` and gathers the returned series into a
+/// list chunked array named "apply" with inner dtype `dt`.
+///
+/// `init_null_count` is the number of leading rows already known to be null and
+/// `first_value` the already-computed series that follows them, if any.
+///
+/// # Panics
+///
+/// Panics when the Ruby call fails, or when a non-nil result does not answer `_s`.
+pub fn apply_lambda_with_list_out_type<'a>(
+    df: &'a DataFrame,
+    lambda: Value,
+    init_null_count: usize,
+    first_value: Option<&Series>,
+    dt: &DataType,
+) -> RbResult<ListChunked> {
+    let height = df.height();
+    // Every row was null: nothing left to evaluate.
+    if init_null_count == height {
+        return Ok(ChunkedArray::full_null("apply", height));
+    }
+    // Resume after the leading nulls and, if present, the row behind `first_value`.
+    let start = init_null_count + usize::from(first_value.is_some());
+    let mut column_iters = get_iters_skip(df, start);
+    let series_iter = (start..height).map(|_| {
+        let row = column_iters.iter_mut().map(|it| Wrap(it.next().unwrap()));
+        let args = (row.collect::<Vec<Wrap<AnyValue>>>(),);
+        let out = lambda
+            .funcall::<_, _, Value>("call", args)
+            .unwrap_or_else(|e| panic!("ruby function failed {}", e));
+        match out.funcall::<_, _, Value>("_s", ()) {
+            Ok(inner) => inner
+                .try_convert::<&RbSeries>()
+                .ok()
+                .map(|ps| ps.series.borrow().clone()),
+            // A nil result is a null entry; anything else without `_s` is a misuse.
+            Err(_) if out.is_nil() => None,
+            Err(_) => panic!("should return a Series, got a {:?}", out),
+        }
+    });
+    iterator_to_list(dt, series_iter, init_null_count, first_value, "apply", height)
+}
+/// Runs `lambda` over the rows of `df` where each result is itself a row, and
+/// assembles the results into a new `DataFrame`.
+///
+/// * `init_null_count` - number of leading rows that are emitted as all-null rows.
+/// * `first_value` - the already-computed row that follows those nulls; its length
+///   fixes the width of the null rows.
+/// * `inference_size` - how many further rows are materialised, alongside
+///   `first_value`, before the schema is inferred.
+///
+/// Results that do not convert to a Ruby array are replaced by an all-null row.
+///
+/// # Panics
+///
+/// Panics when the Ruby call fails or when an array element cannot be converted to
+/// an `AnyValue`.
+pub fn apply_lambda_with_rows_output<'a>(
+    df: &'a DataFrame,
+    lambda: Value,
+    init_null_count: usize,
+    first_value: Row<'a>,
+    inference_size: usize,
+) -> PolarsResult<DataFrame> {
+    let width = first_value.0.len();
+    let null_row = Row::new(vec![AnyValue::Null; width]);
+    // Scratch row that is cleared and refilled for every lambda result.
+    let mut row_buf = Row::default();
+    // The row that produced `first_value` is always skipped.
+    let skip = 1;
+    let mut iters = get_iters_skip(df, init_null_count + skip);
+    let mut row_iter = ((init_null_count + skip)..df.height()).map(|_| {
+        let iter = iters.iter_mut().map(|it| Wrap(it.next().unwrap()));
+        let tpl = (iter.collect::<Vec<Wrap<AnyValue>>>(),);
+        match lambda.funcall::<_, _, Value>("call", tpl) {
+            Ok(val) => {
+                match val.try_convert::<RArray>().ok() {
+                    Some(tuple) => {
+                        row_buf.0.clear();
+                        for v in tuple.each() {
+                            let v = v.unwrap().try_convert::<Wrap<AnyValue>>().unwrap().0;
+                            row_buf.0.push(v);
+                        }
+                        let ptr = &row_buf as *const Row;
+                        // Safety:
+                        // we know that row constructor of polars dataframe does not keep a reference
+                        // to the row. Before we mutate the row buf again, the reference is dropped.
+                        // we only cannot prove it to the compiler.
+                        // we still do this because it saves a Vec allocation in a hot loop.
+                        unsafe { &*ptr }
+                    }
+                    // Non-array results (e.g. nil) become an all-null row.
+                    None => &null_row,
+                }
+            }
+            Err(e) => panic!("ruby function failed {}", e),
+        }
+    });
+    // first rows for schema inference
+    // The rows are cloned into `buf`, so they do not alias the reused `row_buf`.
+    let mut buf = Vec::with_capacity(inference_size);
+    buf.push(first_value);
+    buf.extend((&mut row_iter).take(inference_size).cloned());
+    // NOTE(review): the second argument is hard-coded to 50 and independent of
+    // `inference_size`; presumably an inference length limit — confirm against polars.
+    let schema = rows_to_schema_first_non_null(&buf, Some(50));
+    if init_null_count > 0 {
+        // Safety: we know the iterators size
+        // (leading null rows + buffered rows + remaining rows is declared as `df.height()`)
+        let iter = unsafe {
+            (0..init_null_count)
+                .map(|_| &null_row)
+                .chain(buf.iter())
+                .chain(row_iter)
+                .trust_my_length(df.height())
+        };
+        DataFrame::from_rows_iter_and_schema(iter, &schema)
+    } else {
+        // Safety: we know the iterators size
+        let iter = unsafe { buf.iter().chain(row_iter).trust_my_length(df.height()) };
+        DataFrame::from_rows_iter_and_schema(iter, &schema)
+    }
+}

data/ext/polars/src/apply/mod.rs ADDED Viewed

@@ -0,0 +1,253 @@
+pub mod dataframe;
+pub mod series;
+use magnus::{RHash, Value};
+use polars::chunked_array::builder::get_list_builder;
+use polars::prelude::*;
+use polars_core::export::rayon::prelude::*;
+use polars_core::utils::CustomIterTools;
+use polars_core::POOL;
+use crate::{ObjectValue, RbPolarsErr, RbResult, RbSeries, Wrap};
+/// Marker trait for the numeric polars types accepted by the primitive apply helpers.
+pub trait RbArrowPrimitiveType: PolarsNumericType {}
+/// Implements the marker trait for every listed type.
+macro_rules! impl_rb_arrow_primitive_type {
+    ($($ty:ty),+ $(,)?) => {
+        $(impl RbArrowPrimitiveType for $ty {})+
+    };
+}
+impl_rb_arrow_primitive_type!(
+    UInt8Type,
+    UInt16Type,
+    UInt32Type,
+    UInt64Type,
+    Int8Type,
+    Int16Type,
+    Int32Type,
+    Int64Type,
+    Float32Type,
+    Float64Type,
+);
+/// Builds a struct series named `name` from `first_value` followed by the items of `it`.
+///
+/// `first_value` must be a struct value; it supplies the field definitions and the
+/// first entry of every field. Each field buffer starts with `init_null_count` nulls.
+/// `capacity` is used as the initial capacity of every field buffer.
+///
+/// # Errors
+///
+/// Returns a compute error when `first_value` is not a struct, when an item is not a
+/// Ruby hash, or when a hash has a different number of entries than the struct has fields.
+///
+/// # Panics
+///
+/// NOTE(review): the branch handling a non-nil item currently ends in `todo!()`, so
+/// any `Some` item with the right number of entries panics.
+fn iterator_to_struct(
+    it: impl Iterator<Item = Option<Value>>,
+    init_null_count: usize,
+    first_value: AnyValue,
+    name: &str,
+    capacity: usize,
+) -> RbResult<RbSeries> {
+    // Pull the per-field values and field definitions out of the first struct value.
+    let (vals, flds) = match &first_value {
+        av @ AnyValue::Struct(_, _, flds) => (av._iter_struct_av().collect::<Vec<_>>(), &**flds),
+        AnyValue::StructOwned(payload) => (payload.0.clone(), &*payload.1),
+        _ => {
+            return Err(crate::error::ComputeError::new_err(format!(
+                "expected struct got {first_value:?}",
+            )))
+        }
+    };
+    let struct_width = vals.len();
+    // every item in the struct is kept as its own buffer of anyvalues
+    // so as struct with 2 items: {a, b}
+    // will have
+    // [
+    //      [ a values ]
+    //      [ b values ]
+    // ]
+    let mut items = Vec::with_capacity(vals.len());
+    for item in vals {
+        let mut buf = Vec::with_capacity(capacity);
+        // Leading nulls come first, then the value taken from `first_value`.
+        for _ in 0..init_null_count {
+            buf.push(AnyValue::Null);
+        }
+        buf.push(item.clone());
+        items.push(buf);
+    }
+    for dict in it {
+        match dict {
+            // A missing item is a null in every field.
+            None => {
+                for field_items in &mut items {
+                    field_items.push(AnyValue::Null);
+                }
+            }
+            Some(dict) => {
+                let dict = dict.try_convert::<RHash>()?;
+                if dict.len() != struct_width {
+                    return Err(crate::error::ComputeError::new_err(
+                        format!("Cannot create struct type.\n> The struct dtype expects {} fields, but it got a dict with {} fields.", struct_width, dict.len())
+                    ));
+                }
+                // we ignore the keys of the rest of the dicts
+                // the first item determines the output name
+                // NOTE(review): not implemented yet; the intended logic is sketched in
+                // the commented-out loop below.
+                todo!()
+                // for ((_, val), field_items) in dict.iter().zip(&mut items) {
+                //     let item = val.try_convert::<Wrap<AnyValue>>()?;
+                //     field_items.push(item.0)
+                // }
+            }
+        }
+    }
+    // Build one series per field, pairing each buffer with its field definition;
+    // the work is run through `POOL` using a parallel iterator.
+    let fields = POOL.install(|| {
+        items
+            .par_iter()
+            .zip(flds)
+            .map(|(av, fld)| Series::new(fld.name(), av))
+            .collect::<Vec<_>>()
+    });
+    Ok(StructChunked::new(name, &fields)
+        .unwrap()
+        .into_series()
+        .into())
+}
+/// Collects `init_null_count` nulls, then `first_value`, then the items of `it`
+/// into a numeric chunked array named `name`.
+///
+/// `capacity` is the length declared to the trusted-length collector; in debug
+/// builds the resulting length is asserted to equal it.
+fn iterator_to_primitive<T>(
+    it: impl Iterator<Item = Option<T::Native>>,
+    init_null_count: usize,
+    first_value: Option<T::Native>,
+    name: &str,
+    capacity: usize,
+) -> ChunkedArray<T>
+where
+    T: RbArrowPrimitiveType,
+{
+    let mut ca: ChunkedArray<T> = match (init_null_count, first_value) {
+        // No prefix at all: a plain collect is enough.
+        (0, None) => it.collect(),
+        // safety: we know the iterators len
+        (0, first @ Some(_)) => unsafe {
+            std::iter::once(first)
+                .chain(it)
+                .trust_my_length(capacity)
+                .collect_trusted()
+        },
+        // safety: we know the iterators len
+        (null_prefix, first) => unsafe {
+            (0..null_prefix)
+                .map(|_| None)
+                .chain(std::iter::once(first))
+                .chain(it)
+                .trust_my_length(capacity)
+                .collect_trusted()
+        },
+    };
+    debug_assert_eq!(ca.len(), capacity);
+    ca.rename(name);
+    ca
+}
+/// Collects `init_null_count` nulls, then `first_value`, then the items of `it`
+/// into a boolean chunked array named `name`.
+///
+/// `capacity` is the length declared to the trusted-length collector; in debug
+/// builds the resulting length is asserted to equal it.
+fn iterator_to_bool(
+    it: impl Iterator<Item = Option<bool>>,
+    init_null_count: usize,
+    first_value: Option<bool>,
+    name: &str,
+    capacity: usize,
+) -> ChunkedArray<BooleanType> {
+    let mut ca: BooleanChunked = match (init_null_count, first_value) {
+        // No prefix at all: a plain collect is enough.
+        (0, None) => it.collect(),
+        // safety: we know the iterators len
+        (0, first @ Some(_)) => unsafe {
+            std::iter::once(first)
+                .chain(it)
+                .trust_my_length(capacity)
+                .collect_trusted()
+        },
+        // safety: we know the iterators len
+        (null_prefix, first) => unsafe {
+            (0..null_prefix)
+                .map(|_| None)
+                .chain(std::iter::once(first))
+                .chain(it)
+                .trust_my_length(capacity)
+                .collect_trusted()
+        },
+    };
+    debug_assert_eq!(ca.len(), capacity);
+    ca.rename(name);
+    ca
+}
+/// Collects `init_null_count` nulls, then `first_value`, then the items of `it`
+/// into an object chunked array named `name`.
+///
+/// `capacity` is the length declared to the trusted-length collector; in debug
+/// builds the resulting length is asserted to equal it.
+fn iterator_to_object(
+    it: impl Iterator<Item = Option<ObjectValue>>,
+    init_null_count: usize,
+    first_value: Option<ObjectValue>,
+    name: &str,
+    capacity: usize,
+) -> ObjectChunked<ObjectValue> {
+    let mut ca: ObjectChunked<ObjectValue> = match (init_null_count, first_value) {
+        // No prefix at all: a plain collect is enough.
+        (0, None) => it.collect(),
+        // safety: we know the iterators len
+        (0, first @ Some(_)) => unsafe {
+            std::iter::once(first)
+                .chain(it)
+                .trust_my_length(capacity)
+                .collect_trusted()
+        },
+        // safety: we know the iterators len
+        (null_prefix, first) => unsafe {
+            (0..null_prefix)
+                .map(|_| None)
+                .chain(std::iter::once(first))
+                .chain(it)
+                .trust_my_length(capacity)
+                .collect_trusted()
+        },
+    };
+    debug_assert_eq!(ca.len(), capacity);
+    ca.rename(name);
+    ca
+}
+/// Collects `init_null_count` nulls, then `first_value`, then the items of `it`
+/// into a utf8 chunked array named `name`.
+///
+/// `capacity` is the length declared to the trusted-length collector; in debug
+/// builds the resulting length is asserted to equal it.
+fn iterator_to_utf8(
+    it: impl Iterator<Item = Option<String>>,
+    init_null_count: usize,
+    first_value: Option<&str>,
+    name: &str,
+    capacity: usize,
+) -> Utf8Chunked {
+    // Own the first value so it has the same item type as `it`.
+    let owned_first = first_value.map(|v| v.to_string());
+    let mut ca: Utf8Chunked = match (init_null_count, owned_first) {
+        // No prefix at all: a plain collect is enough.
+        (0, None) => it.collect(),
+        // safety: we know the iterators len
+        (0, first @ Some(_)) => unsafe {
+            std::iter::once(first)
+                .chain(it)
+                .trust_my_length(capacity)
+                .collect_trusted()
+        },
+        // safety: we know the iterators len
+        (null_prefix, first) => unsafe {
+            (0..null_prefix)
+                .map(|_| None)
+                .chain(std::iter::once(first))
+                .chain(it)
+                .trust_my_length(capacity)
+                .collect_trusted()
+        },
+    };
+    debug_assert_eq!(ca.len(), capacity);
+    ca.rename(name);
+    ca
+}
+/// Appends `init_null_count` nulls, then `first_value`, then the items of `it`
+/// to a list builder for inner dtype `dt` and returns the finished list array.
+///
+/// An empty series whose dtype differs from `dt` is replaced by an empty series of
+/// dtype `dt` before it is appended.
+fn iterator_to_list(
+    dt: &DataType,
+    it: impl Iterator<Item = Option<Series>>,
+    init_null_count: usize,
+    first_value: Option<&Series>,
+    name: &str,
+    capacity: usize,
+) -> RbResult<ListChunked> {
+    let mut builder =
+        get_list_builder(dt, capacity * 5, capacity, name).map_err(RbPolarsErr::from)?;
+    (0..init_null_count).for_each(|_| builder.append_null());
+    builder.append_opt_series(first_value);
+    for maybe_series in it {
+        if let Some(s) = maybe_series {
+            let mismatched_empty = s.len() == 0 && s.dtype() != dt;
+            if mismatched_empty {
+                builder.append_series(&Series::full_null("", 0, dt));
+            } else {
+                builder.append_series(&s);
+            }
+        } else {
+            builder.append_null();
+        }
+    }
+    Ok(builder.finish())
+}