polars-df 0.1.1 → 0.1.2

@@ -1,4 +1,5 @@
 use magnus::{RArray, RHash, Value};
+use polars::io::RowCount;
 use polars::lazy::frame::{LazyFrame, LazyGroupBy};
 use polars::prelude::*;
 use std::cell::RefCell;
@@ -52,6 +53,137 @@ impl From<LazyFrame> for RbLazyFrame {
 }
 
 impl RbLazyFrame {
+    pub fn new_from_ndjson(
+        path: String,
+        infer_schema_length: Option<usize>,
+        batch_size: Option<usize>,
+        n_rows: Option<usize>,
+        low_memory: bool,
+        rechunk: bool,
+        row_count: Option<(String, IdxSize)>,
+    ) -> RbResult<Self> {
+        let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+
+        let lf = LazyJsonLineReader::new(path)
+            .with_infer_schema_length(infer_schema_length)
+            .with_batch_size(batch_size)
+            .with_n_rows(n_rows)
+            .low_memory(low_memory)
+            .with_rechunk(rechunk)
+            .with_row_count(row_count)
+            .finish()
+            .map_err(RbPolarsErr::from)?;
+        Ok(lf.into())
+    }
+
+    pub fn new_from_csv(arguments: &[Value]) -> RbResult<Self> {
+        // start arguments
+        // this pattern is needed for more than 16 arguments
+        let path: String = arguments[0].try_convert()?;
+        let sep: String = arguments[1].try_convert()?;
+        let has_header: bool = arguments[2].try_convert()?;
+        let ignore_errors: bool = arguments[3].try_convert()?;
+        let skip_rows: usize = arguments[4].try_convert()?;
+        let n_rows: Option<usize> = arguments[5].try_convert()?;
+        let cache: bool = arguments[6].try_convert()?;
+        let overwrite_dtype: Option<Vec<(String, Wrap<DataType>)>> = arguments[7].try_convert()?;
+        let low_memory: bool = arguments[8].try_convert()?;
+        let comment_char: Option<String> = arguments[9].try_convert()?;
+        let quote_char: Option<String> = arguments[10].try_convert()?;
+        let null_values: Option<Wrap<NullValues>> = arguments[11].try_convert()?;
+        let infer_schema_length: Option<usize> = arguments[12].try_convert()?;
+        let with_schema_modify: Option<Value> = arguments[13].try_convert()?;
+        let rechunk: bool = arguments[14].try_convert()?;
+        let skip_rows_after_header: usize = arguments[15].try_convert()?;
+        let encoding: Wrap<CsvEncoding> = arguments[16].try_convert()?;
+        let row_count: Option<(String, IdxSize)> = arguments[17].try_convert()?;
+        let parse_dates: bool = arguments[18].try_convert()?;
+        let eol_char: String = arguments[19].try_convert()?;
+        // end arguments
+
+        let null_values = null_values.map(|w| w.0);
+        let comment_char = comment_char.map(|s| s.as_bytes()[0]);
+        let quote_char = quote_char.map(|s| s.as_bytes()[0]);
+        let delimiter = sep.as_bytes()[0];
+        let eol_char = eol_char.as_bytes()[0];
+
+        let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+
+        let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
+            let fields = overwrite_dtype
+                .into_iter()
+                .map(|(name, dtype)| Field::new(&name, dtype.0));
+            Schema::from(fields)
+        });
+        let r = LazyCsvReader::new(path)
+            .with_infer_schema_length(infer_schema_length)
+            .with_delimiter(delimiter)
+            .has_header(has_header)
+            .with_ignore_parser_errors(ignore_errors)
+            .with_skip_rows(skip_rows)
+            .with_n_rows(n_rows)
+            .with_cache(cache)
+            .with_dtype_overwrite(overwrite_dtype.as_ref())
+            .low_memory(low_memory)
+            .with_comment_char(comment_char)
+            .with_quote_char(quote_char)
+            .with_end_of_line_char(eol_char)
+            .with_rechunk(rechunk)
+            .with_skip_rows_after_header(skip_rows_after_header)
+            .with_encoding(encoding.0)
+            .with_row_count(row_count)
+            .with_parse_dates(parse_dates)
+            .with_null_values(null_values);
+
+        if let Some(_lambda) = with_schema_modify {
+            todo!();
+        }
+
+        Ok(r.finish().map_err(RbPolarsErr::from)?.into())
+    }
+
+    pub fn new_from_parquet(
+        path: String,
+        n_rows: Option<usize>,
+        cache: bool,
+        parallel: Wrap<ParallelStrategy>,
+        rechunk: bool,
+        row_count: Option<(String, IdxSize)>,
+        low_memory: bool,
+    ) -> RbResult<Self> {
+        let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+        let args = ScanArgsParquet {
+            n_rows,
+            cache,
+            parallel: parallel.0,
+            rechunk,
+            row_count,
+            low_memory,
+        };
+        let lf = LazyFrame::scan_parquet(path, args).map_err(RbPolarsErr::from)?;
+        Ok(lf.into())
+    }
+
+    pub fn new_from_ipc(
+        path: String,
+        n_rows: Option<usize>,
+        cache: bool,
+        rechunk: bool,
+        row_count: Option<(String, IdxSize)>,
+        memory_map: bool,
+    ) -> RbResult<Self> {
+        let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+        let args = ScanArgsIpc {
+            n_rows,
+            cache,
+            rechunk,
+            row_count,
+            memmap: memory_map,
+        };
+        let lf = LazyFrame::scan_ipc(path, args).map_err(RbPolarsErr::from)?;
+        Ok(lf.into())
+    }
+
     pub fn write_json(&self, rb_f: Value) -> RbResult<()> {
         let file = BufWriter::new(get_file_like(rb_f, true)?);
         serde_json::to_writer(file, &self.ldf.logical_plan)
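
The four constructors above back the lazy scan entry points. A minimal sketch of how they are reached from Ruby, assuming the gem wires them up through scan_csv-style wrappers as py-polars does (the wrapper names are an assumption):

    # Build a lazy query against a CSV; nothing is read until collect.
    lf = Polars.scan_csv("data.csv")   # assumed to call RbLazyFrame.new_from_csv
    df = lf.filter(Polars.col("value") > 10)
           .select([Polars.col("id"), Polars.col("value")])
           .collect
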
@@ -6,6 +6,7 @@ use polars::prelude::*;
 use polars::series::ops::NullBehavior;
 
 use crate::conversion::*;
+use crate::lazy::apply::*;
 use crate::lazy::utils::rb_exprs_to_exprs;
 use crate::RbResult;
 
@@ -901,6 +902,10 @@ impl RbExpr {
         self.inner.clone().suffix(&suffix).into()
     }
 
+    pub fn exclude(&self, columns: Vec<String>) -> Self {
+        self.inner.clone().exclude(columns).into()
+    }
+
     pub fn interpolate(&self) -> Self {
         self.inner.clone().interpolate().into()
     }
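
exclude is the complement of column selection: it drops the named columns from a wildcard or multi-column expression. A hedged Ruby-side sketch (assumes an Expr#exclude wrapper over the binding registered later in this diff):

    # Select every column except "id".
    df.lazy.select([Polars.col("*").exclude(["id"])]).collect
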
@@ -1333,6 +1338,29 @@ pub fn col(name: String) -> RbExpr {
     dsl::col(&name).into()
 }
 
+pub fn count() -> RbExpr {
+    dsl::count().into()
+}
+
+pub fn first() -> RbExpr {
+    dsl::first().into()
+}
+
+pub fn last() -> RbExpr {
+    dsl::last().into()
+}
+
+pub fn cols(names: Vec<String>) -> RbExpr {
+    dsl::cols(names).into()
+}
+
+pub fn fold(acc: &RbExpr, lambda: Value, exprs: RArray) -> RbResult<RbExpr> {
+    let exprs = rb_exprs_to_exprs(exprs)?;
+
+    let func = move |a: Series, b: Series| binary_lambda(lambda, a, b);
+    Ok(polars::lazy::dsl::fold_exprs(acc.inner.clone(), func, exprs).into())
+}
+
 // TODO improve
 pub fn lit(value: Value) -> RbResult<RbExpr> {
     if value.is_nil() {
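
count, first, last, and cols are thin wrappers over the polars dsl, while fold threads a Ruby lambda through binary_lambda to reduce across columns row-wise. A sketch of plausible Ruby usage (wrapper names assumed from the singleton-method registrations later in this diff):

    # Horizontal sum of two columns via fold, starting from a literal 0.
    df.lazy.select([
      Polars.fold(Polars.lit(0), ->(acc, x) { acc + x }, [Polars.col("a"), Polars.col("b")])
    ]).collect

    # Number of rows in the frame.
    df.lazy.select([Polars.count]).collect
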
@@ -1387,3 +1415,13 @@ impl RbWhenThen {
 pub fn when(predicate: &RbExpr) -> RbWhen {
     dsl::when(predicate.inner.clone()).into()
 }
+
+pub fn concat_str(s: RArray, sep: String) -> RbResult<RbExpr> {
+    let s = rb_exprs_to_exprs(s)?;
+    Ok(dsl::concat_str(s, &sep).into())
+}
+
+pub fn concat_lst(s: RArray) -> RbResult<RbExpr> {
+    let s = rb_exprs_to_exprs(s)?;
+    Ok(dsl::concat_lst(s).into())
+}
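
concat_str joins string columns element-wise with a separator, and concat_lst packs columns into a single list column. A sketch, assuming the Ruby wrappers mirror these signatures:

    # One "foo-bar" string per row from columns "a" and "b".
    df.lazy.select([Polars.concat_str([Polars.col("a"), Polars.col("b")], "-")]).collect
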
@@ -7,7 +7,7 @@ impl RbExpr {
             .meta()
             .pop()
             .into_iter()
-            .map(|v| RbExpr::from(v))
+            .map(RbExpr::from)
             .collect()
     }
 
@@ -1,3 +1,4 @@
+pub mod apply;
 pub mod dataframe;
 pub mod dsl;
 pub mod meta;
@@ -1,3 +1,4 @@
+mod batched_csv;
 mod conversion;
 mod dataframe;
 mod error;
@@ -5,14 +6,18 @@ mod file;
 mod lazy;
 mod series;
 
-use conversion::get_df;
+use batched_csv::RbBatchedCsv;
+use conversion::*;
 use dataframe::RbDataFrame;
 use error::{RbPolarsErr, RbValueError};
+use file::get_file_like;
 use lazy::dataframe::{RbLazyFrame, RbLazyGroupBy};
 use lazy::dsl::{RbExpr, RbWhen, RbWhenThen};
 use magnus::{
-    define_module, function, memoize, method, prelude::*, Error, RArray, RClass, RModule,
+    define_module, function, memoize, method, prelude::*, Error, RArray, RClass, RHash, RModule,
+    Value,
 };
+use polars::datatypes::DataType;
 use polars::error::PolarsResult;
 use polars::frame::DataFrame;
 use polars::functions::{diag_concat_df, hor_concat_df};
@@ -34,11 +39,19 @@ fn init() -> RbResult<()> {
     module.define_singleton_method("_concat_df", function!(concat_df, 1))?;
     module.define_singleton_method("_diag_concat_df", function!(rb_diag_concat_df, 1))?;
     module.define_singleton_method("_hor_concat_df", function!(rb_hor_concat_df, 1))?;
+    module.define_singleton_method("_concat_series", function!(concat_series, 1))?;
+    module.define_singleton_method("_ipc_schema", function!(ipc_schema, 1))?;
+    module.define_singleton_method("_parquet_schema", function!(parquet_schema, 1))?;
+
+    let class = module.define_class("RbBatchedCsv", Default::default())?;
+    class.define_singleton_method("new", function!(RbBatchedCsv::new, -1))?;
+    class.define_method("next_batches", method!(RbBatchedCsv::next_batches, 1))?;
 
     let class = module.define_class("RbDataFrame", Default::default())?;
     class.define_singleton_method("new", function!(RbDataFrame::init, 1))?;
-    class.define_singleton_method("read_csv", function!(RbDataFrame::read_csv, 2))?;
+    class.define_singleton_method("read_csv", function!(RbDataFrame::read_csv, -1))?;
     class.define_singleton_method("read_parquet", function!(RbDataFrame::read_parquet, 1))?;
+    class.define_singleton_method("read_ipc", function!(RbDataFrame::read_ipc, 6))?;
     class.define_singleton_method("read_hash", function!(RbDataFrame::read_hash, 1))?;
     class.define_singleton_method("read_json", function!(RbDataFrame::read_json, 1))?;
     class.define_singleton_method("read_ndjson", function!(RbDataFrame::read_ndjson, 1))?;
@@ -46,6 +59,7 @@ fn init() -> RbResult<()> {
     class.define_method("write_json", method!(RbDataFrame::write_json, 3))?;
     class.define_method("write_ndjson", method!(RbDataFrame::write_ndjson, 1))?;
     class.define_method("write_csv", method!(RbDataFrame::write_csv, 10))?;
+    class.define_method("write_ipc", method!(RbDataFrame::write_ipc, 2))?;
     class.define_method("write_parquet", method!(RbDataFrame::write_parquet, 5))?;
     class.define_method("rechunk", method!(RbDataFrame::rechunk, 0))?;
     class.define_method("to_s", method!(RbDataFrame::to_s, 0))?;
@@ -294,6 +308,7 @@ fn init() -> RbResult<()> {
     class.define_method("keep_name", method!(RbExpr::keep_name, 0))?;
     class.define_method("prefix", method!(RbExpr::prefix, 1))?;
     class.define_method("suffix", method!(RbExpr::suffix, 1))?;
+    class.define_method("exclude", method!(RbExpr::exclude, 1))?;
     class.define_method("interpolate", method!(RbExpr::interpolate, 0))?;
     class.define_method("rolling_sum", method!(RbExpr::rolling_sum, 6))?;
     class.define_method("rolling_min", method!(RbExpr::rolling_min, 6))?;
@@ -364,11 +379,28 @@ fn init() -> RbResult<()> {
 
     // maybe add to different class
     class.define_singleton_method("col", function!(crate::lazy::dsl::col, 1))?;
+    class.define_singleton_method("count", function!(crate::lazy::dsl::count, 0))?;
+    class.define_singleton_method("first", function!(crate::lazy::dsl::first, 0))?;
+    class.define_singleton_method("last", function!(crate::lazy::dsl::last, 0))?;
+    class.define_singleton_method("cols", function!(crate::lazy::dsl::cols, 1))?;
+    class.define_singleton_method("fold", function!(crate::lazy::dsl::fold, 3))?;
     class.define_singleton_method("lit", function!(crate::lazy::dsl::lit, 1))?;
     class.define_singleton_method("arange", function!(crate::lazy::dsl::arange, 3))?;
     class.define_singleton_method("when", function!(crate::lazy::dsl::when, 1))?;
+    class.define_singleton_method("concat_str", function!(crate::lazy::dsl::concat_str, 2))?;
+    class.define_singleton_method("concat_lst", function!(crate::lazy::dsl::concat_lst, 1))?;
 
     let class = module.define_class("RbLazyFrame", Default::default())?;
+    class.define_singleton_method(
+        "new_from_ndjson",
+        function!(RbLazyFrame::new_from_ndjson, 7),
+    )?;
+    class.define_singleton_method("new_from_csv", function!(RbLazyFrame::new_from_csv, -1))?;
+    class.define_singleton_method(
+        "new_from_parquet",
+        function!(RbLazyFrame::new_from_parquet, 7),
+    )?;
+    class.define_singleton_method("new_from_ipc", function!(RbLazyFrame::new_from_ipc, 6))?;
     class.define_method("write_json", method!(RbLazyFrame::write_json, 1))?;
     class.define_method("describe_plan", method!(RbLazyFrame::describe_plan, 0))?;
     class.define_method(
@@ -567,3 +599,45 @@ fn rb_hor_concat_df(seq: RArray) -> RbResult<RbDataFrame> {
     let df = hor_concat_df(&dfs).map_err(RbPolarsErr::from)?;
     Ok(df.into())
 }
+
+fn concat_series(seq: RArray) -> RbResult<RbSeries> {
+    let mut iter = seq.each();
+    let first = iter.next().unwrap()?;
+
+    let mut s = get_series(first)?;
+
+    for res in iter {
+        let item = res?;
+        let item = get_series(item)?;
+        s.append(&item).map_err(RbPolarsErr::from)?;
+    }
+    Ok(s.into())
+}
+
+fn ipc_schema(rb_f: Value) -> RbResult<Value> {
+    use polars::export::arrow::io::ipc::read::read_file_metadata;
+    let mut r = get_file_like(rb_f, false)?;
+    let metadata = read_file_metadata(&mut r).map_err(RbPolarsErr::arrow)?;
+
+    let dict = RHash::new();
+    for field in metadata.schema.fields {
+        let dt: Wrap<DataType> = Wrap((&field.data_type).into());
+        dict.aset(field.name, dt)?;
+    }
+    Ok(dict.into())
+}
+
+fn parquet_schema(rb_f: Value) -> RbResult<Value> {
+    use polars::export::arrow::io::parquet::read::{infer_schema, read_metadata};
+
+    let mut r = get_file_like(rb_f, false)?;
+    let metadata = read_metadata(&mut r).map_err(RbPolarsErr::arrow)?;
+    let arrow_schema = infer_schema(&metadata).map_err(RbPolarsErr::arrow)?;
+
+    let dict = RHash::new();
+    for field in arrow_schema.fields {
+        let dt: Wrap<DataType> = Wrap((&field.data_type).into());
+        dict.aset(field.name, dt)?;
+    }
+    Ok(dict.into())
+}
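
ipc_schema and parquet_schema read only file metadata and hand Ruby a name-to-dtype hash, so a schema can be reported without materializing any rows. Plausible usage through read_ipc_schema/read_parquet_schema wrappers (names hypothetical, mirroring py-polars):

    # Peek at a Parquet schema without reading data (wrapper name is an assumption).
    Polars.read_parquet_schema("data.parquet")
    # => {"id" => Polars::Int64, "name" => Polars::Utf8}  (illustrative output)
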
@@ -116,11 +116,10 @@ init_method_opt!(new_opt_f32, Float32Type, f32);
 init_method_opt!(new_opt_f64, Float64Type, f64);
 
 impl RbSeries {
-    pub fn new_str(name: String, val: RArray, _strict: bool) -> RbResult<Self> {
-        let v = val.try_convert::<Vec<Option<String>>>()?;
-        let mut s = Utf8Chunked::new(&name, v).into_series();
+    pub fn new_str(name: String, val: Wrap<Utf8Chunked>, _strict: bool) -> Self {
+        let mut s = val.0.into_series();
         s.rename(&name);
-        Ok(RbSeries::new(s))
+        RbSeries::new(s)
     }
 
     pub fn estimated_size(&self) -> usize {
@@ -199,16 +198,16 @@ impl RbSeries {
         self.series.borrow_mut().rename(&name);
     }
 
-    pub fn dtype(&self) -> String {
-        self.series.borrow().dtype().to_string()
+    pub fn dtype(&self) -> Value {
+        Wrap(self.series.borrow().dtype().clone()).into()
     }
 
-    pub fn inner_dtype(&self) -> Option<String> {
+    pub fn inner_dtype(&self) -> Option<Value> {
         self.series
             .borrow()
             .dtype()
             .inner_dtype()
-            .map(|dt| dt.to_string())
+            .map(|dt| Wrap(dt.clone()).into())
     }
 
     pub fn set_sorted(&self, reverse: bool) -> Self {
@@ -634,5 +633,5 @@ impl RbSeries {
 }
 
 pub fn to_rbseries_collection(s: Vec<Series>) -> Vec<RbSeries> {
-    s.into_iter().map(|v| RbSeries::new(v)).collect()
+    s.into_iter().map(RbSeries::new).collect()
 }
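
With these changes dtype and inner_dtype hand back the DataType itself through Wrap instead of flattening it to a String, so the Ruby layer can compare dtypes structurally. An illustrative sketch (the exact Ruby-side representation produced by Wrap<DataType> is an assumption):

    s = Polars::Series.new("a", [1, 2, 3])
    s.dtype  # previously the String "i64"; now a dtype object such as Polars::Int64
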
@@ -0,0 +1,95 @@
+module Polars
+  class BatchedCsvReader
+    attr_accessor :_reader, :new_columns
+
+    def initialize(
+      file,
+      has_header: true,
+      columns: nil,
+      sep: ",",
+      comment_char: nil,
+      quote_char: '"',
+      skip_rows: 0,
+      dtypes: nil,
+      null_values: nil,
+      ignore_errors: false,
+      parse_dates: false,
+      n_threads: nil,
+      infer_schema_length: 100,
+      batch_size: 50_000,
+      n_rows: nil,
+      encoding: "utf8",
+      low_memory: false,
+      rechunk: true,
+      skip_rows_after_header: 0,
+      row_count_name: nil,
+      row_count_offset: 0,
+      sample_size: 1024,
+      eol_char: "\n",
+      new_columns: nil
+    )
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        path = Utils.format_path(file)
+      end
+
+      dtype_list = nil
+      dtype_slice = nil
+      if !dtypes.nil?
+        if dtypes.is_a?(Hash)
+          dtype_list = []
+          dtypes.each do |k, v|
+            dtype_list << [k, Utils.rb_type_to_dtype(v)]
+          end
+        elsif dtypes.is_a?(Array)
+          dtype_slice = dtypes
+        else
+          raise ArgumentError, "dtypes arg should be a Hash or Array"
+        end
+      end
+
+      processed_null_values = Utils._process_null_values(null_values)
+      projection, columns = Utils.handle_projection_columns(columns)
+
+      self._reader = RbBatchedCsv.new(
+        infer_schema_length,
+        batch_size,
+        has_header,
+        ignore_errors,
+        n_rows,
+        skip_rows,
+        projection,
+        sep,
+        rechunk,
+        columns,
+        encoding,
+        n_threads,
+        path,
+        dtype_list,
+        dtype_slice,
+        low_memory,
+        comment_char,
+        quote_char,
+        processed_null_values,
+        parse_dates,
+        skip_rows_after_header,
+        Utils._prepare_row_count_args(row_count_name, row_count_offset),
+        sample_size,
+        eol_char
+      )
+      self.new_columns = new_columns
+    end
+
+    def next_batches(n)
+      batches = _reader.next_batches(n)
+      if !batches.nil?
+        if new_columns
+          batches.map { |df| Utils._update_columns(Utils.wrap_df(df), new_columns) }
+        else
+          batches.map { |df| Utils.wrap_df(df) }
+        end
+      else
+        nil
+      end
+    end
+  end
+end
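
BatchedCsvReader wraps the new RbBatchedCsv binding so large CSVs can be consumed in fixed-size chunks rather than read in one pass. A minimal usage sketch built from the interface above:

    reader = Polars::BatchedCsvReader.new("large.csv", batch_size: 10_000)

    # next_batches returns up to n DataFrames, or nil once the file is exhausted.
    while (batches = reader.next_batches(5))
      batches.each { |df| p df.shape }
    end
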