polars-df 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/Cargo.lock +1946 -0
- data/Cargo.toml +5 -0
- data/ext/polars/Cargo.toml +31 -1
- data/ext/polars/src/batched_csv.rs +120 -0
- data/ext/polars/src/conversion.rs +336 -42
- data/ext/polars/src/dataframe.rs +409 -4
- data/ext/polars/src/error.rs +9 -0
- data/ext/polars/src/file.rs +8 -7
- data/ext/polars/src/lazy/apply.rs +7 -0
- data/ext/polars/src/lazy/dataframe.rs +436 -10
- data/ext/polars/src/lazy/dsl.rs +1134 -5
- data/ext/polars/src/lazy/meta.rs +41 -0
- data/ext/polars/src/lazy/mod.rs +2 -0
- data/ext/polars/src/lib.rs +390 -3
- data/ext/polars/src/series.rs +175 -13
- data/lib/polars/batched_csv_reader.rb +95 -0
- data/lib/polars/cat_expr.rb +13 -0
- data/lib/polars/data_frame.rb +892 -21
- data/lib/polars/date_time_expr.rb +143 -0
- data/lib/polars/expr.rb +503 -0
- data/lib/polars/io.rb +342 -2
- data/lib/polars/lazy_frame.rb +338 -6
- data/lib/polars/lazy_functions.rb +158 -11
- data/lib/polars/list_expr.rb +108 -0
- data/lib/polars/meta_expr.rb +33 -0
- data/lib/polars/series.rb +1304 -14
- data/lib/polars/string_expr.rb +117 -0
- data/lib/polars/struct_expr.rb +27 -0
- data/lib/polars/utils.rb +60 -0
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +15 -1
- metadata +13 -2
    
        data/ext/polars/src/series.rs
    CHANGED
    
    | @@ -1,11 +1,12 @@ | |
| 1 | 
            -
            use crate::conversion::wrap;
         | 
| 2 | 
            -
            use crate::{RbDataFrame, RbPolarsErr, RbResult};
         | 
| 3 1 | 
             
            use magnus::exception::arg_error;
         | 
| 4 2 | 
             
            use magnus::{Error, RArray, Value};
         | 
| 5 3 | 
             
            use polars::prelude::*;
         | 
| 6 4 | 
             
            use polars::series::IsSorted;
         | 
| 7 5 | 
             
            use std::cell::RefCell;
         | 
| 8 6 |  | 
| 7 | 
            +
            use crate::conversion::*;
         | 
| 8 | 
            +
            use crate::{RbDataFrame, RbPolarsErr, RbResult, RbValueError};
         | 
| 9 | 
            +
             | 
| 9 10 | 
             
            #[magnus::wrap(class = "Polars::RbSeries")]
         | 
| 10 11 | 
             
            pub struct RbSeries {
         | 
| 11 12 | 
             
                pub series: RefCell<Series>,
         | 
| @@ -24,6 +25,14 @@ impl RbSeries { | |
| 24 25 | 
             
                    }
         | 
| 25 26 | 
             
                }
         | 
| 26 27 |  | 
| 28 | 
            +
                pub fn is_sorted_flag(&self) -> bool {
         | 
| 29 | 
            +
                    matches!(self.series.borrow().is_sorted(), IsSorted::Ascending)
         | 
| 30 | 
            +
                }
         | 
| 31 | 
            +
             | 
| 32 | 
            +
                pub fn is_sorted_reverse_flag(&self) -> bool {
         | 
| 33 | 
            +
                    matches!(self.series.borrow().is_sorted(), IsSorted::Descending)
         | 
| 34 | 
            +
                }
         | 
| 35 | 
            +
             | 
| 27 36 | 
             
                pub fn new_opt_bool(name: String, obj: RArray, strict: bool) -> RbResult<RbSeries> {
         | 
| 28 37 | 
             
                    let len = obj.len();
         | 
| 29 38 | 
             
                    let mut builder = BooleanChunkedBuilder::new(&name, len);
         | 
| @@ -107,11 +116,33 @@ init_method_opt!(new_opt_f32, Float32Type, f32); | |
| 107 116 | 
             
            init_method_opt!(new_opt_f64, Float64Type, f64);
         | 
| 108 117 |  | 
| 109 118 | 
             
            impl RbSeries {
         | 
| 110 | 
            -
                pub fn new_str(name: String, val:  | 
| 111 | 
            -
                    let  | 
| 112 | 
            -
                    let mut s = Utf8Chunked::new(&name, v).into_series();
         | 
| 119 | 
            +
                pub fn new_str(name: String, val: Wrap<Utf8Chunked>, _strict: bool) -> Self {
         | 
| 120 | 
            +
                    let mut s = val.0.into_series();
         | 
| 113 121 | 
             
                    s.rename(&name);
         | 
| 114 | 
            -
                     | 
| 122 | 
            +
                    RbSeries::new(s)
         | 
| 123 | 
            +
                }
         | 
| 124 | 
            +
             | 
| 125 | 
            +
                pub fn estimated_size(&self) -> usize {
         | 
| 126 | 
            +
                    self.series.borrow().estimated_size()
         | 
| 127 | 
            +
                }
         | 
| 128 | 
            +
             | 
| 129 | 
            +
                pub fn get_fmt(&self, index: usize, str_lengths: usize) -> String {
         | 
| 130 | 
            +
                    let val = format!("{}", self.series.borrow().get(index));
         | 
| 131 | 
            +
                    if let DataType::Utf8 | DataType::Categorical(_) = self.series.borrow().dtype() {
         | 
| 132 | 
            +
                        let v_trunc = &val[..val
         | 
| 133 | 
            +
                            .char_indices()
         | 
| 134 | 
            +
                            .take(str_lengths)
         | 
| 135 | 
            +
                            .last()
         | 
| 136 | 
            +
                            .map(|(i, c)| i + c.len_utf8())
         | 
| 137 | 
            +
                            .unwrap_or(0)];
         | 
| 138 | 
            +
                        if val == v_trunc {
         | 
| 139 | 
            +
                            val
         | 
| 140 | 
            +
                        } else {
         | 
| 141 | 
            +
                            format!("{}...", v_trunc)
         | 
| 142 | 
            +
                        }
         | 
| 143 | 
            +
                    } else {
         | 
| 144 | 
            +
                        val
         | 
| 145 | 
            +
                    }
         | 
| 115 146 | 
             
                }
         | 
| 116 147 |  | 
| 117 148 | 
             
                pub fn rechunk(&self, in_place: bool) -> Option<Self> {
         | 
| @@ -124,6 +155,10 @@ impl RbSeries { | |
| 124 155 | 
             
                    }
         | 
| 125 156 | 
             
                }
         | 
| 126 157 |  | 
| 158 | 
            +
                pub fn get_idx(&self, idx: usize) -> Value {
         | 
| 159 | 
            +
                    Wrap(self.series.borrow().get(idx)).into()
         | 
| 160 | 
            +
                }
         | 
| 161 | 
            +
             | 
| 127 162 | 
             
                pub fn bitand(&self, other: &RbSeries) -> RbResult<Self> {
         | 
| 128 163 | 
             
                    let out = self
         | 
| 129 164 | 
             
                        .series
         | 
| @@ -163,16 +198,16 @@ impl RbSeries { | |
| 163 198 | 
             
                    self.series.borrow_mut().rename(&name);
         | 
| 164 199 | 
             
                }
         | 
| 165 200 |  | 
| 166 | 
            -
                pub fn dtype(&self) ->  | 
| 167 | 
            -
                    self.series.borrow().dtype(). | 
| 201 | 
            +
                pub fn dtype(&self) -> Value {
         | 
| 202 | 
            +
                    Wrap(self.series.borrow().dtype().clone()).into()
         | 
| 168 203 | 
             
                }
         | 
| 169 204 |  | 
| 170 | 
            -
                pub fn inner_dtype(&self) -> Option< | 
| 205 | 
            +
                pub fn inner_dtype(&self) -> Option<Value> {
         | 
| 171 206 | 
             
                    self.series
         | 
| 172 207 | 
             
                        .borrow()
         | 
| 173 208 | 
             
                        .dtype()
         | 
| 174 209 | 
             
                        .inner_dtype()
         | 
| 175 | 
            -
                        .map(|dt| dt. | 
| 210 | 
            +
                        .map(|dt| Wrap(dt.clone()).into())
         | 
| 176 211 | 
             
                }
         | 
| 177 212 |  | 
| 178 213 | 
             
                pub fn set_sorted(&self, reverse: bool) -> Self {
         | 
| @@ -196,15 +231,15 @@ impl RbSeries { | |
| 196 231 | 
             
                }
         | 
| 197 232 |  | 
| 198 233 | 
             
                pub fn max(&self) -> Value {
         | 
| 199 | 
            -
                     | 
| 234 | 
            +
                    Wrap(self.series.borrow().max_as_series().get(0)).into()
         | 
| 200 235 | 
             
                }
         | 
| 201 236 |  | 
| 202 237 | 
             
                pub fn min(&self) -> Value {
         | 
| 203 | 
            -
                     | 
| 238 | 
            +
                    Wrap(self.series.borrow().min_as_series().get(0)).into()
         | 
| 204 239 | 
             
                }
         | 
| 205 240 |  | 
| 206 241 | 
             
                pub fn sum(&self) -> Value {
         | 
| 207 | 
            -
                     | 
| 242 | 
            +
                    Wrap(self.series.borrow().sum_as_series().get(0)).into()
         | 
| 208 243 | 
             
                }
         | 
| 209 244 |  | 
| 210 245 | 
             
                pub fn n_chunks(&self) -> usize {
         | 
| @@ -454,6 +489,111 @@ impl RbSeries { | |
| 454 489 | 
             
                    }
         | 
| 455 490 | 
             
                }
         | 
| 456 491 |  | 
| 492 | 
            +
                pub fn quantile(
         | 
| 493 | 
            +
                    &self,
         | 
| 494 | 
            +
                    quantile: f64,
         | 
| 495 | 
            +
                    interpolation: Wrap<QuantileInterpolOptions>,
         | 
| 496 | 
            +
                ) -> RbResult<Value> {
         | 
| 497 | 
            +
                    Ok(Wrap(
         | 
| 498 | 
            +
                        self.series
         | 
| 499 | 
            +
                            .borrow()
         | 
| 500 | 
            +
                            .quantile_as_series(quantile, interpolation.0)
         | 
| 501 | 
            +
                            .map_err(|_| RbValueError::new_err("invalid quantile".into()))?
         | 
| 502 | 
            +
                            .get(0),
         | 
| 503 | 
            +
                    )
         | 
| 504 | 
            +
                    .into())
         | 
| 505 | 
            +
                }
         | 
| 506 | 
            +
             | 
| 507 | 
            +
                pub fn clone(&self) -> Self {
         | 
| 508 | 
            +
                    RbSeries::new(self.series.borrow().clone())
         | 
| 509 | 
            +
                }
         | 
| 510 | 
            +
             | 
| 511 | 
            +
                pub fn zip_with(&self, mask: &RbSeries, other: &RbSeries) -> RbResult<Self> {
         | 
| 512 | 
            +
                    let binding = mask.series.borrow();
         | 
| 513 | 
            +
                    let mask = binding.bool().map_err(RbPolarsErr::from)?;
         | 
| 514 | 
            +
                    let s = self
         | 
| 515 | 
            +
                        .series
         | 
| 516 | 
            +
                        .borrow()
         | 
| 517 | 
            +
                        .zip_with(mask, &other.series.borrow())
         | 
| 518 | 
            +
                        .map_err(RbPolarsErr::from)?;
         | 
| 519 | 
            +
                    Ok(RbSeries::new(s))
         | 
| 520 | 
            +
                }
         | 
| 521 | 
            +
             | 
| 522 | 
            +
                pub fn to_dummies(&self) -> RbResult<RbDataFrame> {
         | 
| 523 | 
            +
                    let df = self
         | 
| 524 | 
            +
                        .series
         | 
| 525 | 
            +
                        .borrow()
         | 
| 526 | 
            +
                        .to_dummies()
         | 
| 527 | 
            +
                        .map_err(RbPolarsErr::from)?;
         | 
| 528 | 
            +
                    Ok(df.into())
         | 
| 529 | 
            +
                }
         | 
| 530 | 
            +
             | 
| 531 | 
            +
                /// Wraps the result of `Series::peak_max` as a new `RbSeries`.
                pub fn peak_max(&self) -> Self {
                    let series = self.series.borrow();
                    let peaks = series.peak_max();
                    peaks.into_series().into()
                }
         | 
| 534 | 
            +
             | 
| 535 | 
            +
                /// Wraps the result of `Series::peak_min` as a new `RbSeries`.
                pub fn peak_min(&self) -> Self {
                    let series = self.series.borrow();
                    let peaks = series.peak_min();
                    peaks.into_series().into()
                }
         | 
| 538 | 
            +
             | 
| 539 | 
            +
                pub fn n_unique(&self) -> RbResult<usize> {
         | 
| 540 | 
            +
                    let n = self.series.borrow().n_unique().map_err(RbPolarsErr::from)?;
         | 
| 541 | 
            +
                    Ok(n)
         | 
| 542 | 
            +
                }
         | 
| 543 | 
            +
             | 
| 544 | 
            +
                pub fn floor(&self) -> RbResult<Self> {
         | 
| 545 | 
            +
                    let s = self.series.borrow().floor().map_err(RbPolarsErr::from)?;
         | 
| 546 | 
            +
                    Ok(s.into())
         | 
| 547 | 
            +
                }
         | 
| 548 | 
            +
             | 
| 549 | 
            +
                pub fn shrink_to_fit(&self) {
         | 
| 550 | 
            +
                    self.series.borrow_mut().shrink_to_fit();
         | 
| 551 | 
            +
                }
         | 
| 552 | 
            +
             | 
| 553 | 
            +
                /// Returns the result of `Series::dot` between this series and `other`.
                pub fn dot(&self, other: &RbSeries) -> Option<f64> {
                    let lhs = self.series.borrow();
                    let rhs = other.series.borrow();
                    lhs.dot(&rhs)
                }
         | 
| 556 | 
            +
             | 
| 557 | 
            +
                pub fn skew(&self, bias: bool) -> RbResult<Option<f64>> {
         | 
| 558 | 
            +
                    let out = self.series.borrow().skew(bias).map_err(RbPolarsErr::from)?;
         | 
| 559 | 
            +
                    Ok(out)
         | 
| 560 | 
            +
                }
         | 
| 561 | 
            +
             | 
| 562 | 
            +
                pub fn kurtosis(&self, fisher: bool, bias: bool) -> RbResult<Option<f64>> {
         | 
| 563 | 
            +
                    let out = self
         | 
| 564 | 
            +
                        .series
         | 
| 565 | 
            +
                        .borrow()
         | 
| 566 | 
            +
                        .kurtosis(fisher, bias)
         | 
| 567 | 
            +
                        .map_err(RbPolarsErr::from)?;
         | 
| 568 | 
            +
                    Ok(out)
         | 
| 569 | 
            +
                }
         | 
| 570 | 
            +
             | 
| 571 | 
            +
                pub fn cast(&self, dtype: Wrap<DataType>, strict: bool) -> RbResult<Self> {
         | 
| 572 | 
            +
                    let dtype = dtype.0;
         | 
| 573 | 
            +
                    let out = if strict {
         | 
| 574 | 
            +
                        self.series.borrow().strict_cast(&dtype)
         | 
| 575 | 
            +
                    } else {
         | 
| 576 | 
            +
                        self.series.borrow().cast(&dtype)
         | 
| 577 | 
            +
                    };
         | 
| 578 | 
            +
                    let out = out.map_err(RbPolarsErr::from)?;
         | 
| 579 | 
            +
                    Ok(out.into())
         | 
| 580 | 
            +
                }
         | 
| 581 | 
            +
             | 
| 582 | 
            +
                pub fn time_unit(&self) -> Option<String> {
         | 
| 583 | 
            +
                    if let DataType::Datetime(tu, _) | DataType::Duration(tu) = self.series.borrow().dtype() {
         | 
| 584 | 
            +
                        Some(
         | 
| 585 | 
            +
                            match tu {
         | 
| 586 | 
            +
                                TimeUnit::Nanoseconds => "ns",
         | 
| 587 | 
            +
                                TimeUnit::Microseconds => "us",
         | 
| 588 | 
            +
                                TimeUnit::Milliseconds => "ms",
         | 
| 589 | 
            +
                            }
         | 
| 590 | 
            +
                            .to_string(),
         | 
| 591 | 
            +
                        )
         | 
| 592 | 
            +
                    } else {
         | 
| 593 | 
            +
                        None
         | 
| 594 | 
            +
                    }
         | 
| 595 | 
            +
                }
         | 
| 596 | 
            +
             | 
| 457 597 | 
             
                // dispatch dynamically in future?
         | 
| 458 598 |  | 
| 459 599 | 
             
                pub fn cumsum(&self, reverse: bool) -> Self {
         | 
| @@ -468,8 +608,30 @@ impl RbSeries { | |
| 468 608 | 
             
                    self.series.borrow().cummin(reverse).into()
         | 
| 469 609 | 
             
                }
         | 
| 470 610 |  | 
| 611 | 
            +
                /// Wraps the result of `Series::cumprod`, forwarding the `reverse` flag.
                pub fn cumprod(&self, reverse: bool) -> Self {
                    let series = self.series.borrow();
                    let product = series.cumprod(reverse);
                    product.into()
                }
         | 
| 614 | 
            +
             | 
| 471 615 | 
             
                /// Wraps the result of `Series::slice(offset, length)` as a new `RbSeries`.
                pub fn slice(&self, offset: i64, length: usize) -> Self {
                    self.series.borrow().slice(offset, length).into()
                }
         | 
| 619 | 
            +
             | 
| 620 | 
            +
                pub fn ceil(&self) -> RbResult<Self> {
         | 
| 621 | 
            +
                    let s = self.series.borrow().ceil().map_err(RbPolarsErr::from)?;
         | 
| 622 | 
            +
                    Ok(s.into())
         | 
| 623 | 
            +
                }
         | 
| 624 | 
            +
             | 
| 625 | 
            +
                pub fn round(&self, decimals: u32) -> RbResult<Self> {
         | 
| 626 | 
            +
                    let s = self
         | 
| 627 | 
            +
                        .series
         | 
| 628 | 
            +
                        .borrow()
         | 
| 629 | 
            +
                        .round(decimals)
         | 
| 630 | 
            +
                        .map_err(RbPolarsErr::from)?;
         | 
| 631 | 
            +
                    Ok(s.into())
         | 
| 632 | 
            +
                }
         | 
| 633 | 
            +
            }
         | 
| 634 | 
            +
             | 
| 635 | 
            +
            pub fn to_rbseries_collection(s: Vec<Series>) -> Vec<RbSeries> {
         | 
| 636 | 
            +
                s.into_iter().map(RbSeries::new).collect()
         | 
| 475 637 | 
             
            }
         | 
| @@ -0,0 +1,95 @@ | |
| 1 | 
            +
            module Polars
              # Thin Ruby wrapper around the native RbBatchedCsv reader.
              #
              # The constructor normalizes its keyword options and hands them to
              # RbBatchedCsv.new positionally; #next_batches wraps each native frame
              # with Utils.wrap_df.
              class BatchedCsvReader
                attr_accessor :_reader, :new_columns

                # Builds the native reader.
                #
                # @param file [String, Pathname] Path of the CSV file. Any other kind of
                #   value leaves the path passed to the native reader as nil.
                # @param dtypes [Hash, Array, nil] A Hash is turned into [name, dtype]
                #   pairs via Utils.rb_type_to_dtype; an Array is forwarded unchanged.
                # @param new_columns [Object, nil] When truthy, applied to every batch
                #   through Utils._update_columns in #next_batches.
                # @raise [ArgumentError] if dtypes is neither nil, a Hash nor an Array.
                #
                # The remaining keywords are forwarded to RbBatchedCsv.new as given
                # (null_values and columns pass through Utils helpers first).
                def initialize(
                  file,
                  has_header: true,
                  columns: nil,
                  sep: ",",
                  comment_char: nil,
                  quote_char: '"',
                  skip_rows: 0,
                  dtypes: nil,
                  null_values: nil,
                  ignore_errors: false,
                  parse_dates: false,
                  n_threads: nil,
                  infer_schema_length: 100,
                  batch_size: 50_000,
                  n_rows: nil,
                  encoding: "utf8",
                  low_memory: false,
                  rechunk: true,
                  skip_rows_after_header: 0,
                  row_count_name: nil,
                  row_count_offset: 0,
                  sample_size: 1024,
                  eol_char: "\n",
                  new_columns: nil
                )
                  # path stays nil unless file is a String or (when loaded) a Pathname.
                  path_like = file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
                  path = Utils.format_path(file) if path_like

                  dtype_list = nil
                  dtype_slice = nil
                  case dtypes
                  when nil
                    # No dtype overrides requested.
                  when Hash
                    dtype_list = dtypes.map { |name, type| [name, Utils.rb_type_to_dtype(type)] }
                  when Array
                    dtype_slice = dtypes
                  else
                    raise ArgumentError, "dtype arg should be list or dict"
                  end

                  processed_null_values = Utils._process_null_values(null_values)
                  projection, columns = Utils.handle_projection_columns(columns)
                  row_count = Utils._prepare_row_count_args(row_count_name, row_count_offset)

                  # Argument order is dictated by the native constructor.
                  self._reader = RbBatchedCsv.new(
                    infer_schema_length,
                    batch_size,
                    has_header,
                    ignore_errors,
                    n_rows,
                    skip_rows,
                    projection,
                    sep,
                    rechunk,
                    columns,
                    encoding,
                    n_threads,
                    path,
                    dtype_list,
                    dtype_slice,
                    low_memory,
                    comment_char,
                    quote_char,
                    processed_null_values,
                    parse_dates,
                    skip_rows_after_header,
                    row_count,
                    sample_size,
                    eol_char
                  )
                  self.new_columns = new_columns
                end

                # Asks the native reader for its next +n+ batches.
                #
                # @return [Array, nil] The batches wrapped with Utils.wrap_df (and
                #   passed through Utils._update_columns when new_columns is set), or
                #   nil when the native reader returns nil.
                def next_batches(n)
                  batches = _reader.next_batches(n)
                  return nil if batches.nil?

                  replacement = new_columns
                  batches.map do |df|
                    wrapped = Utils.wrap_df(df)
                    replacement ? Utils._update_columns(wrapped, replacement) : wrapped
                  end
                end
              end
            end
         |