polars-df 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
 use magnus::{RArray, RHash, Value};
+use polars::io::RowCount;
 use polars::lazy::frame::{LazyFrame, LazyGroupBy};
 use polars::prelude::*;
 use std::cell::RefCell;
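The new `RowCount` import carries an optional `(name, offset)` pair through each of the scan entry points added below; when set, polars prepends a row-index column to the scanned frame. A minimal sketch against the same-era API, mirroring the `ScanArgsParquet` literal later in this diff (the field values and `ParallelStrategy::Auto` are illustrative assumptions):

```rust
use polars::io::RowCount;
use polars::prelude::*;

fn scan_with_row_index(path: String) -> PolarsResult<LazyFrame> {
    let args = ScanArgsParquet {
        n_rows: None,
        cache: true,
        parallel: ParallelStrategy::Auto, // assumed default strategy
        rechunk: true,
        // Prepend a "row_nr" index column starting at 0.
        row_count: Some(RowCount { name: "row_nr".into(), offset: 0 }),
        low_memory: false,
    };
    LazyFrame::scan_parquet(path, args)
}
```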
@@ -52,6 +53,137 @@ impl From<LazyFrame> for RbLazyFrame {
 }
 
 impl RbLazyFrame {
+    pub fn new_from_ndjson(
+        path: String,
+        infer_schema_length: Option<usize>,
+        batch_size: Option<usize>,
+        n_rows: Option<usize>,
+        low_memory: bool,
+        rechunk: bool,
+        row_count: Option<(String, IdxSize)>,
+    ) -> RbResult<Self> {
+        let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+
+        let lf = LazyJsonLineReader::new(path)
+            .with_infer_schema_length(infer_schema_length)
+            .with_batch_size(batch_size)
+            .with_n_rows(n_rows)
+            .low_memory(low_memory)
+            .with_rechunk(rechunk)
+            .with_row_count(row_count)
+            .finish()
+            .map_err(RbPolarsErr::from)?;
+        Ok(lf.into())
+    }
+
+    pub fn new_from_csv(arguments: &[Value]) -> RbResult<Self> {
+        // start arguments
+        // this pattern is needed for more than 16 arguments
+        let path: String = arguments[0].try_convert()?;
+        let sep: String = arguments[1].try_convert()?;
+        let has_header: bool = arguments[2].try_convert()?;
+        let ignore_errors: bool = arguments[3].try_convert()?;
+        let skip_rows: usize = arguments[4].try_convert()?;
+        let n_rows: Option<usize> = arguments[5].try_convert()?;
+        let cache: bool = arguments[6].try_convert()?;
+        let overwrite_dtype: Option<Vec<(String, Wrap<DataType>)>> = arguments[7].try_convert()?;
+        let low_memory: bool = arguments[8].try_convert()?;
+        let comment_char: Option<String> = arguments[9].try_convert()?;
+        let quote_char: Option<String> = arguments[10].try_convert()?;
+        let null_values: Option<Wrap<NullValues>> = arguments[11].try_convert()?;
+        let infer_schema_length: Option<usize> = arguments[12].try_convert()?;
+        let with_schema_modify: Option<Value> = arguments[13].try_convert()?;
+        let rechunk: bool = arguments[14].try_convert()?;
+        let skip_rows_after_header: usize = arguments[15].try_convert()?;
+        let encoding: Wrap<CsvEncoding> = arguments[16].try_convert()?;
+        let row_count: Option<(String, IdxSize)> = arguments[17].try_convert()?;
+        let parse_dates: bool = arguments[18].try_convert()?;
+        let eol_char: String = arguments[19].try_convert()?;
+        // end arguments
+
+        let null_values = null_values.map(|w| w.0);
+        let comment_char = comment_char.map(|s| s.as_bytes()[0]);
+        let quote_char = quote_char.map(|s| s.as_bytes()[0]);
+        let delimiter = sep.as_bytes()[0];
+        let eol_char = eol_char.as_bytes()[0];
+
+        let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+
+        let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
+            let fields = overwrite_dtype
+                .into_iter()
+                .map(|(name, dtype)| Field::new(&name, dtype.0));
+            Schema::from(fields)
+        });
+        let r = LazyCsvReader::new(path)
+            .with_infer_schema_length(infer_schema_length)
+            .with_delimiter(delimiter)
+            .has_header(has_header)
+            .with_ignore_parser_errors(ignore_errors)
+            .with_skip_rows(skip_rows)
+            .with_n_rows(n_rows)
+            .with_cache(cache)
+            .with_dtype_overwrite(overwrite_dtype.as_ref())
+            .low_memory(low_memory)
+            .with_comment_char(comment_char)
+            .with_quote_char(quote_char)
+            .with_end_of_line_char(eol_char)
+            .with_rechunk(rechunk)
+            .with_skip_rows_after_header(skip_rows_after_header)
+            .with_encoding(encoding.0)
+            .with_row_count(row_count)
+            .with_parse_dates(parse_dates)
+            .with_null_values(null_values);
+
+        if let Some(_lambda) = with_schema_modify {
+            todo!();
+        }
+
+        Ok(r.finish().map_err(RbPolarsErr::from)?.into())
+    }
+
+    pub fn new_from_parquet(
+        path: String,
+        n_rows: Option<usize>,
+        cache: bool,
+        parallel: Wrap<ParallelStrategy>,
+        rechunk: bool,
+        row_count: Option<(String, IdxSize)>,
+        low_memory: bool,
+    ) -> RbResult<Self> {
+        let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+        let args = ScanArgsParquet {
+            n_rows,
+            cache,
+            parallel: parallel.0,
+            rechunk,
+            row_count,
+            low_memory,
+        };
+        let lf = LazyFrame::scan_parquet(path, args).map_err(RbPolarsErr::from)?;
+        Ok(lf.into())
+    }
+
+    pub fn new_from_ipc(
+        path: String,
+        n_rows: Option<usize>,
+        cache: bool,
+        rechunk: bool,
+        row_count: Option<(String, IdxSize)>,
+        memory_map: bool,
+    ) -> RbResult<Self> {
+        let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+        let args = ScanArgsIpc {
+            n_rows,
+            cache,
+            rechunk,
+            row_count,
+            memmap: memory_map,
+        };
+        let lf = LazyFrame::scan_ipc(path, args).map_err(RbPolarsErr::from)?;
+        Ok(lf.into())
+    }
+
     pub fn write_json(&self, rb_f: Value) -> RbResult<()> {
         let file = BufWriter::new(get_file_like(rb_f, true)?);
         serde_json::to_writer(file, &self.ldf.logical_plan)
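Unlike the other constructors, `new_from_csv` takes a raw `&[Value]` slice: magnus's `function!` macro only supports a bounded number of typed parameters (the comment above points at 16), so wider signatures are registered with arity `-1` and converted argument by argument, exactly as `lib.rs` does below with `function!(RbLazyFrame::new_from_csv, -1)`. A stripped-down sketch of the pattern (the `configure` binding is hypothetical, not part of this gem):

```rust
use magnus::{function, prelude::*, Error, RModule, Value};

// Hypothetical variadic binding: every Ruby argument arrives in one slice
// and is converted explicitly, positional index by positional index.
fn configure(args: &[Value]) -> Result<String, Error> {
    let name: String = args[0].try_convert()?;
    let retries: usize = args[1].try_convert()?;
    let verbose: bool = args[2].try_convert()?;
    Ok(format!("{name}: retries={retries}, verbose={verbose}"))
}

fn register(module: RModule) -> Result<(), Error> {
    // Arity -1 tells magnus to collect all arguments into the &[Value] slice.
    module.define_singleton_method("configure", function!(configure, -1))?;
    Ok(())
}
```

The caller must pass the options positionally, so the index order on the Rust side has to match the Ruby wrapper exactly.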
@@ -6,6 +6,7 @@ use polars::prelude::*;
 use polars::series::ops::NullBehavior;
 
 use crate::conversion::*;
+use crate::lazy::apply::*;
 use crate::lazy::utils::rb_exprs_to_exprs;
 use crate::RbResult;
 
@@ -901,6 +902,10 @@ impl RbExpr {
         self.inner.clone().suffix(&suffix).into()
     }
 
+    pub fn exclude(&self, columns: Vec<String>) -> Self {
+        self.inner.clone().exclude(columns).into()
+    }
+
     pub fn interpolate(&self) -> Self {
         self.inner.clone().interpolate().into()
     }
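`exclude` complements wildcard and multi-column selections: it removes the named columns from whatever the expression matched. A one-liner sketch of the underlying Polars expression (column names are illustrative):

```rust
use polars::lazy::dsl::col;
use polars::prelude::*;

// Select everything except the listed columns, e.g. via lf.select(...).
fn all_but_ids() -> Expr {
    col("*").exclude(vec!["id".to_string(), "temp_id".to_string()])
}
```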
@@ -1333,6 +1338,29 @@ pub fn col(name: String) -> RbExpr {
     dsl::col(&name).into()
 }
 
+pub fn count() -> RbExpr {
+    dsl::count().into()
+}
+
+pub fn first() -> RbExpr {
+    dsl::first().into()
+}
+
+pub fn last() -> RbExpr {
+    dsl::last().into()
+}
+
+pub fn cols(names: Vec<String>) -> RbExpr {
+    dsl::cols(names).into()
+}
+
+pub fn fold(acc: &RbExpr, lambda: Value, exprs: RArray) -> RbResult<RbExpr> {
+    let exprs = rb_exprs_to_exprs(exprs)?;
+
+    let func = move |a: Series, b: Series| binary_lambda(lambda, a, b);
+    Ok(polars::lazy::dsl::fold_exprs(acc.inner.clone(), func, exprs).into())
+}
+
 // TODO improve
 pub fn lit(value: Value) -> RbResult<RbExpr> {
     if value.is_nil() {
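Besides the simple constructors (`count`, `first`, `last`, `cols`), this adds `fold`: the Ruby lambda is adapted by `binary_lambda` from the new `lazy::apply` module and threaded through Polars' `fold_exprs`, which reduces the expression list with a binary accumulator. A pure-Rust sketch of the same primitive, assuming the 0.25-era API, with an illustrative horizontal sum over made-up columns:

```rust
use polars::lazy::dsl::{col, fold_exprs, lit};
use polars::prelude::*;

fn horizontal_sum() -> PolarsResult<DataFrame> {
    let df = df!["a" => [1, 2], "b" => [10, 20]]?;
    // Start the accumulator at 0, then add each column in turn: per-row a + b.
    df.lazy()
        .select([
            fold_exprs(lit(0), |acc, x| Ok(&acc + &x), vec![col("a"), col("b")]).alias("sum"),
        ])
        .collect()
}
```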
@@ -1387,3 +1415,13 @@ impl RbWhenThen {
 pub fn when(predicate: &RbExpr) -> RbWhen {
     dsl::when(predicate.inner.clone()).into()
 }
+
+pub fn concat_str(s: RArray, sep: String) -> RbResult<RbExpr> {
+    let s = rb_exprs_to_exprs(s)?;
+    Ok(dsl::concat_str(s, &sep).into())
+}
+
+pub fn concat_lst(s: RArray) -> RbResult<RbExpr> {
+    let s = rb_exprs_to_exprs(s)?;
+    Ok(dsl::concat_lst(s).into())
+}
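`concat_str` and `concat_lst` fold several expressions into one column: row-wise string joining with a separator, and row-wise packing into a list column, respectively. A sketch of the Rust expressions being exposed (column names are illustrative):

```rust
use polars::lazy::dsl::{col, concat_lst, concat_str};
use polars::prelude::*;

fn combined_exprs() -> (Expr, Expr) {
    // "first" + " " + "last" per row, as a single Utf8 column.
    let full = concat_str(vec![col("first"), col("last")], " ").alias("full_name");
    // Pack the two columns into one List column per row.
    let pair = concat_lst(vec![col("x"), col("y")]).alias("pair");
    (full, pair)
}
```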
@@ -7,7 +7,7 @@ impl RbExpr {
             .meta()
             .pop()
             .into_iter()
-            .map(|v| RbExpr::from(v))
+            .map(RbExpr::from)
             .collect()
     }
 
@@ -1,3 +1,4 @@
+pub mod apply;
 pub mod dataframe;
 pub mod dsl;
 pub mod meta;
@@ -1,3 +1,4 @@
+mod batched_csv;
 mod conversion;
 mod dataframe;
 mod error;
@@ -5,14 +6,18 @@ mod file;
 mod lazy;
 mod series;
 
-use conversion::get_df;
+use batched_csv::RbBatchedCsv;
+use conversion::*;
 use dataframe::RbDataFrame;
 use error::{RbPolarsErr, RbValueError};
+use file::get_file_like;
 use lazy::dataframe::{RbLazyFrame, RbLazyGroupBy};
 use lazy::dsl::{RbExpr, RbWhen, RbWhenThen};
 use magnus::{
-    define_module, function, memoize, method, prelude::*, Error, RArray, RClass, RModule,
+    define_module, function, memoize, method, prelude::*, Error, RArray, RClass, RHash, RModule,
+    Value,
 };
+use polars::datatypes::DataType;
 use polars::error::PolarsResult;
 use polars::frame::DataFrame;
 use polars::functions::{diag_concat_df, hor_concat_df};
@@ -34,11 +39,19 @@ fn init() -> RbResult<()> {
     module.define_singleton_method("_concat_df", function!(concat_df, 1))?;
     module.define_singleton_method("_diag_concat_df", function!(rb_diag_concat_df, 1))?;
     module.define_singleton_method("_hor_concat_df", function!(rb_hor_concat_df, 1))?;
+    module.define_singleton_method("_concat_series", function!(concat_series, 1))?;
+    module.define_singleton_method("_ipc_schema", function!(ipc_schema, 1))?;
+    module.define_singleton_method("_parquet_schema", function!(parquet_schema, 1))?;
+
+    let class = module.define_class("RbBatchedCsv", Default::default())?;
+    class.define_singleton_method("new", function!(RbBatchedCsv::new, -1))?;
+    class.define_method("next_batches", method!(RbBatchedCsv::next_batches, 1))?;
 
     let class = module.define_class("RbDataFrame", Default::default())?;
     class.define_singleton_method("new", function!(RbDataFrame::init, 1))?;
-    class.define_singleton_method("read_csv", function!(RbDataFrame::read_csv, 2))?;
+    class.define_singleton_method("read_csv", function!(RbDataFrame::read_csv, -1))?;
     class.define_singleton_method("read_parquet", function!(RbDataFrame::read_parquet, 1))?;
+    class.define_singleton_method("read_ipc", function!(RbDataFrame::read_ipc, 6))?;
     class.define_singleton_method("read_hash", function!(RbDataFrame::read_hash, 1))?;
     class.define_singleton_method("read_json", function!(RbDataFrame::read_json, 1))?;
     class.define_singleton_method("read_ndjson", function!(RbDataFrame::read_ndjson, 1))?;
@@ -46,6 +59,7 @@ fn init() -> RbResult<()> {
     class.define_method("write_json", method!(RbDataFrame::write_json, 3))?;
     class.define_method("write_ndjson", method!(RbDataFrame::write_ndjson, 1))?;
     class.define_method("write_csv", method!(RbDataFrame::write_csv, 10))?;
+    class.define_method("write_ipc", method!(RbDataFrame::write_ipc, 2))?;
     class.define_method("write_parquet", method!(RbDataFrame::write_parquet, 5))?;
     class.define_method("rechunk", method!(RbDataFrame::rechunk, 0))?;
     class.define_method("to_s", method!(RbDataFrame::to_s, 0))?;
@@ -294,6 +308,7 @@ fn init() -> RbResult<()> {
     class.define_method("keep_name", method!(RbExpr::keep_name, 0))?;
     class.define_method("prefix", method!(RbExpr::prefix, 1))?;
     class.define_method("suffix", method!(RbExpr::suffix, 1))?;
+    class.define_method("exclude", method!(RbExpr::exclude, 1))?;
     class.define_method("interpolate", method!(RbExpr::interpolate, 0))?;
     class.define_method("rolling_sum", method!(RbExpr::rolling_sum, 6))?;
     class.define_method("rolling_min", method!(RbExpr::rolling_min, 6))?;
@@ -364,11 +379,28 @@ fn init() -> RbResult<()> {
 
     // maybe add to different class
     class.define_singleton_method("col", function!(crate::lazy::dsl::col, 1))?;
+    class.define_singleton_method("count", function!(crate::lazy::dsl::count, 0))?;
+    class.define_singleton_method("first", function!(crate::lazy::dsl::first, 0))?;
+    class.define_singleton_method("last", function!(crate::lazy::dsl::last, 0))?;
+    class.define_singleton_method("cols", function!(crate::lazy::dsl::cols, 1))?;
+    class.define_singleton_method("fold", function!(crate::lazy::dsl::fold, 3))?;
     class.define_singleton_method("lit", function!(crate::lazy::dsl::lit, 1))?;
     class.define_singleton_method("arange", function!(crate::lazy::dsl::arange, 3))?;
     class.define_singleton_method("when", function!(crate::lazy::dsl::when, 1))?;
+    class.define_singleton_method("concat_str", function!(crate::lazy::dsl::concat_str, 2))?;
+    class.define_singleton_method("concat_lst", function!(crate::lazy::dsl::concat_lst, 1))?;
 
     let class = module.define_class("RbLazyFrame", Default::default())?;
+    class.define_singleton_method(
+        "new_from_ndjson",
+        function!(RbLazyFrame::new_from_ndjson, 7),
+    )?;
+    class.define_singleton_method("new_from_csv", function!(RbLazyFrame::new_from_csv, -1))?;
+    class.define_singleton_method(
+        "new_from_parquet",
+        function!(RbLazyFrame::new_from_parquet, 7),
+    )?;
+    class.define_singleton_method("new_from_ipc", function!(RbLazyFrame::new_from_ipc, 6))?;
     class.define_method("write_json", method!(RbLazyFrame::write_json, 1))?;
     class.define_method("describe_plan", method!(RbLazyFrame::describe_plan, 0))?;
     class.define_method(
@@ -567,3 +599,45 @@ fn rb_hor_concat_df(seq: RArray) -> RbResult<RbDataFrame> {
     let df = hor_concat_df(&dfs).map_err(RbPolarsErr::from)?;
     Ok(df.into())
 }
+
+fn concat_series(seq: RArray) -> RbResult<RbSeries> {
+    let mut iter = seq.each();
+    let first = iter.next().unwrap()?;
+
+    let mut s = get_series(first)?;
+
+    for res in iter {
+        let item = res?;
+        let item = get_series(item)?;
+        s.append(&item).map_err(RbPolarsErr::from)?;
+    }
+    Ok(s.into())
+}
+
+fn ipc_schema(rb_f: Value) -> RbResult<Value> {
+    use polars::export::arrow::io::ipc::read::read_file_metadata;
+    let mut r = get_file_like(rb_f, false)?;
+    let metadata = read_file_metadata(&mut r).map_err(RbPolarsErr::arrow)?;
+
+    let dict = RHash::new();
+    for field in metadata.schema.fields {
+        let dt: Wrap<DataType> = Wrap((&field.data_type).into());
+        dict.aset(field.name, dt)?;
+    }
+    Ok(dict.into())
+}
+
+fn parquet_schema(rb_f: Value) -> RbResult<Value> {
+    use polars::export::arrow::io::parquet::read::{infer_schema, read_metadata};
+
+    let mut r = get_file_like(rb_f, false)?;
+    let metadata = read_metadata(&mut r).map_err(RbPolarsErr::arrow)?;
+    let arrow_schema = infer_schema(&metadata).map_err(RbPolarsErr::arrow)?;
+
+    let dict = RHash::new();
+    for field in arrow_schema.fields {
+        let dt: Wrap<DataType> = Wrap((&field.data_type).into());
+        dict.aset(field.name, dt)?;
+    }
+    Ok(dict.into())
+}
@@ -116,11 +116,10 @@ init_method_opt!(new_opt_f32, Float32Type, f32);
 init_method_opt!(new_opt_f64, Float64Type, f64);
 
 impl RbSeries {
-    pub fn new_str(name: String, val: RArray, _strict: bool) -> RbResult<Self> {
-        let v = val.try_convert::<Vec<Option<String>>>()?;
-        let mut s = Utf8Chunked::new(&name, v).into_series();
+    pub fn new_str(name: String, val: Wrap<Utf8Chunked>, _strict: bool) -> Self {
+        let mut s = val.0.into_series();
         s.rename(&name);
-        Ok(RbSeries::new(s))
+        RbSeries::new(s)
     }
 
     pub fn estimated_size(&self) -> usize {
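`new_str` now receives an already-built `Utf8Chunked` through the `Wrap` conversion layer instead of converting the `RArray` into `Vec<Option<String>>` itself, which lets the conversion code construct the chunked array directly while walking the Ruby array (a plausible reading of the change; the diff does not show the `TryConvert` side). For contrast, the removed construction path as plain Rust:

```rust
use polars::prelude::*;

// Old path: materialize a Vec<Option<String>> first, then hand the whole
// buffer to Utf8Chunked; None entries become nulls in the resulting Series.
fn utf8_series(name: &str, vals: Vec<Option<String>>) -> Series {
    Utf8Chunked::new(name, vals).into_series()
}
```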
@@ -199,16 +198,16 @@ impl RbSeries {
         self.series.borrow_mut().rename(&name);
     }
 
-    pub fn dtype(&self) -> String {
-        self.series.borrow().dtype().to_string()
+    pub fn dtype(&self) -> Value {
+        Wrap(self.series.borrow().dtype().clone()).into()
     }
 
-    pub fn inner_dtype(&self) -> Option<String> {
+    pub fn inner_dtype(&self) -> Option<Value> {
         self.series
             .borrow()
             .dtype()
             .inner_dtype()
-            .map(|dt| dt.to_string())
+            .map(|dt| Wrap(dt.clone()).into())
     }
 
     pub fn set_sorted(&self, reverse: bool) -> Self {
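Returning `Value` instead of `String` means Ruby gets a real datatype object. The `Wrap` newtype that makes this possible is the standard orphan-rule workaround: `DataType` belongs to polars and `Value` to magnus, so neither crate can host the conversion, but a local wrapper can. A stripped-down sketch with an illustrative symbol mapping (the real `conversion.rs` mapping is richer):

```rust
use magnus::{Symbol, Value};
use polars::prelude::DataType;

// Local newtype: the orphan rule forbids `impl From<DataType> for Value`
// directly, since both types are foreign to this crate.
pub struct Wrap<T>(pub T);

impl From<Wrap<DataType>> for Value {
    fn from(w: Wrap<DataType>) -> Self {
        // Illustrative strategy: expose the dtype to Ruby as a symbol.
        let name = match w.0 {
            DataType::Int64 => "i64",
            DataType::Float64 => "f64",
            DataType::Utf8 => "str",
            _ => "object",
        };
        Symbol::new(name).into()
    }
}
```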
@@ -634,5 +633,5 @@ impl RbSeries {
 }
 
 pub fn to_rbseries_collection(s: Vec<Series>) -> Vec<RbSeries> {
-    s.into_iter().map(|v| RbSeries::new(v)).collect()
+    s.into_iter().map(RbSeries::new).collect()
 }
@@ -0,0 +1,95 @@
+module Polars
+  class BatchedCsvReader
+    attr_accessor :_reader, :new_columns
+
+    def initialize(
+      file,
+      has_header: true,
+      columns: nil,
+      sep: ",",
+      comment_char: nil,
+      quote_char: '"',
+      skip_rows: 0,
+      dtypes: nil,
+      null_values: nil,
+      ignore_errors: false,
+      parse_dates: false,
+      n_threads: nil,
+      infer_schema_length: 100,
+      batch_size: 50_000,
+      n_rows: nil,
+      encoding: "utf8",
+      low_memory: false,
+      rechunk: true,
+      skip_rows_after_header: 0,
+      row_count_name: nil,
+      row_count_offset: 0,
+      sample_size: 1024,
+      eol_char: "\n",
+      new_columns: nil
+    )
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        path = Utils.format_path(file)
+      end
+
+      dtype_list = nil
+      dtype_slice = nil
+      if !dtypes.nil?
+        if dtypes.is_a?(Hash)
+          dtype_list = []
+          dtypes.each do |k, v|
+            dtype_list << [k, Utils.rb_type_to_dtype(v)]
+          end
+        elsif dtypes.is_a?(Array)
+          dtype_slice = dtypes
+        else
+          raise ArgumentError, "dtype arg should be list or dict"
+        end
+      end
+
+      processed_null_values = Utils._process_null_values(null_values)
+      projection, columns = Utils.handle_projection_columns(columns)
+
+      self._reader = RbBatchedCsv.new(
+        infer_schema_length,
+        batch_size,
+        has_header,
+        ignore_errors,
+        n_rows,
+        skip_rows,
+        projection,
+        sep,
+        rechunk,
+        columns,
+        encoding,
+        n_threads,
+        path,
+        dtype_list,
+        dtype_slice,
+        low_memory,
+        comment_char,
+        quote_char,
+        processed_null_values,
+        parse_dates,
+        skip_rows_after_header,
+        Utils._prepare_row_count_args(row_count_name, row_count_offset),
+        sample_size,
+        eol_char
+      )
+      self.new_columns = new_columns
+    end
+
+    def next_batches(n)
+      batches = _reader.next_batches(n)
+      if !batches.nil?
+        if new_columns
+          batches.map { |df| Utils._update_columns(Utils.wrap_df(df), new_columns) }
+        else
+          batches.map { |df| Utils.wrap_df(df) }
+        end
+      else
+        nil
+      end
+    end
+  end
+end