polars-df 0.1.1 → 0.1.2

@@ -1,4 +1,5 @@
 use magnus::{RArray, RHash, Value};
+use polars::io::RowCount;
 use polars::lazy::frame::{LazyFrame, LazyGroupBy};
 use polars::prelude::*;
 use std::cell::RefCell;
@@ -52,6 +53,137 @@ impl From<LazyFrame> for RbLazyFrame {
 }
 
 impl RbLazyFrame {
+    pub fn new_from_ndjson(
+        path: String,
+        infer_schema_length: Option<usize>,
+        batch_size: Option<usize>,
+        n_rows: Option<usize>,
+        low_memory: bool,
+        rechunk: bool,
+        row_count: Option<(String, IdxSize)>,
+    ) -> RbResult<Self> {
+        let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+
+        let lf = LazyJsonLineReader::new(path)
+            .with_infer_schema_length(infer_schema_length)
+            .with_batch_size(batch_size)
+            .with_n_rows(n_rows)
+            .low_memory(low_memory)
+            .with_rechunk(rechunk)
+            .with_row_count(row_count)
+            .finish()
+            .map_err(RbPolarsErr::from)?;
+        Ok(lf.into())
+    }
+
+    pub fn new_from_csv(arguments: &[Value]) -> RbResult<Self> {
+        // start arguments
+        // this pattern is needed for more than 16 arguments
+        let path: String = arguments[0].try_convert()?;
+        let sep: String = arguments[1].try_convert()?;
+        let has_header: bool = arguments[2].try_convert()?;
+        let ignore_errors: bool = arguments[3].try_convert()?;
+        let skip_rows: usize = arguments[4].try_convert()?;
+        let n_rows: Option<usize> = arguments[5].try_convert()?;
+        let cache: bool = arguments[6].try_convert()?;
+        let overwrite_dtype: Option<Vec<(String, Wrap<DataType>)>> = arguments[7].try_convert()?;
+        let low_memory: bool = arguments[8].try_convert()?;
+        let comment_char: Option<String> = arguments[9].try_convert()?;
+        let quote_char: Option<String> = arguments[10].try_convert()?;
+        let null_values: Option<Wrap<NullValues>> = arguments[11].try_convert()?;
+        let infer_schema_length: Option<usize> = arguments[12].try_convert()?;
+        let with_schema_modify: Option<Value> = arguments[13].try_convert()?;
+        let rechunk: bool = arguments[14].try_convert()?;
+        let skip_rows_after_header: usize = arguments[15].try_convert()?;
+        let encoding: Wrap<CsvEncoding> = arguments[16].try_convert()?;
+        let row_count: Option<(String, IdxSize)> = arguments[17].try_convert()?;
+        let parse_dates: bool = arguments[18].try_convert()?;
+        let eol_char: String = arguments[19].try_convert()?;
+        // end arguments
+
+        let null_values = null_values.map(|w| w.0);
+        let comment_char = comment_char.map(|s| s.as_bytes()[0]);
+        let quote_char = quote_char.map(|s| s.as_bytes()[0]);
+        let delimiter = sep.as_bytes()[0];
+        let eol_char = eol_char.as_bytes()[0];
+
+        let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+
+        let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
+            let fields = overwrite_dtype
+                .into_iter()
+                .map(|(name, dtype)| Field::new(&name, dtype.0));
+            Schema::from(fields)
+        });
+        let r = LazyCsvReader::new(path)
+            .with_infer_schema_length(infer_schema_length)
+            .with_delimiter(delimiter)
+            .has_header(has_header)
+            .with_ignore_parser_errors(ignore_errors)
+            .with_skip_rows(skip_rows)
+            .with_n_rows(n_rows)
+            .with_cache(cache)
+            .with_dtype_overwrite(overwrite_dtype.as_ref())
+            .low_memory(low_memory)
+            .with_comment_char(comment_char)
+            .with_quote_char(quote_char)
+            .with_end_of_line_char(eol_char)
+            .with_rechunk(rechunk)
+            .with_skip_rows_after_header(skip_rows_after_header)
+            .with_encoding(encoding.0)
+            .with_row_count(row_count)
+            .with_parse_dates(parse_dates)
+            .with_null_values(null_values);
+
+        if let Some(_lambda) = with_schema_modify {
+            todo!();
+        }
+
+        Ok(r.finish().map_err(RbPolarsErr::from)?.into())
+    }
+
+    pub fn new_from_parquet(
+        path: String,
+        n_rows: Option<usize>,
+        cache: bool,
+        parallel: Wrap<ParallelStrategy>,
+        rechunk: bool,
+        row_count: Option<(String, IdxSize)>,
+        low_memory: bool,
+    ) -> RbResult<Self> {
+        let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+        let args = ScanArgsParquet {
+            n_rows,
+            cache,
+            parallel: parallel.0,
+            rechunk,
+            row_count,
+            low_memory,
+        };
+        let lf = LazyFrame::scan_parquet(path, args).map_err(RbPolarsErr::from)?;
+        Ok(lf.into())
+    }
+
+    pub fn new_from_ipc(
+        path: String,
+        n_rows: Option<usize>,
+        cache: bool,
+        rechunk: bool,
+        row_count: Option<(String, IdxSize)>,
+        memory_map: bool,
+    ) -> RbResult<Self> {
+        let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
+        let args = ScanArgsIpc {
+            n_rows,
+            cache,
+            rechunk,
+            row_count,
+            memmap: memory_map,
+        };
+        let lf = LazyFrame::scan_ipc(path, args).map_err(RbPolarsErr::from)?;
+        Ok(lf.into())
+    }
+
     pub fn write_json(&self, rb_f: Value) -> RbResult<()> {
         let file = BufWriter::new(get_file_like(rb_f, true)?);
         serde_json::to_writer(file, &self.ldf.logical_plan)
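
The four constructors above back the lazy scan entry points. A minimal sketch of how they are reached from Ruby, assuming the gem wires them up through scan_csv-style wrappers as py-polars does (the wrapper names are an assumption):

    # Build a lazy query against a CSV; nothing is read until collect.
    lf = Polars.scan_csv("data.csv")   # assumed to call RbLazyFrame.new_from_csv
    df = lf.filter(Polars.col("value") > 10)
           .select([Polars.col("id"), Polars.col("value")])
           .collect
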
@@ -6,6 +6,7 @@ use polars::prelude::*;
 use polars::series::ops::NullBehavior;
 
 use crate::conversion::*;
+use crate::lazy::apply::*;
 use crate::lazy::utils::rb_exprs_to_exprs;
 use crate::RbResult;
 
@@ -901,6 +902,10 @@ impl RbExpr {
         self.inner.clone().suffix(&suffix).into()
     }
 
+    pub fn exclude(&self, columns: Vec<String>) -> Self {
+        self.inner.clone().exclude(columns).into()
+    }
+
     pub fn interpolate(&self) -> Self {
         self.inner.clone().interpolate().into()
     }
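
exclude is the complement of column selection: it drops the named columns from a wildcard or multi-column expression. A hedged Ruby-side sketch (assumes an Expr#exclude wrapper over the binding registered later in this diff):

    # Select every column except "id".
    df.lazy.select([Polars.col("*").exclude(["id"])]).collect
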
@@ -1333,6 +1338,29 @@ pub fn col(name: String) -> RbExpr {
     dsl::col(&name).into()
 }
 
+pub fn count() -> RbExpr {
+    dsl::count().into()
+}
+
+pub fn first() -> RbExpr {
+    dsl::first().into()
+}
+
+pub fn last() -> RbExpr {
+    dsl::last().into()
+}
+
+pub fn cols(names: Vec<String>) -> RbExpr {
+    dsl::cols(names).into()
+}
+
+pub fn fold(acc: &RbExpr, lambda: Value, exprs: RArray) -> RbResult<RbExpr> {
+    let exprs = rb_exprs_to_exprs(exprs)?;
+
+    let func = move |a: Series, b: Series| binary_lambda(lambda, a, b);
+    Ok(polars::lazy::dsl::fold_exprs(acc.inner.clone(), func, exprs).into())
+}
+
 // TODO improve
 pub fn lit(value: Value) -> RbResult<RbExpr> {
     if value.is_nil() {
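
count, first, last, and cols are thin wrappers over the polars dsl, while fold threads a Ruby lambda through binary_lambda to reduce across columns row-wise. A sketch of plausible Ruby usage (wrapper names assumed from the singleton-method registrations later in this diff):

    # Horizontal sum of two columns via fold, starting from a literal 0.
    df.lazy.select([
      Polars.fold(Polars.lit(0), ->(acc, x) { acc + x }, [Polars.col("a"), Polars.col("b")])
    ]).collect

    # Number of rows in the frame.
    df.lazy.select([Polars.count]).collect
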
@@ -1387,3 +1415,13 @@ impl RbWhenThen {
 pub fn when(predicate: &RbExpr) -> RbWhen {
     dsl::when(predicate.inner.clone()).into()
 }
+
+pub fn concat_str(s: RArray, sep: String) -> RbResult<RbExpr> {
+    let s = rb_exprs_to_exprs(s)?;
+    Ok(dsl::concat_str(s, &sep).into())
+}
+
+pub fn concat_lst(s: RArray) -> RbResult<RbExpr> {
+    let s = rb_exprs_to_exprs(s)?;
+    Ok(dsl::concat_lst(s).into())
+}
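
concat_str joins string columns element-wise with a separator, and concat_lst packs columns into a single list column. A sketch, assuming the Ruby wrappers mirror these signatures:

    # One "foo-bar" string per row from columns "a" and "b".
    df.lazy.select([Polars.concat_str([Polars.col("a"), Polars.col("b")], "-")]).collect
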
@@ -7,7 +7,7 @@ impl RbExpr {
             .meta()
             .pop()
             .into_iter()
-            .map(|v| RbExpr::from(v))
+            .map(RbExpr::from)
             .collect()
     }
 
@@ -1,3 +1,4 @@
+pub mod apply;
 pub mod dataframe;
 pub mod dsl;
 pub mod meta;
@@ -1,3 +1,4 @@
+mod batched_csv;
 mod conversion;
 mod dataframe;
 mod error;
@@ -5,14 +6,18 @@ mod file;
 mod lazy;
 mod series;
 
-use conversion::get_df;
+use batched_csv::RbBatchedCsv;
+use conversion::*;
 use dataframe::RbDataFrame;
 use error::{RbPolarsErr, RbValueError};
+use file::get_file_like;
 use lazy::dataframe::{RbLazyFrame, RbLazyGroupBy};
 use lazy::dsl::{RbExpr, RbWhen, RbWhenThen};
 use magnus::{
-    define_module, function, memoize, method, prelude::*, Error, RArray, RClass, RModule,
+    define_module, function, memoize, method, prelude::*, Error, RArray, RClass, RHash, RModule,
+    Value,
 };
+use polars::datatypes::DataType;
 use polars::error::PolarsResult;
 use polars::frame::DataFrame;
 use polars::functions::{diag_concat_df, hor_concat_df};
@@ -34,11 +39,19 @@ fn init() -> RbResult<()> {
     module.define_singleton_method("_concat_df", function!(concat_df, 1))?;
     module.define_singleton_method("_diag_concat_df", function!(rb_diag_concat_df, 1))?;
     module.define_singleton_method("_hor_concat_df", function!(rb_hor_concat_df, 1))?;
+    module.define_singleton_method("_concat_series", function!(concat_series, 1))?;
+    module.define_singleton_method("_ipc_schema", function!(ipc_schema, 1))?;
+    module.define_singleton_method("_parquet_schema", function!(parquet_schema, 1))?;
+
+    let class = module.define_class("RbBatchedCsv", Default::default())?;
+    class.define_singleton_method("new", function!(RbBatchedCsv::new, -1))?;
+    class.define_method("next_batches", method!(RbBatchedCsv::next_batches, 1))?;
 
     let class = module.define_class("RbDataFrame", Default::default())?;
     class.define_singleton_method("new", function!(RbDataFrame::init, 1))?;
-    class.define_singleton_method("read_csv", function!(RbDataFrame::read_csv, 2))?;
+    class.define_singleton_method("read_csv", function!(RbDataFrame::read_csv, -1))?;
     class.define_singleton_method("read_parquet", function!(RbDataFrame::read_parquet, 1))?;
+    class.define_singleton_method("read_ipc", function!(RbDataFrame::read_ipc, 6))?;
     class.define_singleton_method("read_hash", function!(RbDataFrame::read_hash, 1))?;
     class.define_singleton_method("read_json", function!(RbDataFrame::read_json, 1))?;
     class.define_singleton_method("read_ndjson", function!(RbDataFrame::read_ndjson, 1))?;
@@ -46,6 +59,7 @@ fn init() -> RbResult<()> {
     class.define_method("write_json", method!(RbDataFrame::write_json, 3))?;
     class.define_method("write_ndjson", method!(RbDataFrame::write_ndjson, 1))?;
     class.define_method("write_csv", method!(RbDataFrame::write_csv, 10))?;
+    class.define_method("write_ipc", method!(RbDataFrame::write_ipc, 2))?;
     class.define_method("write_parquet", method!(RbDataFrame::write_parquet, 5))?;
     class.define_method("rechunk", method!(RbDataFrame::rechunk, 0))?;
     class.define_method("to_s", method!(RbDataFrame::to_s, 0))?;
@@ -294,6 +308,7 @@ fn init() -> RbResult<()> {
     class.define_method("keep_name", method!(RbExpr::keep_name, 0))?;
     class.define_method("prefix", method!(RbExpr::prefix, 1))?;
     class.define_method("suffix", method!(RbExpr::suffix, 1))?;
+    class.define_method("exclude", method!(RbExpr::exclude, 1))?;
     class.define_method("interpolate", method!(RbExpr::interpolate, 0))?;
     class.define_method("rolling_sum", method!(RbExpr::rolling_sum, 6))?;
     class.define_method("rolling_min", method!(RbExpr::rolling_min, 6))?;
@@ -364,11 +379,28 @@ fn init() -> RbResult<()> {
 
     // maybe add to different class
     class.define_singleton_method("col", function!(crate::lazy::dsl::col, 1))?;
+    class.define_singleton_method("count", function!(crate::lazy::dsl::count, 0))?;
+    class.define_singleton_method("first", function!(crate::lazy::dsl::first, 0))?;
+    class.define_singleton_method("last", function!(crate::lazy::dsl::last, 0))?;
+    class.define_singleton_method("cols", function!(crate::lazy::dsl::cols, 1))?;
+    class.define_singleton_method("fold", function!(crate::lazy::dsl::fold, 3))?;
     class.define_singleton_method("lit", function!(crate::lazy::dsl::lit, 1))?;
     class.define_singleton_method("arange", function!(crate::lazy::dsl::arange, 3))?;
     class.define_singleton_method("when", function!(crate::lazy::dsl::when, 1))?;
+    class.define_singleton_method("concat_str", function!(crate::lazy::dsl::concat_str, 2))?;
+    class.define_singleton_method("concat_lst", function!(crate::lazy::dsl::concat_lst, 1))?;
 
     let class = module.define_class("RbLazyFrame", Default::default())?;
+    class.define_singleton_method(
+        "new_from_ndjson",
+        function!(RbLazyFrame::new_from_ndjson, 7),
+    )?;
+    class.define_singleton_method("new_from_csv", function!(RbLazyFrame::new_from_csv, -1))?;
+    class.define_singleton_method(
+        "new_from_parquet",
+        function!(RbLazyFrame::new_from_parquet, 7),
+    )?;
+    class.define_singleton_method("new_from_ipc", function!(RbLazyFrame::new_from_ipc, 6))?;
     class.define_method("write_json", method!(RbLazyFrame::write_json, 1))?;
     class.define_method("describe_plan", method!(RbLazyFrame::describe_plan, 0))?;
     class.define_method(
@@ -567,3 +599,45 @@ fn rb_hor_concat_df(seq: RArray) -> RbResult<RbDataFrame> {
     let df = hor_concat_df(&dfs).map_err(RbPolarsErr::from)?;
     Ok(df.into())
 }
+
+fn concat_series(seq: RArray) -> RbResult<RbSeries> {
+    let mut iter = seq.each();
+    let first = iter.next().unwrap()?;
+
+    let mut s = get_series(first)?;
+
+    for res in iter {
+        let item = res?;
+        let item = get_series(item)?;
+        s.append(&item).map_err(RbPolarsErr::from)?;
+    }
+    Ok(s.into())
+}
+
+fn ipc_schema(rb_f: Value) -> RbResult<Value> {
+    use polars::export::arrow::io::ipc::read::read_file_metadata;
+    let mut r = get_file_like(rb_f, false)?;
+    let metadata = read_file_metadata(&mut r).map_err(RbPolarsErr::arrow)?;
+
+    let dict = RHash::new();
+    for field in metadata.schema.fields {
+        let dt: Wrap<DataType> = Wrap((&field.data_type).into());
+        dict.aset(field.name, dt)?;
+    }
+    Ok(dict.into())
+}
+
+fn parquet_schema(rb_f: Value) -> RbResult<Value> {
+    use polars::export::arrow::io::parquet::read::{infer_schema, read_metadata};
+
+    let mut r = get_file_like(rb_f, false)?;
+    let metadata = read_metadata(&mut r).map_err(RbPolarsErr::arrow)?;
+    let arrow_schema = infer_schema(&metadata).map_err(RbPolarsErr::arrow)?;
+
+    let dict = RHash::new();
+    for field in arrow_schema.fields {
+        let dt: Wrap<DataType> = Wrap((&field.data_type).into());
+        dict.aset(field.name, dt)?;
+    }
+    Ok(dict.into())
+}
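
ipc_schema and parquet_schema read only file metadata and hand Ruby a name-to-dtype hash, so a schema can be reported without materializing any rows. Plausible usage through read_ipc_schema/read_parquet_schema wrappers (names hypothetical, mirroring py-polars):

    # Peek at a Parquet schema without reading data (wrapper name is an assumption).
    Polars.read_parquet_schema("data.parquet")
    # => {"id" => Polars::Int64, "name" => Polars::Utf8}  (illustrative output)
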
@@ -116,11 +116,10 @@ init_method_opt!(new_opt_f32, Float32Type, f32);
 init_method_opt!(new_opt_f64, Float64Type, f64);
 
 impl RbSeries {
-    pub fn new_str(name: String, val: RArray, _strict: bool) -> RbResult<Self> {
-        let v = val.try_convert::<Vec<Option<String>>>()?;
-        let mut s = Utf8Chunked::new(&name, v).into_series();
+    pub fn new_str(name: String, val: Wrap<Utf8Chunked>, _strict: bool) -> Self {
+        let mut s = val.0.into_series();
         s.rename(&name);
-        Ok(RbSeries::new(s))
+        RbSeries::new(s)
     }
 
     pub fn estimated_size(&self) -> usize {
@@ -199,16 +198,16 @@ impl RbSeries {
         self.series.borrow_mut().rename(&name);
     }
 
-    pub fn dtype(&self) -> String {
-        self.series.borrow().dtype().to_string()
+    pub fn dtype(&self) -> Value {
+        Wrap(self.series.borrow().dtype().clone()).into()
     }
 
-    pub fn inner_dtype(&self) -> Option<String> {
+    pub fn inner_dtype(&self) -> Option<Value> {
         self.series
             .borrow()
             .dtype()
             .inner_dtype()
-            .map(|dt| dt.to_string())
+            .map(|dt| Wrap(dt.clone()).into())
     }
 
     pub fn set_sorted(&self, reverse: bool) -> Self {
@@ -634,5 +633,5 @@ impl RbSeries {
 }
 
 pub fn to_rbseries_collection(s: Vec<Series>) -> Vec<RbSeries> {
-    s.into_iter().map(|v| RbSeries::new(v)).collect()
+    s.into_iter().map(RbSeries::new).collect()
 }
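
With these changes dtype and inner_dtype hand back the DataType itself through Wrap instead of flattening it to a String, so the Ruby layer can compare dtypes structurally. An illustrative sketch (the exact Ruby-side representation produced by Wrap<DataType> is an assumption):

    s = Polars::Series.new("a", [1, 2, 3])
    s.dtype  # previously the String "i64"; now a dtype object such as Polars::Int64
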
@@ -0,0 +1,95 @@
+module Polars
+  class BatchedCsvReader
+    attr_accessor :_reader, :new_columns
+
+    def initialize(
+      file,
+      has_header: true,
+      columns: nil,
+      sep: ",",
+      comment_char: nil,
+      quote_char: '"',
+      skip_rows: 0,
+      dtypes: nil,
+      null_values: nil,
+      ignore_errors: false,
+      parse_dates: false,
+      n_threads: nil,
+      infer_schema_length: 100,
+      batch_size: 50_000,
+      n_rows: nil,
+      encoding: "utf8",
+      low_memory: false,
+      rechunk: true,
+      skip_rows_after_header: 0,
+      row_count_name: nil,
+      row_count_offset: 0,
+      sample_size: 1024,
+      eol_char: "\n",
+      new_columns: nil
+    )
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        path = Utils.format_path(file)
+      end
+
+      dtype_list = nil
+      dtype_slice = nil
+      if !dtypes.nil?
+        if dtypes.is_a?(Hash)
+          dtype_list = []
+          dtypes.each do |k, v|
+            dtype_list << [k, Utils.rb_type_to_dtype(v)]
+          end
+        elsif dtypes.is_a?(Array)
+          dtype_slice = dtypes
+        else
+          raise ArgumentError, "dtypes arg should be a Hash or Array"
+        end
+      end
+
+      processed_null_values = Utils._process_null_values(null_values)
+      projection, columns = Utils.handle_projection_columns(columns)
+
+      self._reader = RbBatchedCsv.new(
+        infer_schema_length,
+        batch_size,
+        has_header,
+        ignore_errors,
+        n_rows,
+        skip_rows,
+        projection,
+        sep,
+        rechunk,
+        columns,
+        encoding,
+        n_threads,
+        path,
+        dtype_list,
+        dtype_slice,
+        low_memory,
+        comment_char,
+        quote_char,
+        processed_null_values,
+        parse_dates,
+        skip_rows_after_header,
+        Utils._prepare_row_count_args(row_count_name, row_count_offset),
+        sample_size,
+        eol_char
+      )
+      self.new_columns = new_columns
+    end
+
+    def next_batches(n)
+      batches = _reader.next_batches(n)
+      if !batches.nil?
+        if new_columns
+          batches.map { |df| Utils._update_columns(Utils.wrap_df(df), new_columns) }
+        else
+          batches.map { |df| Utils.wrap_df(df) }
+        end
+      else
+        nil
+      end
+    end
+  end
+end
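
BatchedCsvReader wraps the new RbBatchedCsv binding so large CSVs can be consumed in fixed-size chunks rather than read in one pass. A minimal usage sketch built from the interface above:

    reader = Polars::BatchedCsvReader.new("large.csv", batch_size: 10_000)

    # next_batches returns up to n DataFrames, or nil once the file is exhausted.
    while (batches = reader.next_batches(5))
      batches.each { |df| p df.shape }
    end
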