polars-df 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/Cargo.lock +1 -1
- data/ext/polars/Cargo.toml +1 -1
- data/ext/polars/src/batched_csv.rs +120 -0
- data/ext/polars/src/conversion.rs +105 -5
- data/ext/polars/src/dataframe.rs +132 -4
- data/ext/polars/src/error.rs +9 -0
- data/ext/polars/src/file.rs +8 -7
- data/ext/polars/src/lazy/apply.rs +7 -0
- data/ext/polars/src/lazy/dataframe.rs +132 -0
- data/ext/polars/src/lazy/dsl.rs +38 -0
- data/ext/polars/src/lazy/meta.rs +1 -1
- data/ext/polars/src/lazy/mod.rs +1 -0
- data/ext/polars/src/lib.rs +77 -3
- data/ext/polars/src/series.rs +8 -9
- data/lib/polars/batched_csv_reader.rb +95 -0
- data/lib/polars/data_frame.rb +585 -19
- data/lib/polars/expr.rb +17 -2
- data/lib/polars/io.rb +342 -2
- data/lib/polars/lazy_frame.rb +156 -2
- data/lib/polars/lazy_functions.rb +154 -11
- data/lib/polars/series.rb +806 -18
- data/lib/polars/utils.rb +33 -0
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +9 -0
- metadata +5 -2
@@ -1,4 +1,5 @@
|
|
1
1
|
use magnus::{RArray, RHash, Value};
|
2
|
+
use polars::io::RowCount;
|
2
3
|
use polars::lazy::frame::{LazyFrame, LazyGroupBy};
|
3
4
|
use polars::prelude::*;
|
4
5
|
use std::cell::RefCell;
|
@@ -52,6 +53,137 @@ impl From<LazyFrame> for RbLazyFrame {
|
|
52
53
|
}
|
53
54
|
|
54
55
|
impl RbLazyFrame {
|
56
|
+
pub fn new_from_ndjson(
|
57
|
+
path: String,
|
58
|
+
infer_schema_length: Option<usize>,
|
59
|
+
batch_size: Option<usize>,
|
60
|
+
n_rows: Option<usize>,
|
61
|
+
low_memory: bool,
|
62
|
+
rechunk: bool,
|
63
|
+
row_count: Option<(String, IdxSize)>,
|
64
|
+
) -> RbResult<Self> {
|
65
|
+
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
66
|
+
|
67
|
+
let lf = LazyJsonLineReader::new(path)
|
68
|
+
.with_infer_schema_length(infer_schema_length)
|
69
|
+
.with_batch_size(batch_size)
|
70
|
+
.with_n_rows(n_rows)
|
71
|
+
.low_memory(low_memory)
|
72
|
+
.with_rechunk(rechunk)
|
73
|
+
.with_row_count(row_count)
|
74
|
+
.finish()
|
75
|
+
.map_err(RbPolarsErr::from)?;
|
76
|
+
Ok(lf.into())
|
77
|
+
}
|
78
|
+
|
79
|
+
pub fn new_from_csv(arguments: &[Value]) -> RbResult<Self> {
|
80
|
+
// start arguments
|
81
|
+
// this pattern is needed for more than 16
|
82
|
+
let path: String = arguments[0].try_convert()?;
|
83
|
+
let sep: String = arguments[1].try_convert()?;
|
84
|
+
let has_header: bool = arguments[2].try_convert()?;
|
85
|
+
let ignore_errors: bool = arguments[3].try_convert()?;
|
86
|
+
let skip_rows: usize = arguments[4].try_convert()?;
|
87
|
+
let n_rows: Option<usize> = arguments[5].try_convert()?;
|
88
|
+
let cache: bool = arguments[6].try_convert()?;
|
89
|
+
let overwrite_dtype: Option<Vec<(String, Wrap<DataType>)>> = arguments[7].try_convert()?;
|
90
|
+
let low_memory: bool = arguments[8].try_convert()?;
|
91
|
+
let comment_char: Option<String> = arguments[9].try_convert()?;
|
92
|
+
let quote_char: Option<String> = arguments[10].try_convert()?;
|
93
|
+
let null_values: Option<Wrap<NullValues>> = arguments[11].try_convert()?;
|
94
|
+
let infer_schema_length: Option<usize> = arguments[12].try_convert()?;
|
95
|
+
let with_schema_modify: Option<Value> = arguments[13].try_convert()?;
|
96
|
+
let rechunk: bool = arguments[14].try_convert()?;
|
97
|
+
let skip_rows_after_header: usize = arguments[15].try_convert()?;
|
98
|
+
let encoding: Wrap<CsvEncoding> = arguments[16].try_convert()?;
|
99
|
+
let row_count: Option<(String, IdxSize)> = arguments[17].try_convert()?;
|
100
|
+
let parse_dates: bool = arguments[18].try_convert()?;
|
101
|
+
let eol_char: String = arguments[19].try_convert()?;
|
102
|
+
// end arguments
|
103
|
+
|
104
|
+
let null_values = null_values.map(|w| w.0);
|
105
|
+
let comment_char = comment_char.map(|s| s.as_bytes()[0]);
|
106
|
+
let quote_char = quote_char.map(|s| s.as_bytes()[0]);
|
107
|
+
let delimiter = sep.as_bytes()[0];
|
108
|
+
let eol_char = eol_char.as_bytes()[0];
|
109
|
+
|
110
|
+
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
111
|
+
|
112
|
+
let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| {
|
113
|
+
let fields = overwrite_dtype
|
114
|
+
.into_iter()
|
115
|
+
.map(|(name, dtype)| Field::new(&name, dtype.0));
|
116
|
+
Schema::from(fields)
|
117
|
+
});
|
118
|
+
let r = LazyCsvReader::new(path)
|
119
|
+
.with_infer_schema_length(infer_schema_length)
|
120
|
+
.with_delimiter(delimiter)
|
121
|
+
.has_header(has_header)
|
122
|
+
.with_ignore_parser_errors(ignore_errors)
|
123
|
+
.with_skip_rows(skip_rows)
|
124
|
+
.with_n_rows(n_rows)
|
125
|
+
.with_cache(cache)
|
126
|
+
.with_dtype_overwrite(overwrite_dtype.as_ref())
|
127
|
+
.low_memory(low_memory)
|
128
|
+
.with_comment_char(comment_char)
|
129
|
+
.with_quote_char(quote_char)
|
130
|
+
.with_end_of_line_char(eol_char)
|
131
|
+
.with_rechunk(rechunk)
|
132
|
+
.with_skip_rows_after_header(skip_rows_after_header)
|
133
|
+
.with_encoding(encoding.0)
|
134
|
+
.with_row_count(row_count)
|
135
|
+
.with_parse_dates(parse_dates)
|
136
|
+
.with_null_values(null_values);
|
137
|
+
|
138
|
+
if let Some(_lambda) = with_schema_modify {
|
139
|
+
todo!();
|
140
|
+
}
|
141
|
+
|
142
|
+
Ok(r.finish().map_err(RbPolarsErr::from)?.into())
|
143
|
+
}
|
144
|
+
|
145
|
+
pub fn new_from_parquet(
|
146
|
+
path: String,
|
147
|
+
n_rows: Option<usize>,
|
148
|
+
cache: bool,
|
149
|
+
parallel: Wrap<ParallelStrategy>,
|
150
|
+
rechunk: bool,
|
151
|
+
row_count: Option<(String, IdxSize)>,
|
152
|
+
low_memory: bool,
|
153
|
+
) -> RbResult<Self> {
|
154
|
+
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
155
|
+
let args = ScanArgsParquet {
|
156
|
+
n_rows,
|
157
|
+
cache,
|
158
|
+
parallel: parallel.0,
|
159
|
+
rechunk,
|
160
|
+
row_count,
|
161
|
+
low_memory,
|
162
|
+
};
|
163
|
+
let lf = LazyFrame::scan_parquet(path, args).map_err(RbPolarsErr::from)?;
|
164
|
+
Ok(lf.into())
|
165
|
+
}
|
166
|
+
|
167
|
+
pub fn new_from_ipc(
|
168
|
+
path: String,
|
169
|
+
n_rows: Option<usize>,
|
170
|
+
cache: bool,
|
171
|
+
rechunk: bool,
|
172
|
+
row_count: Option<(String, IdxSize)>,
|
173
|
+
memory_map: bool,
|
174
|
+
) -> RbResult<Self> {
|
175
|
+
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });
|
176
|
+
let args = ScanArgsIpc {
|
177
|
+
n_rows,
|
178
|
+
cache,
|
179
|
+
rechunk,
|
180
|
+
row_count,
|
181
|
+
memmap: memory_map,
|
182
|
+
};
|
183
|
+
let lf = LazyFrame::scan_ipc(path, args).map_err(RbPolarsErr::from)?;
|
184
|
+
Ok(lf.into())
|
185
|
+
}
|
186
|
+
|
55
187
|
pub fn write_json(&self, rb_f: Value) -> RbResult<()> {
|
56
188
|
let file = BufWriter::new(get_file_like(rb_f, true)?);
|
57
189
|
serde_json::to_writer(file, &self.ldf.logical_plan)
|
data/ext/polars/src/lazy/dsl.rs
CHANGED
@@ -6,6 +6,7 @@ use polars::prelude::*;
|
|
6
6
|
use polars::series::ops::NullBehavior;
|
7
7
|
|
8
8
|
use crate::conversion::*;
|
9
|
+
use crate::lazy::apply::*;
|
9
10
|
use crate::lazy::utils::rb_exprs_to_exprs;
|
10
11
|
use crate::RbResult;
|
11
12
|
|
@@ -901,6 +902,10 @@ impl RbExpr {
|
|
901
902
|
self.inner.clone().suffix(&suffix).into()
|
902
903
|
}
|
903
904
|
|
905
|
+
/// Returns a new expression produced by calling `exclude(columns)` on a
/// clone of the wrapped expression; `self` is left untouched.
pub fn exclude(&self, columns: Vec<String>) -> Self {
    let expr = self.inner.clone();
    expr.exclude(columns).into()
}
|
908
|
+
|
904
909
|
/// Returns a new expression produced by calling `interpolate()` on a clone
/// of the wrapped expression; `self` is left untouched.
pub fn interpolate(&self) -> Self {
    let expr = self.inner.clone();
    expr.interpolate().into()
}
|
@@ -1333,6 +1338,29 @@ pub fn col(name: String) -> RbExpr {
|
|
1333
1338
|
dsl::col(&name).into()
|
1334
1339
|
}
|
1335
1340
|
|
1341
|
+
pub fn count() -> RbExpr {
|
1342
|
+
dsl::count().into()
|
1343
|
+
}
|
1344
|
+
|
1345
|
+
pub fn first() -> RbExpr {
|
1346
|
+
dsl::first().into()
|
1347
|
+
}
|
1348
|
+
|
1349
|
+
pub fn last() -> RbExpr {
|
1350
|
+
dsl::last().into()
|
1351
|
+
}
|
1352
|
+
|
1353
|
+
pub fn cols(names: Vec<String>) -> RbExpr {
|
1354
|
+
dsl::cols(names).into()
|
1355
|
+
}
|
1356
|
+
|
1357
|
+
pub fn fold(acc: &RbExpr, lambda: Value, exprs: RArray) -> RbResult<RbExpr> {
|
1358
|
+
let exprs = rb_exprs_to_exprs(exprs)?;
|
1359
|
+
|
1360
|
+
let func = move |a: Series, b: Series| binary_lambda(lambda, a, b);
|
1361
|
+
Ok(polars::lazy::dsl::fold_exprs(acc.inner.clone(), func, exprs).into())
|
1362
|
+
}
|
1363
|
+
|
1336
1364
|
// TODO improve
|
1337
1365
|
pub fn lit(value: Value) -> RbResult<RbExpr> {
|
1338
1366
|
if value.is_nil() {
|
@@ -1387,3 +1415,13 @@ impl RbWhenThen {
|
|
1387
1415
|
pub fn when(predicate: &RbExpr) -> RbWhen {
|
1388
1416
|
dsl::when(predicate.inner.clone()).into()
|
1389
1417
|
}
|
1418
|
+
|
1419
|
+
pub fn concat_str(s: RArray, sep: String) -> RbResult<RbExpr> {
|
1420
|
+
let s = rb_exprs_to_exprs(s)?;
|
1421
|
+
Ok(dsl::concat_str(s, &sep).into())
|
1422
|
+
}
|
1423
|
+
|
1424
|
+
pub fn concat_lst(s: RArray) -> RbResult<RbExpr> {
|
1425
|
+
let s = rb_exprs_to_exprs(s)?;
|
1426
|
+
Ok(dsl::concat_lst(s).into())
|
1427
|
+
}
|
data/ext/polars/src/lazy/meta.rs
CHANGED
data/ext/polars/src/lazy/mod.rs
CHANGED
data/ext/polars/src/lib.rs
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
mod batched_csv;
|
1
2
|
mod conversion;
|
2
3
|
mod dataframe;
|
3
4
|
mod error;
|
@@ -5,14 +6,18 @@ mod file;
|
|
5
6
|
mod lazy;
|
6
7
|
mod series;
|
7
8
|
|
8
|
-
use
|
9
|
+
use batched_csv::RbBatchedCsv;
|
10
|
+
use conversion::*;
|
9
11
|
use dataframe::RbDataFrame;
|
10
12
|
use error::{RbPolarsErr, RbValueError};
|
13
|
+
use file::get_file_like;
|
11
14
|
use lazy::dataframe::{RbLazyFrame, RbLazyGroupBy};
|
12
15
|
use lazy::dsl::{RbExpr, RbWhen, RbWhenThen};
|
13
16
|
use magnus::{
|
14
|
-
define_module, function, memoize, method, prelude::*, Error, RArray, RClass, RModule,
|
17
|
+
define_module, function, memoize, method, prelude::*, Error, RArray, RClass, RHash, RModule,
|
18
|
+
Value,
|
15
19
|
};
|
20
|
+
use polars::datatypes::DataType;
|
16
21
|
use polars::error::PolarsResult;
|
17
22
|
use polars::frame::DataFrame;
|
18
23
|
use polars::functions::{diag_concat_df, hor_concat_df};
|
@@ -34,11 +39,19 @@ fn init() -> RbResult<()> {
|
|
34
39
|
module.define_singleton_method("_concat_df", function!(concat_df, 1))?;
|
35
40
|
module.define_singleton_method("_diag_concat_df", function!(rb_diag_concat_df, 1))?;
|
36
41
|
module.define_singleton_method("_hor_concat_df", function!(rb_hor_concat_df, 1))?;
|
42
|
+
module.define_singleton_method("_concat_series", function!(concat_series, 1))?;
|
43
|
+
module.define_singleton_method("_ipc_schema", function!(ipc_schema, 1))?;
|
44
|
+
module.define_singleton_method("_parquet_schema", function!(parquet_schema, 1))?;
|
45
|
+
|
46
|
+
let class = module.define_class("RbBatchedCsv", Default::default())?;
|
47
|
+
class.define_singleton_method("new", function!(RbBatchedCsv::new, -1))?;
|
48
|
+
class.define_method("next_batches", method!(RbBatchedCsv::next_batches, 1))?;
|
37
49
|
|
38
50
|
let class = module.define_class("RbDataFrame", Default::default())?;
|
39
51
|
class.define_singleton_method("new", function!(RbDataFrame::init, 1))?;
|
40
|
-
class.define_singleton_method("read_csv", function!(RbDataFrame::read_csv,
|
52
|
+
class.define_singleton_method("read_csv", function!(RbDataFrame::read_csv, -1))?;
|
41
53
|
class.define_singleton_method("read_parquet", function!(RbDataFrame::read_parquet, 1))?;
|
54
|
+
class.define_singleton_method("read_ipc", function!(RbDataFrame::read_ipc, 6))?;
|
42
55
|
class.define_singleton_method("read_hash", function!(RbDataFrame::read_hash, 1))?;
|
43
56
|
class.define_singleton_method("read_json", function!(RbDataFrame::read_json, 1))?;
|
44
57
|
class.define_singleton_method("read_ndjson", function!(RbDataFrame::read_ndjson, 1))?;
|
@@ -46,6 +59,7 @@ fn init() -> RbResult<()> {
|
|
46
59
|
class.define_method("write_json", method!(RbDataFrame::write_json, 3))?;
|
47
60
|
class.define_method("write_ndjson", method!(RbDataFrame::write_ndjson, 1))?;
|
48
61
|
class.define_method("write_csv", method!(RbDataFrame::write_csv, 10))?;
|
62
|
+
class.define_method("write_ipc", method!(RbDataFrame::write_ipc, 2))?;
|
49
63
|
class.define_method("write_parquet", method!(RbDataFrame::write_parquet, 5))?;
|
50
64
|
class.define_method("rechunk", method!(RbDataFrame::rechunk, 0))?;
|
51
65
|
class.define_method("to_s", method!(RbDataFrame::to_s, 0))?;
|
@@ -294,6 +308,7 @@ fn init() -> RbResult<()> {
|
|
294
308
|
class.define_method("keep_name", method!(RbExpr::keep_name, 0))?;
|
295
309
|
class.define_method("prefix", method!(RbExpr::prefix, 1))?;
|
296
310
|
class.define_method("suffix", method!(RbExpr::suffix, 1))?;
|
311
|
+
class.define_method("exclude", method!(RbExpr::exclude, 1))?;
|
297
312
|
class.define_method("interpolate", method!(RbExpr::interpolate, 0))?;
|
298
313
|
class.define_method("rolling_sum", method!(RbExpr::rolling_sum, 6))?;
|
299
314
|
class.define_method("rolling_min", method!(RbExpr::rolling_min, 6))?;
|
@@ -364,11 +379,28 @@ fn init() -> RbResult<()> {
|
|
364
379
|
|
365
380
|
// maybe add to different class
|
366
381
|
class.define_singleton_method("col", function!(crate::lazy::dsl::col, 1))?;
|
382
|
+
class.define_singleton_method("count", function!(crate::lazy::dsl::count, 0))?;
|
383
|
+
class.define_singleton_method("first", function!(crate::lazy::dsl::first, 0))?;
|
384
|
+
class.define_singleton_method("last", function!(crate::lazy::dsl::last, 0))?;
|
385
|
+
class.define_singleton_method("cols", function!(crate::lazy::dsl::cols, 1))?;
|
386
|
+
class.define_singleton_method("fold", function!(crate::lazy::dsl::fold, 3))?;
|
367
387
|
class.define_singleton_method("lit", function!(crate::lazy::dsl::lit, 1))?;
|
368
388
|
class.define_singleton_method("arange", function!(crate::lazy::dsl::arange, 3))?;
|
369
389
|
class.define_singleton_method("when", function!(crate::lazy::dsl::when, 1))?;
|
390
|
+
class.define_singleton_method("concat_str", function!(crate::lazy::dsl::concat_str, 2))?;
|
391
|
+
class.define_singleton_method("concat_lst", function!(crate::lazy::dsl::concat_lst, 1))?;
|
370
392
|
|
371
393
|
let class = module.define_class("RbLazyFrame", Default::default())?;
|
394
|
+
class.define_singleton_method(
|
395
|
+
"new_from_ndjson",
|
396
|
+
function!(RbLazyFrame::new_from_ndjson, 7),
|
397
|
+
)?;
|
398
|
+
class.define_singleton_method("new_from_csv", function!(RbLazyFrame::new_from_csv, -1))?;
|
399
|
+
class.define_singleton_method(
|
400
|
+
"new_from_parquet",
|
401
|
+
function!(RbLazyFrame::new_from_parquet, 7),
|
402
|
+
)?;
|
403
|
+
class.define_singleton_method("new_from_ipc", function!(RbLazyFrame::new_from_ipc, 6))?;
|
372
404
|
class.define_method("write_json", method!(RbLazyFrame::write_json, 1))?;
|
373
405
|
class.define_method("describe_plan", method!(RbLazyFrame::describe_plan, 0))?;
|
374
406
|
class.define_method(
|
@@ -567,3 +599,45 @@ fn rb_hor_concat_df(seq: RArray) -> RbResult<RbDataFrame> {
|
|
567
599
|
let df = hor_concat_df(&dfs).map_err(RbPolarsErr::from)?;
|
568
600
|
Ok(df.into())
|
569
601
|
}
|
602
|
+
|
603
|
+
fn concat_series(seq: RArray) -> RbResult<RbSeries> {
|
604
|
+
let mut iter = seq.each();
|
605
|
+
let first = iter.next().unwrap()?;
|
606
|
+
|
607
|
+
let mut s = get_series(first)?;
|
608
|
+
|
609
|
+
for res in iter {
|
610
|
+
let item = res?;
|
611
|
+
let item = get_series(item)?;
|
612
|
+
s.append(&item).map_err(RbPolarsErr::from)?;
|
613
|
+
}
|
614
|
+
Ok(s.into())
|
615
|
+
}
|
616
|
+
|
617
|
+
fn ipc_schema(rb_f: Value) -> RbResult<Value> {
|
618
|
+
use polars::export::arrow::io::ipc::read::read_file_metadata;
|
619
|
+
let mut r = get_file_like(rb_f, false)?;
|
620
|
+
let metadata = read_file_metadata(&mut r).map_err(RbPolarsErr::arrow)?;
|
621
|
+
|
622
|
+
let dict = RHash::new();
|
623
|
+
for field in metadata.schema.fields {
|
624
|
+
let dt: Wrap<DataType> = Wrap((&field.data_type).into());
|
625
|
+
dict.aset(field.name, dt)?;
|
626
|
+
}
|
627
|
+
Ok(dict.into())
|
628
|
+
}
|
629
|
+
|
630
|
+
fn parquet_schema(rb_f: Value) -> RbResult<Value> {
|
631
|
+
use polars::export::arrow::io::parquet::read::{infer_schema, read_metadata};
|
632
|
+
|
633
|
+
let mut r = get_file_like(rb_f, false)?;
|
634
|
+
let metadata = read_metadata(&mut r).map_err(RbPolarsErr::arrow)?;
|
635
|
+
let arrow_schema = infer_schema(&metadata).map_err(RbPolarsErr::arrow)?;
|
636
|
+
|
637
|
+
let dict = RHash::new();
|
638
|
+
for field in arrow_schema.fields {
|
639
|
+
let dt: Wrap<DataType> = Wrap((&field.data_type).into());
|
640
|
+
dict.aset(field.name, dt)?;
|
641
|
+
}
|
642
|
+
Ok(dict.into())
|
643
|
+
}
|
data/ext/polars/src/series.rs
CHANGED
@@ -116,11 +116,10 @@ init_method_opt!(new_opt_f32, Float32Type, f32);
|
|
116
116
|
init_method_opt!(new_opt_f64, Float64Type, f64);
|
117
117
|
|
118
118
|
impl RbSeries {
|
119
|
-
pub fn new_str(name: String, val:
|
120
|
-
let
|
121
|
-
let mut s = Utf8Chunked::new(&name, v).into_series();
|
119
|
+
pub fn new_str(name: String, val: Wrap<Utf8Chunked>, _strict: bool) -> Self {
|
120
|
+
let mut s = val.0.into_series();
|
122
121
|
s.rename(&name);
|
123
|
-
|
122
|
+
RbSeries::new(s)
|
124
123
|
}
|
125
124
|
|
126
125
|
pub fn estimated_size(&self) -> usize {
|
@@ -199,16 +198,16 @@ impl RbSeries {
|
|
199
198
|
self.series.borrow_mut().rename(&name);
|
200
199
|
}
|
201
200
|
|
202
|
-
pub fn dtype(&self) ->
|
203
|
-
self.series.borrow().dtype().
|
201
|
+
pub fn dtype(&self) -> Value {
|
202
|
+
Wrap(self.series.borrow().dtype().clone()).into()
|
204
203
|
}
|
205
204
|
|
206
|
-
pub fn inner_dtype(&self) -> Option<
|
205
|
+
pub fn inner_dtype(&self) -> Option<Value> {
|
207
206
|
self.series
|
208
207
|
.borrow()
|
209
208
|
.dtype()
|
210
209
|
.inner_dtype()
|
211
|
-
.map(|dt| dt.
|
210
|
+
.map(|dt| Wrap(dt.clone()).into())
|
212
211
|
}
|
213
212
|
|
214
213
|
pub fn set_sorted(&self, reverse: bool) -> Self {
|
@@ -634,5 +633,5 @@ impl RbSeries {
|
|
634
633
|
}
|
635
634
|
|
636
635
|
pub fn to_rbseries_collection(s: Vec<Series>) -> Vec<RbSeries> {
|
637
|
-
s.into_iter().map(
|
636
|
+
s.into_iter().map(RbSeries::new).collect()
|
638
637
|
}
|
@@ -0,0 +1,95 @@
|
|
1
|
+
module Polars
  # Wraps the native RbBatchedCsv reader so a CSV file can be consumed in
  # batches of data frames.
  class BatchedCsvReader
    attr_accessor :_reader, :new_columns

    # Normalizes the keyword options and constructs the native reader.
    #
    # +dtypes+ may be nil, a Hash (each value is passed through
    # Utils.rb_type_to_dtype), or an Array (passed through unchanged); any
    # other value raises ArgumentError. +new_columns+ is stored and applied to
    # every frame returned by #next_batches.
    def initialize(
      file,
      has_header: true,
      columns: nil,
      sep: ",",
      comment_char: nil,
      quote_char: '"',
      skip_rows: 0,
      dtypes: nil,
      null_values: nil,
      ignore_errors: false,
      parse_dates: false,
      n_threads: nil,
      infer_schema_length: 100,
      batch_size: 50_000,
      n_rows: nil,
      encoding: "utf8",
      low_memory: false,
      rechunk: true,
      skip_rows_after_header: 0,
      row_count_name: nil,
      row_count_offset: 0,
      sample_size: 1024,
      eol_char: "\n",
      new_columns: nil
    )
      # path stays nil when file is neither a String nor a Pathname.
      path = Utils.format_path(file) if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))

      dtype_list = nil
      dtype_slice = nil
      unless dtypes.nil?
        case dtypes
        when Hash
          dtype_list = dtypes.map { |k, v| [k, Utils.rb_type_to_dtype(v)] }
        when Array
          dtype_slice = dtypes
        else
          raise ArgumentError, "dtype arg should be list or dict"
        end
      end

      processed_null_values = Utils._process_null_values(null_values)
      projection, columns = Utils.handle_projection_columns(columns)
      row_count = Utils._prepare_row_count_args(row_count_name, row_count_offset)

      # The native constructor takes its arguments positionally in this order.
      self._reader = RbBatchedCsv.new(
        infer_schema_length,
        batch_size,
        has_header,
        ignore_errors,
        n_rows,
        skip_rows,
        projection,
        sep,
        rechunk,
        columns,
        encoding,
        n_threads,
        path,
        dtype_list,
        dtype_slice,
        low_memory,
        comment_char,
        quote_char,
        processed_null_values,
        parse_dates,
        skip_rows_after_header,
        row_count,
        sample_size,
        eol_char
      )
      self.new_columns = new_columns
    end

    # Asks the native reader for up to +n+ batches.
    #
    # Returns nil when the reader returns nil; otherwise an array of wrapped
    # data frames, passed through Utils._update_columns when +new_columns+ is
    # set.
    def next_batches(n)
      batches = _reader.next_batches(n)
      return nil if batches.nil?

      batches.map do |df|
        wrapped = Utils.wrap_df(df)
        new_columns ? Utils._update_columns(wrapped, new_columns) : wrapped
      end
    end
  end
end
|