polars-df 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/Cargo.lock +1 -1
- data/README.md +6 -3
- data/ext/polars/Cargo.toml +1 -1
- data/ext/polars/src/lib.rs +6 -1
- data/lib/polars/data_frame.rb +66 -3
- data/lib/polars/io.rb +3 -1
- data/lib/polars/series.rb +71 -11
- data/lib/polars/slice.rb +1 -1
- data/lib/polars/utils.rb +20 -0
- data/lib/polars/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: df03134e7edf09e86b5a4f4f9ae9a926bac4c9c0804a29c3422c32675f478825
|
4
|
+
data.tar.gz: e0338be1aa96d0ad082ebf8fe27e608b2906b243dd49fa837aceb7f8186947d8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 75a139d30f9fdebaa84a21fa45cec8a199da76eb295e7099ceb849646a93fbc7ed80ffed18aaa8eb7bbfc53a32792b2e47101485ad31d727a47ed67d8d7e8110
|
7
|
+
data.tar.gz: 589f7fbc1300aadc05568308700f6a94b934e63c40bd1be0a3e7b6f564c0d55f256e2e45e926c128d80453d0e7d200b057f640b02cd6fb9aaddf5bf55dd89754
|
data/CHANGELOG.md
CHANGED
data/Cargo.lock
CHANGED
data/README.md
CHANGED
@@ -41,6 +41,9 @@ From a CSV
|
|
41
41
|
|
42
42
|
```ruby
|
43
43
|
Polars.read_csv("file.csv")
|
44
|
+
|
45
|
+
# or lazily with
|
46
|
+
Polars.scan_csv("file.csv")
|
44
47
|
```
|
45
48
|
|
46
49
|
From Parquet
|
@@ -135,9 +138,9 @@ df[Polars.col("a") <= 2]
|
|
135
138
|
And, or, and exclusive or
|
136
139
|
|
137
140
|
```ruby
|
138
|
-
df[(Polars.col("a") >
|
139
|
-
df[(Polars.col("a") >
|
140
|
-
df[(Polars.col("a") >
|
141
|
+
df[(Polars.col("a") > 1) & (Polars.col("b") == "two")] # and
|
142
|
+
df[(Polars.col("a") > 1) | (Polars.col("b") == "two")] # or
|
143
|
+
df[(Polars.col("a") > 1) ^ (Polars.col("b") == "two")] # xor
|
141
144
|
```
|
142
145
|
|
143
146
|
## Operations
|
data/ext/polars/Cargo.toml
CHANGED
data/ext/polars/src/lib.rs
CHANGED
@@ -22,7 +22,7 @@ use magnus::{
|
|
22
22
|
define_module, function, memoize, method, prelude::*, Error, RArray, RClass, RHash, RModule,
|
23
23
|
Value,
|
24
24
|
};
|
25
|
-
use polars::datatypes::{DataType, TimeUnit};
|
25
|
+
use polars::datatypes::{DataType, TimeUnit, IDX_DTYPE};
|
26
26
|
use polars::error::PolarsResult;
|
27
27
|
use polars::frame::DataFrame;
|
28
28
|
use polars::functions::{diag_concat_df, hor_concat_df};
|
@@ -71,6 +71,7 @@ fn init() -> RbResult<()> {
|
|
71
71
|
module.define_singleton_method("_sum_exprs", function!(sum_exprs, 1))?;
|
72
72
|
module.define_singleton_method("_as_struct", function!(as_struct, 1))?;
|
73
73
|
module.define_singleton_method("_arg_where", function!(arg_where, 1))?;
|
74
|
+
module.define_singleton_method("_get_idx_type", function!(get_idx_type, 0))?;
|
74
75
|
|
75
76
|
let class = module.define_class("RbBatchedCsv", Default::default())?;
|
76
77
|
class.define_singleton_method("new", function!(RbBatchedCsv::new, -1))?;
|
@@ -988,3 +989,7 @@ fn as_struct(exprs: RArray) -> RbResult<RbExpr> {
|
|
988
989
|
fn arg_where(condition: &RbExpr) -> RbExpr {
|
989
990
|
polars::lazy::dsl::arg_where(condition.inner.clone()).into()
|
990
991
|
}
|
992
|
+
|
993
|
+
fn get_idx_type() -> Value {
|
994
|
+
Wrap(IDX_DTYPE).into()
|
995
|
+
}
|
data/lib/polars/data_frame.rb
CHANGED
@@ -277,6 +277,7 @@ module Polars
|
|
277
277
|
_df.height
|
278
278
|
end
|
279
279
|
alias_method :count, :height
|
280
|
+
alias_method :length, :height
|
280
281
|
|
281
282
|
# Get the width of the DataFrame.
|
282
283
|
#
|
@@ -541,7 +542,7 @@ module Polars
|
|
541
542
|
|
542
543
|
if col_selection.is_a?(Array)
|
543
544
|
# df[.., [1, 2]]
|
544
|
-
if is_int_sequence(col_selection)
|
545
|
+
if Utils.is_int_sequence(col_selection)
|
545
546
|
series_list = col_selection.map { |i| to_series(i) }
|
546
547
|
df = self.class.new(series_list)
|
547
548
|
return df[row_selection]
|
@@ -574,6 +575,23 @@ module Polars
|
|
574
575
|
# df[["foo", "bar"]]
|
575
576
|
return _from_rbdf(_df.select(item))
|
576
577
|
end
|
578
|
+
|
579
|
+
if Utils.is_int_sequence(item)
|
580
|
+
item = Series.new("", item)
|
581
|
+
end
|
582
|
+
|
583
|
+
if item.is_a?(Series)
|
584
|
+
dtype = item.dtype
|
585
|
+
if dtype == Utf8
|
586
|
+
return _from_rbdf(_df.select(item))
|
587
|
+
elsif dtype == UInt32
|
588
|
+
return _from_rbdf(_df.take_with_series(item._s))
|
589
|
+
elsif [UInt8, UInt16, UInt64, Int8, Int16, Int32, Int64].include?(dtype)
|
590
|
+
return _from_rbdf(
|
591
|
+
_df.take_with_series(_pos_idxs(item, 0)._s)
|
592
|
+
)
|
593
|
+
end
|
594
|
+
end
|
577
595
|
end
|
578
596
|
|
579
597
|
# Ruby-specific
|
@@ -4662,8 +4680,53 @@ module Polars
|
|
4662
4680
|
end
|
4663
4681
|
end
|
4664
4682
|
|
4665
|
-
|
4666
|
-
|
4683
|
+
def _pos_idxs(idxs, dim)
|
4684
|
+
idx_type = Polars._get_idx_type
|
4685
|
+
|
4686
|
+
if idxs.is_a?(Series)
|
4687
|
+
if idxs.dtype == idx_type
|
4688
|
+
return idxs
|
4689
|
+
end
|
4690
|
+
if [UInt8, UInt16, idx_type == UInt32 ? UInt64 : UInt32, Int8, Int16, Int32, Int64].include?(idxs.dtype)
|
4691
|
+
if idx_type == UInt32
|
4692
|
+
if [Int64, UInt64].include?(idxs.dtype)
|
4693
|
+
if idxs.max >= 2**32
|
4694
|
+
raise ArgumentError, "Index positions should be smaller than 2^32."
|
4695
|
+
end
|
4696
|
+
end
|
4697
|
+
if idxs.dtype == Int64
|
4698
|
+
if idxs.min < -(2**32)
|
4699
|
+
raise ArgumentError, "Index positions should be bigger than -2^32 + 1."
|
4700
|
+
end
|
4701
|
+
end
|
4702
|
+
end
|
4703
|
+
if [Int8, Int16, Int32, Int64].include?(idxs.dtype)
|
4704
|
+
if idxs.min < 0
|
4705
|
+
if idx_type == UInt32
|
4706
|
+
if [Int8, Int16].include?(idxs.dtype)
|
4707
|
+
idxs = idxs.cast(Int32)
|
4708
|
+
end
|
4709
|
+
else
|
4710
|
+
if [Int8, Int16, Int32].include?(idxs.dtype)
|
4711
|
+
idxs = idxs.cast(Int64)
|
4712
|
+
end
|
4713
|
+
end
|
4714
|
+
|
4715
|
+
idxs =
|
4716
|
+
Polars.select(
|
4717
|
+
Polars.when(Polars.lit(idxs) < 0)
|
4718
|
+
.then(shape[dim] + Polars.lit(idxs))
|
4719
|
+
.otherwise(Polars.lit(idxs))
|
4720
|
+
).to_series
|
4721
|
+
end
|
4722
|
+
end
|
4723
|
+
|
4724
|
+
return idxs.cast(idx_type)
|
4725
|
+
end
|
4726
|
+
end
|
4727
|
+
|
4728
|
+
raise ArgumentError, "Unsupported idxs datatype."
|
4729
|
+
end
|
4667
4730
|
|
4668
4731
|
# @private
|
4669
4732
|
def self.hash_to_rbdf(data, columns: nil)
|
data/lib/polars/io.rb
CHANGED
@@ -606,8 +606,10 @@ module Polars
|
|
606
606
|
sql
|
607
607
|
elsif sql.is_a?(ActiveRecord::Relation)
|
608
608
|
sql.connection.select_all(sql.to_sql)
|
609
|
+
elsif sql.is_a?(String)
|
610
|
+
ActiveRecord::Base.connection.select_all(sql)
|
609
611
|
else
|
610
|
-
raise ArgumentError, "Expected ActiveRecord::Relation
|
612
|
+
raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
|
611
613
|
end
|
612
614
|
data = {}
|
613
615
|
result.columns.each_with_index do |k, i|
|
data/lib/polars/series.rb
CHANGED
@@ -263,6 +263,10 @@ module Polars
|
|
263
263
|
#
|
264
264
|
# @return [Object]
|
265
265
|
def [](item)
|
266
|
+
if item.is_a?(Series) && [UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64].include?(item.dtype)
|
267
|
+
return Utils.wrap_s(_s.take_with_series(_pos_idxs(item)._s))
|
268
|
+
end
|
269
|
+
|
266
270
|
if item.is_a?(Integer)
|
267
271
|
return _s.get_idx(item)
|
268
272
|
end
|
@@ -271,6 +275,10 @@ module Polars
|
|
271
275
|
return Slice.new(self).apply(item)
|
272
276
|
end
|
273
277
|
|
278
|
+
if Utils.is_int_sequence(item)
|
279
|
+
return Utils.wrap_s(_s.take_with_series(_pos_idxs(Series.new("", item))._s))
|
280
|
+
end
|
281
|
+
|
274
282
|
raise ArgumentError, "Cannot get item of type: #{item.class.name}"
|
275
283
|
end
|
276
284
|
|
@@ -287,24 +295,23 @@ module Polars
|
|
287
295
|
end
|
288
296
|
|
289
297
|
if key.is_a?(Series)
|
290
|
-
if key.dtype ==
|
298
|
+
if key.dtype == Boolean
|
291
299
|
self._s = set(key, value)._s
|
292
|
-
elsif key.dtype ==
|
293
|
-
self._s = set_at_idx(key.cast(
|
294
|
-
elsif key.dtype ==
|
300
|
+
elsif key.dtype == UInt64
|
301
|
+
self._s = set_at_idx(key.cast(UInt32), value)._s
|
302
|
+
elsif key.dtype == UInt32
|
295
303
|
self._s = set_at_idx(key, value)._s
|
296
304
|
else
|
297
305
|
raise Todo
|
298
306
|
end
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
307
|
+
elsif key.is_a?(Array)
|
308
|
+
s = Utils.wrap_s(sequence_to_rbseries("", key, dtype: UInt32))
|
309
|
+
self[s] = value
|
310
|
+
elsif key.is_a?(Range)
|
311
|
+
s = Series.new("", key, dtype: UInt32)
|
303
312
|
self[s] = value
|
304
313
|
elsif key.is_a?(Integer)
|
305
|
-
|
306
|
-
# self[[key]] = value
|
307
|
-
set_at_idx(key, value)
|
314
|
+
self[[key]] = value
|
308
315
|
else
|
309
316
|
raise ArgumentError, "cannot use #{key} for indexing"
|
310
317
|
end
|
@@ -3527,6 +3534,59 @@ module Polars
|
|
3527
3534
|
end
|
3528
3535
|
end
|
3529
3536
|
|
3537
|
+
def _pos_idxs(idxs)
|
3538
|
+
idx_type = Polars._get_idx_type
|
3539
|
+
|
3540
|
+
if idxs.is_a?(Series)
|
3541
|
+
if idxs.dtype == idx_type
|
3542
|
+
return idxs
|
3543
|
+
end
|
3544
|
+
if [UInt8, UInt16, idx_type == UInt32 ? UInt64 : UInt32, Int8, Int16, Int32, Int64].include?(idxs.dtype)
|
3545
|
+
if idx_type == UInt32
|
3546
|
+
if [Int64, UInt64].include?(idxs.dtype)
|
3547
|
+
if idxs.max >= 2**32
|
3548
|
+
raise ArgumentError, "Index positions should be smaller than 2^32."
|
3549
|
+
end
|
3550
|
+
end
|
3551
|
+
if idxs.dtype == Int64
|
3552
|
+
if idxs.min < -(2**32)
|
3553
|
+
raise ArgumentError, "Index positions should be bigger than -2^32 + 1."
|
3554
|
+
end
|
3555
|
+
end
|
3556
|
+
end
|
3557
|
+
if [Int8, Int16, Int32, Int64].include?(idxs.dtype)
|
3558
|
+
if idxs.min < 0
|
3559
|
+
if idx_type == UInt32
|
3560
|
+
if [Int8, Int16].include?(idxs.dtype)
|
3561
|
+
idxs = idxs.cast(Int32)
|
3562
|
+
end
|
3563
|
+
else
|
3564
|
+
if [Int8, Int16, Int32].include?(idxs.dtype)
|
3565
|
+
idxs = idxs.cast(Int64)
|
3566
|
+
end
|
3567
|
+
end
|
3568
|
+
|
3569
|
+
# Update negative indexes to absolute indexes.
|
3570
|
+
return (
|
3571
|
+
idxs.to_frame
|
3572
|
+
.select(
|
3573
|
+
Polars.when(Polars.col(idxs.name) < 0)
|
3574
|
+
.then(len + Polars.col(idxs.name))
|
3575
|
+
.otherwise(Polars.col(idxs.name))
|
3576
|
+
.cast(idx_type)
|
3577
|
+
)
|
3578
|
+
.to_series(0)
|
3579
|
+
)
|
3580
|
+
end
|
3581
|
+
end
|
3582
|
+
|
3583
|
+
return idxs.cast(idx_type)
|
3584
|
+
end
|
3585
|
+
end
|
3586
|
+
|
3587
|
+
raise ArgumentError, "Unsupported idxs datatype."
|
3588
|
+
end
|
3589
|
+
|
3530
3590
|
def _comp(other, op)
|
3531
3591
|
if other.is_a?(Series)
|
3532
3592
|
return Utils.wrap_s(_s.send(op, other._s))
|
data/lib/polars/slice.rb
CHANGED
@@ -56,7 +56,7 @@ module Polars
|
|
56
56
|
# Normalize slice bounds, identify unbounded and/or zero-length slices.
|
57
57
|
def _slice_setup(s)
|
58
58
|
# can normalize slice indices as we know object size
|
59
|
-
obj_len = @obj.
|
59
|
+
obj_len = @obj.length
|
60
60
|
start = if s.begin
|
61
61
|
if s.begin < 0
|
62
62
|
[s.begin + obj_len, 0].max
|
data/lib/polars/utils.rb
CHANGED
@@ -181,6 +181,26 @@ module Polars
|
|
181
181
|
val.all? { |x| x.is_a?(eltype) }
|
182
182
|
end
|
183
183
|
|
184
|
+
def self.is_bool_sequence(val)
|
185
|
+
val.is_a?(Array) && val.all? { |x| x == true || x == false }
|
186
|
+
end
|
187
|
+
|
188
|
+
def self.is_dtype_sequence(val)
|
189
|
+
val.is_a?(Array) && val.all? { |x| is_polars_dtype(x) }
|
190
|
+
end
|
191
|
+
|
192
|
+
def self.is_int_sequence(val)
|
193
|
+
val.is_a?(Array) && _is_iterable_of(val, Integer)
|
194
|
+
end
|
195
|
+
|
196
|
+
def self.is_expr_sequence(val)
|
197
|
+
val.is_a?(Array) && _is_iterable_of(val, Expr)
|
198
|
+
end
|
199
|
+
|
200
|
+
def self.is_rbexpr_sequence(val)
|
201
|
+
val.is_a?(Array) && _is_iterable_of(val, RbExpr)
|
202
|
+
end
|
203
|
+
|
184
204
|
def self.is_str_sequence(val, allow_str: false)
|
185
205
|
if allow_str == false && val.is_a?(String)
|
186
206
|
false
|
data/lib/polars/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: polars-df
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-01-
|
11
|
+
date: 2023-01-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|