polars-df 0.3.1-arm64-darwin → 0.4.0-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -1
- data/Cargo.lock +335 -310
- data/Cargo.toml +0 -1
- data/LICENSE-THIRD-PARTY.txt +5797 -7758
- data/README.md +29 -0
- data/lib/polars/3.0/polars.bundle +0 -0
- data/lib/polars/3.1/polars.bundle +0 -0
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/data_frame.rb +63 -38
- data/lib/polars/date_time_expr.rb +6 -6
- data/lib/polars/expr.rb +9 -2
- data/lib/polars/io.rb +73 -62
- data/lib/polars/lazy_frame.rb +103 -7
- data/lib/polars/lazy_functions.rb +3 -2
- data/lib/polars/list_expr.rb +2 -2
- data/lib/polars/list_name_space.rb +2 -2
- data/lib/polars/series.rb +9 -1
- data/lib/polars/string_expr.rb +1 -1
- data/lib/polars/utils.rb +10 -2
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -0
- metadata +4 -2
data/README.md
CHANGED
@@ -50,6 +50,9 @@ From Parquet
|
|
50
50
|
|
51
51
|
```ruby
|
52
52
|
Polars.read_parquet("file.parquet")
|
53
|
+
|
54
|
+
# or lazily with
|
55
|
+
Polars.scan_parquet("file.parquet")
|
53
56
|
```
|
54
57
|
|
55
58
|
From Active Record
|
@@ -60,6 +63,32 @@ Polars.read_sql(User.all)
|
|
60
63
|
Polars.read_sql("SELECT * FROM users")
|
61
64
|
```
|
62
65
|
|
66
|
+
From JSON
|
67
|
+
|
68
|
+
```ruby
|
69
|
+
Polars.read_json("file.json")
|
70
|
+
# or
|
71
|
+
Polars.read_ndjson("file.ndjson")
|
72
|
+
|
73
|
+
# or lazily with
|
74
|
+
Polars.scan_ndjson("file.ndjson")
|
75
|
+
```
|
76
|
+
|
77
|
+
From Feather / Arrow IPC
|
78
|
+
|
79
|
+
```ruby
|
80
|
+
Polars.read_ipc("file.arrow")
|
81
|
+
|
82
|
+
# or lazily with
|
83
|
+
Polars.scan_ipc("file.arrow")
|
84
|
+
```
|
85
|
+
|
86
|
+
From Avro
|
87
|
+
|
88
|
+
```ruby
|
89
|
+
Polars.read_avro("file.avro")
|
90
|
+
```
|
91
|
+
|
63
92
|
From a hash
|
64
93
|
|
65
94
|
```ruby
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,77 @@
|
|
1
|
+
module Polars
|
2
|
+
# Namespace for binary related expressions.
|
3
|
+
class BinaryExpr
|
4
|
+
# @private
|
5
|
+
attr_accessor :_rbexpr
|
6
|
+
|
7
|
+
# @private
|
8
|
+
def initialize(expr)
|
9
|
+
self._rbexpr = expr._rbexpr
|
10
|
+
end
|
11
|
+
|
12
|
+
# Check if binaries in Series contain a binary substring.
|
13
|
+
#
|
14
|
+
# @param lit [String]
|
15
|
+
# The binary substring to look for
|
16
|
+
#
|
17
|
+
# @return [Expr]
|
18
|
+
def contains(lit)
|
19
|
+
Utils.wrap_expr(_rbexpr.binary_contains(lit))
|
20
|
+
end
|
21
|
+
|
22
|
+
# Check if string values end with a binary substring.
|
23
|
+
#
|
24
|
+
# @param sub [String]
|
25
|
+
# Suffix substring.
|
26
|
+
#
|
27
|
+
# @return [Expr]
|
28
|
+
def ends_with(sub)
|
29
|
+
Utils.wrap_expr(_rbexpr.binary_ends_with(sub))
|
30
|
+
end
|
31
|
+
|
32
|
+
# Check if values start with a binary substring.
|
33
|
+
#
|
34
|
+
# @param sub [String]
|
35
|
+
# Prefix substring.
|
36
|
+
#
|
37
|
+
# @return [Expr]
|
38
|
+
def starts_with(sub)
|
39
|
+
Utils.wrap_expr(_rbexpr.binary_starts_with(sub))
|
40
|
+
end
|
41
|
+
|
42
|
+
# Decode a value using the provided encoding.
|
43
|
+
#
|
44
|
+
# @param encoding ["hex", "base64"]
|
45
|
+
# The encoding to use.
|
46
|
+
# @param strict [Boolean]
|
47
|
+
# Raise an error if the underlying value cannot be decoded,
|
48
|
+
# otherwise mask out with a null value.
|
49
|
+
#
|
50
|
+
# @return [Expr]
|
51
|
+
def decode(encoding, strict: true)
|
52
|
+
if encoding == "hex"
|
53
|
+
Utils.wrap_expr(_rbexpr.binary_hex_decode(strict))
|
54
|
+
elsif encoding == "base64"
|
55
|
+
Utils.wrap_expr(_rbexpr.binary_base64_decode(strict))
|
56
|
+
else
|
57
|
+
raise ArgumentError, "encoding must be one of {{'hex', 'base64'}}, got #{encoding}"
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
# Encode a value using the provided encoding.
|
62
|
+
#
|
63
|
+
# @param encoding ["hex", "base64"]
|
64
|
+
# The encoding to use.
|
65
|
+
#
|
66
|
+
# @return [Expr]
|
67
|
+
def encode(encoding)
|
68
|
+
if encoding == "hex"
|
69
|
+
Utils.wrap_expr(_rbexpr.binary_hex_encode)
|
70
|
+
elsif encoding == "base64"
|
71
|
+
Utils.wrap_expr(_rbexpr.binary_base64_encode)
|
72
|
+
else
|
73
|
+
raise ArgumentError, "encoding must be one of {{'hex', 'base64'}}, got #{encoding}"
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
module Polars
|
2
|
+
# Series.bin namespace.
|
3
|
+
class BinaryNameSpace
|
4
|
+
include ExprDispatch
|
5
|
+
|
6
|
+
self._accessor = "bin"
|
7
|
+
|
8
|
+
# @private
|
9
|
+
def initialize(series)
|
10
|
+
self._s = series._s
|
11
|
+
end
|
12
|
+
|
13
|
+
# Check if binaries in Series contain a binary substring.
|
14
|
+
#
|
15
|
+
# @param lit [String]
|
16
|
+
# The binary substring to look for
|
17
|
+
#
|
18
|
+
# @return [Series]
|
19
|
+
def contains(lit)
|
20
|
+
super
|
21
|
+
end
|
22
|
+
|
23
|
+
# Check if string values end with a binary substring.
|
24
|
+
#
|
25
|
+
# @param sub [String]
|
26
|
+
# Suffix substring.
|
27
|
+
#
|
28
|
+
# @return [Series]
|
29
|
+
def ends_with(sub)
|
30
|
+
super
|
31
|
+
end
|
32
|
+
|
33
|
+
# Check if values start with a binary substring.
|
34
|
+
#
|
35
|
+
# @param sub [String]
|
36
|
+
# Prefix substring.
|
37
|
+
#
|
38
|
+
# @return [Series]
|
39
|
+
def starts_with(sub)
|
40
|
+
super
|
41
|
+
end
|
42
|
+
|
43
|
+
# Decode a value using the provided encoding.
|
44
|
+
#
|
45
|
+
# @param encoding ["hex", "base64"]
|
46
|
+
# The encoding to use.
|
47
|
+
# @param strict [Boolean]
|
48
|
+
# Raise an error if the underlying value cannot be decoded,
|
49
|
+
# otherwise mask out with a null value.
|
50
|
+
#
|
51
|
+
# @return [Series]
|
52
|
+
def decode(encoding, strict: true)
|
53
|
+
super
|
54
|
+
end
|
55
|
+
|
56
|
+
# Encode a value using the provided encoding.
|
57
|
+
#
|
58
|
+
# @param encoding ["hex", "base64"]
|
59
|
+
# The encoding to use.
|
60
|
+
#
|
61
|
+
# @return [Series]
|
62
|
+
def encode(encoding)
|
63
|
+
super
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
data/lib/polars/data_frame.rb
CHANGED
@@ -97,7 +97,7 @@ module Polars
|
|
97
97
|
eol_char: "\n"
|
98
98
|
)
|
99
99
|
if Utils.pathlike?(file)
|
100
|
-
path = Utils.
|
100
|
+
path = Utils.normalise_filepath(file)
|
101
101
|
else
|
102
102
|
path = nil
|
103
103
|
# if defined?(StringIO) && file.is_a?(StringIO)
|
@@ -196,32 +196,56 @@ module Polars
|
|
196
196
|
|
197
197
|
# @private
|
198
198
|
def self._read_parquet(
|
199
|
-
|
199
|
+
source,
|
200
200
|
columns: nil,
|
201
201
|
n_rows: nil,
|
202
202
|
parallel: "auto",
|
203
203
|
row_count_name: nil,
|
204
204
|
row_count_offset: 0,
|
205
|
-
low_memory: false
|
205
|
+
low_memory: false,
|
206
|
+
use_statistics: true,
|
207
|
+
rechunk: true
|
206
208
|
)
|
207
|
-
if Utils.pathlike?(
|
208
|
-
|
209
|
+
if Utils.pathlike?(source)
|
210
|
+
source = Utils.normalise_filepath(source)
|
211
|
+
end
|
212
|
+
if columns.is_a?(String)
|
213
|
+
columns = [columns]
|
209
214
|
end
|
210
215
|
|
211
|
-
if
|
212
|
-
|
216
|
+
if source.is_a?(String) && source.include?("*") && Utils.local_file?(source)
|
217
|
+
scan =
|
218
|
+
Polars.scan_parquet(
|
219
|
+
source,
|
220
|
+
n_rows: n_rows,
|
221
|
+
rechunk: true,
|
222
|
+
parallel: parallel,
|
223
|
+
row_count_name: row_count_name,
|
224
|
+
row_count_offset: row_count_offset,
|
225
|
+
low_memory: low_memory
|
226
|
+
)
|
227
|
+
|
228
|
+
if columns.nil?
|
229
|
+
return self._from_rbdf(scan.collect._df)
|
230
|
+
elsif Utils.is_str_sequence(columns, allow_str: false)
|
231
|
+
return self._from_rbdf(scan.select(columns).collect._df)
|
232
|
+
else
|
233
|
+
raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: Array[String]"
|
234
|
+
end
|
213
235
|
end
|
214
236
|
|
215
237
|
projection, columns = Utils.handle_projection_columns(columns)
|
216
238
|
_from_rbdf(
|
217
239
|
RbDataFrame.read_parquet(
|
218
|
-
|
240
|
+
source,
|
219
241
|
columns,
|
220
242
|
projection,
|
221
243
|
n_rows,
|
222
244
|
parallel,
|
223
245
|
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
224
|
-
low_memory
|
246
|
+
low_memory,
|
247
|
+
use_statistics,
|
248
|
+
rechunk
|
225
249
|
)
|
226
250
|
)
|
227
251
|
end
|
@@ -229,7 +253,7 @@ module Polars
|
|
229
253
|
# @private
|
230
254
|
def self._read_avro(file, columns: nil, n_rows: nil)
|
231
255
|
if Utils.pathlike?(file)
|
232
|
-
file = Utils.
|
256
|
+
file = Utils.normalise_filepath(file)
|
233
257
|
end
|
234
258
|
projection, columns = Utils.handle_projection_columns(columns)
|
235
259
|
_from_rbdf(RbDataFrame.read_avro(file, columns, projection, n_rows))
|
@@ -246,7 +270,7 @@ module Polars
|
|
246
270
|
memory_map: true
|
247
271
|
)
|
248
272
|
if Utils.pathlike?(file)
|
249
|
-
file = Utils.
|
273
|
+
file = Utils.normalise_filepath(file)
|
250
274
|
end
|
251
275
|
if columns.is_a?(String)
|
252
276
|
columns = [columns]
|
@@ -272,7 +296,7 @@ module Polars
|
|
272
296
|
# @private
|
273
297
|
def self._read_json(file)
|
274
298
|
if Utils.pathlike?(file)
|
275
|
-
file = Utils.
|
299
|
+
file = Utils.normalise_filepath(file)
|
276
300
|
end
|
277
301
|
|
278
302
|
_from_rbdf(RbDataFrame.read_json(file))
|
@@ -281,7 +305,7 @@ module Polars
|
|
281
305
|
# @private
|
282
306
|
def self._read_ndjson(file)
|
283
307
|
if Utils.pathlike?(file)
|
284
|
-
file = Utils.
|
308
|
+
file = Utils.normalise_filepath(file)
|
285
309
|
end
|
286
310
|
|
287
311
|
_from_rbdf(RbDataFrame.read_ndjson(file))
|
@@ -774,7 +798,7 @@ module Polars
|
|
774
798
|
row_oriented: false
|
775
799
|
)
|
776
800
|
if Utils.pathlike?(file)
|
777
|
-
file = Utils.
|
801
|
+
file = Utils.normalise_filepath(file)
|
778
802
|
end
|
779
803
|
|
780
804
|
_df.write_json(file, pretty, row_oriented)
|
@@ -789,7 +813,7 @@ module Polars
|
|
789
813
|
# @return [nil]
|
790
814
|
def write_ndjson(file)
|
791
815
|
if Utils.pathlike?(file)
|
792
|
-
file = Utils.
|
816
|
+
file = Utils.normalise_filepath(file)
|
793
817
|
end
|
794
818
|
|
795
819
|
_df.write_ndjson(file)
|
@@ -879,7 +903,7 @@ module Polars
|
|
879
903
|
end
|
880
904
|
|
881
905
|
if Utils.pathlike?(file)
|
882
|
-
file = Utils.
|
906
|
+
file = Utils.normalise_filepath(file)
|
883
907
|
end
|
884
908
|
|
885
909
|
_df.write_csv(
|
@@ -917,7 +941,7 @@ module Polars
|
|
917
941
|
compression = "uncompressed"
|
918
942
|
end
|
919
943
|
if Utils.pathlike?(file)
|
920
|
-
file = Utils.
|
944
|
+
file = Utils.normalise_filepath(file)
|
921
945
|
end
|
922
946
|
|
923
947
|
_df.write_avro(file, compression)
|
@@ -936,7 +960,7 @@ module Polars
|
|
936
960
|
compression = "uncompressed"
|
937
961
|
end
|
938
962
|
if Utils.pathlike?(file)
|
939
|
-
file = Utils.
|
963
|
+
file = Utils.normalise_filepath(file)
|
940
964
|
end
|
941
965
|
|
942
966
|
_df.write_ipc(file, compression)
|
@@ -978,7 +1002,7 @@ module Polars
|
|
978
1002
|
compression = "uncompressed"
|
979
1003
|
end
|
980
1004
|
if Utils.pathlike?(file)
|
981
|
-
file = Utils.
|
1005
|
+
file = Utils.normalise_filepath(file)
|
982
1006
|
end
|
983
1007
|
|
984
1008
|
_df.write_parquet(
|
@@ -3042,24 +3066,28 @@ module Polars
|
|
3042
3066
|
if aggregate_fn.is_a?(String)
|
3043
3067
|
case aggregate_fn
|
3044
3068
|
when "first"
|
3045
|
-
|
3069
|
+
aggregate_expr = Polars.element.first._rbexpr
|
3046
3070
|
when "sum"
|
3047
|
-
|
3071
|
+
aggregate_expr = Polars.element.sum._rbexpr
|
3048
3072
|
when "max"
|
3049
|
-
|
3073
|
+
aggregate_expr = Polars.element.max._rbexpr
|
3050
3074
|
when "min"
|
3051
|
-
|
3075
|
+
aggregate_expr = Polars.element.min._rbexpr
|
3052
3076
|
when "mean"
|
3053
|
-
|
3077
|
+
aggregate_expr = Polars.element.mean._rbexpr
|
3054
3078
|
when "median"
|
3055
|
-
|
3079
|
+
aggregate_expr = Polars.element.median._rbexpr
|
3056
3080
|
when "last"
|
3057
|
-
|
3081
|
+
aggregate_expr = Polars.element.last._rbexpr
|
3058
3082
|
when "count"
|
3059
|
-
|
3083
|
+
aggregate_expr = Polars.count._rbexpr
|
3060
3084
|
else
|
3061
3085
|
raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
|
3062
3086
|
end
|
3087
|
+
elsif aggregate_fn.nil?
|
3088
|
+
aggregate_expr = nil
|
3089
|
+
else
|
3090
|
+
aggregate_expr = aggregate_function._rbexpr
|
3063
3091
|
end
|
3064
3092
|
|
3065
3093
|
_from_rbdf(
|
@@ -3067,9 +3095,9 @@ module Polars
|
|
3067
3095
|
values,
|
3068
3096
|
index,
|
3069
3097
|
columns,
|
3070
|
-
aggregate_fn._rbexpr,
|
3071
3098
|
maintain_order,
|
3072
3099
|
sort_columns,
|
3100
|
+
aggregate_expr,
|
3073
3101
|
separator
|
3074
3102
|
)
|
3075
3103
|
)
|
@@ -3174,7 +3202,7 @@ module Polars
|
|
3174
3202
|
# # │ B ┆ 1 │
|
3175
3203
|
# # │ C ┆ 2 │
|
3176
3204
|
# # │ D ┆ 3 │
|
3177
|
-
# # │
|
3205
|
+
# # │ … ┆ … │
|
3178
3206
|
# # │ F ┆ 5 │
|
3179
3207
|
# # │ G ┆ 6 │
|
3180
3208
|
# # │ H ┆ 7 │
|
@@ -4053,15 +4081,12 @@ module Polars
|
|
4053
4081
|
# # │ 5 ┆ 3.0 ┆ true │
|
4054
4082
|
# # └─────┴─────┴───────┘
|
4055
4083
|
def unique(maintain_order: true, subset: nil, keep: "first")
|
4056
|
-
|
4057
|
-
|
4058
|
-
subset
|
4059
|
-
|
4060
|
-
|
4061
|
-
|
4062
|
-
end
|
4063
|
-
|
4064
|
-
_from_rbdf(_df.unique(maintain_order, subset, keep))
|
4084
|
+
self._from_rbdf(
|
4085
|
+
lazy
|
4086
|
+
.unique(maintain_order: maintain_order, subset: subset, keep: keep)
|
4087
|
+
.collect(no_optimization: true)
|
4088
|
+
._df
|
4089
|
+
)
|
4065
4090
|
end
|
4066
4091
|
|
4067
4092
|
# Return the number of unique rows, or the number of unique row-subsets.
|
@@ -1130,7 +1130,7 @@ module Polars
|
|
1130
1130
|
# ]
|
1131
1131
|
# )
|
1132
1132
|
# # =>
|
1133
|
-
# # shape: (
|
1133
|
+
# # shape: (1_001, 2)
|
1134
1134
|
# # ┌─────────────────────────┬───────────────────┐
|
1135
1135
|
# # │ date ┆ milliseconds_diff │
|
1136
1136
|
# # │ --- ┆ --- │
|
@@ -1140,7 +1140,7 @@ module Polars
|
|
1140
1140
|
# # │ 2020-01-01 00:00:00.001 ┆ 1 │
|
1141
1141
|
# # │ 2020-01-01 00:00:00.002 ┆ 1 │
|
1142
1142
|
# # │ 2020-01-01 00:00:00.003 ┆ 1 │
|
1143
|
-
# # │
|
1143
|
+
# # │ … ┆ … │
|
1144
1144
|
# # │ 2020-01-01 00:00:00.997 ┆ 1 │
|
1145
1145
|
# # │ 2020-01-01 00:00:00.998 ┆ 1 │
|
1146
1146
|
# # │ 2020-01-01 00:00:00.999 ┆ 1 │
|
@@ -1169,7 +1169,7 @@ module Polars
|
|
1169
1169
|
# ]
|
1170
1170
|
# )
|
1171
1171
|
# # =>
|
1172
|
-
# # shape: (
|
1172
|
+
# # shape: (1_001, 2)
|
1173
1173
|
# # ┌─────────────────────────┬───────────────────┐
|
1174
1174
|
# # │ date ┆ microseconds_diff │
|
1175
1175
|
# # │ --- ┆ --- │
|
@@ -1179,7 +1179,7 @@ module Polars
|
|
1179
1179
|
# # │ 2020-01-01 00:00:00.001 ┆ 1000 │
|
1180
1180
|
# # │ 2020-01-01 00:00:00.002 ┆ 1000 │
|
1181
1181
|
# # │ 2020-01-01 00:00:00.003 ┆ 1000 │
|
1182
|
-
# # │
|
1182
|
+
# # │ … ┆ … │
|
1183
1183
|
# # │ 2020-01-01 00:00:00.997 ┆ 1000 │
|
1184
1184
|
# # │ 2020-01-01 00:00:00.998 ┆ 1000 │
|
1185
1185
|
# # │ 2020-01-01 00:00:00.999 ┆ 1000 │
|
@@ -1208,7 +1208,7 @@ module Polars
|
|
1208
1208
|
# ]
|
1209
1209
|
# )
|
1210
1210
|
# # =>
|
1211
|
-
# # shape: (
|
1211
|
+
# # shape: (1_001, 2)
|
1212
1212
|
# # ┌─────────────────────────┬──────────────────┐
|
1213
1213
|
# # │ date ┆ nanoseconds_diff │
|
1214
1214
|
# # │ --- ┆ --- │
|
@@ -1218,7 +1218,7 @@ module Polars
|
|
1218
1218
|
# # │ 2020-01-01 00:00:00.001 ┆ 1000000 │
|
1219
1219
|
# # │ 2020-01-01 00:00:00.002 ┆ 1000000 │
|
1220
1220
|
# # │ 2020-01-01 00:00:00.003 ┆ 1000000 │
|
1221
|
-
# # │
|
1221
|
+
# # │ … ┆ … │
|
1222
1222
|
# # │ 2020-01-01 00:00:00.997 ┆ 1000000 │
|
1223
1223
|
# # │ 2020-01-01 00:00:00.998 ┆ 1000000 │
|
1224
1224
|
# # │ 2020-01-01 00:00:00.999 ┆ 1000000 │
|
data/lib/polars/expr.rb
CHANGED
@@ -2194,7 +2194,7 @@ module Polars
|
|
2194
2194
|
# # │ 4 │
|
2195
2195
|
# # │ 6 │
|
2196
2196
|
# # │ 6 │
|
2197
|
-
# # │
|
2197
|
+
# # │ … │
|
2198
2198
|
# # │ 6 │
|
2199
2199
|
# # │ 6 │
|
2200
2200
|
# # │ 6 │
|
@@ -2571,7 +2571,7 @@ module Polars
|
|
2571
2571
|
# # │ e │
|
2572
2572
|
# # │ l │
|
2573
2573
|
# # │ l │
|
2574
|
-
# # │
|
2574
|
+
# # │ … │
|
2575
2575
|
# # │ o │
|
2576
2576
|
# # │ r │
|
2577
2577
|
# # │ l │
|
@@ -4962,6 +4962,13 @@ module Polars
|
|
4962
4962
|
ListExpr.new(self)
|
4963
4963
|
end
|
4964
4964
|
|
4965
|
+
# Create an object namespace of all binary related methods.
|
4966
|
+
#
|
4967
|
+
# @return [BinaryExpr]
|
4968
|
+
def bin
|
4969
|
+
BinaryExpr.new(self)
|
4970
|
+
end
|
4971
|
+
|
4965
4972
|
# Create an object namespace of all categorical related methods.
|
4966
4973
|
#
|
4967
4974
|
# @return [CatExpr]
|