polars-df 0.3.1-x86_64-linux → 0.5.0-x86_64-linux
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -1
- data/Cargo.lock +486 -380
- data/Cargo.toml +0 -2
- data/LICENSE-THIRD-PARTY.txt +7353 -8473
- data/README.md +31 -2
- data/lib/polars/3.0/polars.so +0 -0
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/convert.rb +2 -2
- data/lib/polars/data_frame.rb +263 -87
- data/lib/polars/data_types.rb +6 -4
- data/lib/polars/date_time_expr.rb +148 -8
- data/lib/polars/expr.rb +78 -11
- data/lib/polars/io.rb +73 -62
- data/lib/polars/lazy_frame.rb +107 -10
- data/lib/polars/lazy_functions.rb +7 -3
- data/lib/polars/list_expr.rb +70 -21
- data/lib/polars/list_name_space.rb +2 -2
- data/lib/polars/series.rb +190 -74
- data/lib/polars/string_expr.rb +150 -44
- data/lib/polars/string_name_space.rb +4 -4
- data/lib/polars/struct_name_space.rb +32 -0
- data/lib/polars/utils.rb +51 -9
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +4 -2
- metadata +4 -2
data/lib/polars/string_expr.rb
CHANGED
@@ -11,8 +11,8 @@ module Polars
|
|
11
11
|
|
12
12
|
# Parse a Utf8 expression to a Date/Datetime/Time type.
|
13
13
|
#
|
14
|
-
# @param
|
15
|
-
#
|
14
|
+
# @param dtype [Object]
|
15
|
+
# The data type to convert into. Can be either Date, Datetime, or Time.
|
16
16
|
# @param fmt [String]
|
17
17
|
# Format to use, refer to the
|
18
18
|
# [chrono strftime documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
|
@@ -33,57 +33,56 @@ module Polars
|
|
33
33
|
# the format string, if given, eg: "%F %T%.3f" => Datetime("ms"). If
|
34
34
|
# no fractional second component is found then the default is "us".
|
35
35
|
#
|
36
|
-
# @example
|
36
|
+
# @example Dealing with a consistent format:
|
37
|
+
# s = Polars::Series.new(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
|
38
|
+
# s.str.strptime(Polars::Datetime, "%Y-%m-%d %H:%M%#z")
|
39
|
+
# # =>
|
40
|
+
# # shape: (2,)
|
41
|
+
# # Series: '' [datetime[μs, +00:00]]
|
42
|
+
# # [
|
43
|
+
# # 2020-01-01 01:00:00 +00:00
|
44
|
+
# # 2020-01-01 02:00:00 +00:00
|
45
|
+
# # ]
|
46
|
+
#
|
47
|
+
# @example Dealing with different formats.
|
37
48
|
# s = Polars::Series.new(
|
38
49
|
# "date",
|
39
50
|
# [
|
40
51
|
# "2021-04-22",
|
41
52
|
# "2022-01-04 00:00:00",
|
42
53
|
# "01/31/22",
|
43
|
-
# "Sun Jul 8 00:34:60 2001"
|
54
|
+
# "Sun Jul 8 00:34:60 2001",
|
44
55
|
# ]
|
45
56
|
# )
|
46
|
-
# s.to_frame.
|
47
|
-
# Polars.
|
48
|
-
# .str.strptime(
|
49
|
-
# .
|
50
|
-
#
|
51
|
-
# )
|
52
|
-
#
|
53
|
-
#
|
54
|
-
# )
|
57
|
+
# s.to_frame.select(
|
58
|
+
# Polars.coalesce(
|
59
|
+
# Polars.col("date").str.strptime(Polars::Date, "%F", strict: false),
|
60
|
+
# Polars.col("date").str.strptime(Polars::Date, "%F %T", strict: false),
|
61
|
+
# Polars.col("date").str.strptime(Polars::Date, "%D", strict: false),
|
62
|
+
# Polars.col("date").str.strptime(Polars::Date, "%c", strict: false)
|
63
|
+
# )
|
64
|
+
# ).to_series
|
55
65
|
# # =>
|
56
|
-
# # shape: (4,
|
57
|
-
# #
|
58
|
-
# #
|
59
|
-
# #
|
60
|
-
# #
|
61
|
-
# #
|
62
|
-
# #
|
63
|
-
# #
|
64
|
-
|
65
|
-
|
66
|
-
# # └────────────┘
|
67
|
-
def strptime(datatype, fmt = nil, strict: true, exact: true, cache: true, tz_aware: false, utc: false)
|
68
|
-
if !Utils.is_polars_dtype(datatype)
|
69
|
-
raise ArgumentError, "expected: {DataType} got: #{datatype}"
|
70
|
-
end
|
71
|
-
|
72
|
-
if datatype == :date
|
66
|
+
# # shape: (4,)
|
67
|
+
# # Series: 'date' [date]
|
68
|
+
# # [
|
69
|
+
# # 2021-04-22
|
70
|
+
# # 2022-01-04
|
71
|
+
# # 2022-01-31
|
72
|
+
# # 2001-07-08
|
73
|
+
# # ]
|
74
|
+
def strptime(dtype, fmt = nil, strict: true, exact: true, cache: true, tz_aware: false, utc: false)
|
75
|
+
if dtype == Date
|
73
76
|
Utils.wrap_expr(_rbexpr.str_parse_date(fmt, strict, exact, cache))
|
74
|
-
elsif
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
else
|
81
|
-
dtcol.dt.cast_time_unit(tu)
|
82
|
-
end
|
83
|
-
elsif datatype == :time
|
77
|
+
elsif dtype == Datetime || dtype.is_a?(Datetime)
|
78
|
+
dtype = Datetime.new if dtype == Datetime
|
79
|
+
time_unit = dtype.time_unit
|
80
|
+
time_zone = dtype.time_zone
|
81
|
+
Utils.wrap_expr(_rbexpr.str_parse_datetime(fmt, time_unit, time_zone, strict, exact, cache, tz_aware, utc))
|
82
|
+
elsif dtype == Time
|
84
83
|
Utils.wrap_expr(_rbexpr.str_parse_time(fmt, strict, exact, cache))
|
85
84
|
else
|
86
|
-
raise ArgumentError, "dtype should be of type
|
85
|
+
raise ArgumentError, "dtype should be of type {Date, Datetime, Time}"
|
87
86
|
end
|
88
87
|
end
|
89
88
|
|
@@ -332,7 +331,7 @@ module Polars
|
|
332
331
|
# # │ -0001 │
|
333
332
|
# # │ 00000 │
|
334
333
|
# # │ 00001 │
|
335
|
-
# # │
|
334
|
+
# # │ … │
|
336
335
|
# # │ 10000 │
|
337
336
|
# # │ 100000 │
|
338
337
|
# # │ 1000000 │
|
@@ -521,6 +520,40 @@ module Polars
|
|
521
520
|
Utils.wrap_expr(_rbexpr.str_starts_with(sub))
|
522
521
|
end
|
523
522
|
|
523
|
+
# Parse string values as JSON.
|
524
|
+
#
|
525
|
+
# Throws an error if an invalid JSON string is encountered.
|
526
|
+
#
|
527
|
+
# @param dtype [Object]
|
528
|
+
# The dtype to cast the extracted value to. If nil, the dtype will be
|
529
|
+
# inferred from the JSON value.
|
530
|
+
#
|
531
|
+
# @return [Expr]
|
532
|
+
#
|
533
|
+
# @example
|
534
|
+
# df = Polars::DataFrame.new(
|
535
|
+
# {"json" => ['{"a":1, "b": true}', nil, '{"a":2, "b": false}']}
|
536
|
+
# )
|
537
|
+
# dtype = Polars::Struct.new([Polars::Field.new("a", Polars::Int64), Polars::Field.new("b", Polars::Boolean)])
|
538
|
+
# df.select(Polars.col("json").str.json_extract(dtype))
|
539
|
+
# # =>
|
540
|
+
# # shape: (3, 1)
|
541
|
+
# # ┌─────────────┐
|
542
|
+
# # │ json │
|
543
|
+
# # │ --- │
|
544
|
+
# # │ struct[2] │
|
545
|
+
# # ╞═════════════╡
|
546
|
+
# # │ {1,true} │
|
547
|
+
# # │ {null,null} │
|
548
|
+
# # │ {2,false} │
|
549
|
+
# # └─────────────┘
|
550
|
+
def json_extract(dtype = nil)
|
551
|
+
if !dtype.nil?
|
552
|
+
dtype = Utils.rb_type_to_dtype(dtype)
|
553
|
+
end
|
554
|
+
Utils.wrap_expr(_rbexpr.str_json_extract(dtype))
|
555
|
+
end
|
556
|
+
|
524
557
|
# Extract the first match of json string with provided JSONPath expression.
|
525
558
|
#
|
526
559
|
# Throws an error if an invalid JSON string is encountered.
|
@@ -846,10 +879,10 @@ module Polars
|
|
846
879
|
# # │ 1 ┆ 123ABC │
|
847
880
|
# # │ 2 ┆ abc456 │
|
848
881
|
# # └─────┴────────┘
|
849
|
-
def replace(pattern, value, literal: false)
|
882
|
+
def replace(pattern, value, literal: false, n: 1)
|
850
883
|
pattern = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
|
851
884
|
value = Utils.expr_to_lit_or_expr(value, str_to_lit: true)
|
852
|
-
Utils.wrap_expr(_rbexpr.
|
885
|
+
Utils.wrap_expr(_rbexpr.str_replace_n(pattern._rbexpr, value._rbexpr, literal, n))
|
853
886
|
end
|
854
887
|
|
855
888
|
# Replace all matching regex/literal substrings with a new string value.
|
@@ -912,5 +945,78 @@ module Polars
|
|
912
945
|
def slice(offset, length = nil)
|
913
946
|
Utils.wrap_expr(_rbexpr.str_slice(offset, length))
|
914
947
|
end
|
948
|
+
|
949
|
+
# Returns a column with a separate row for every string character.
|
950
|
+
#
|
951
|
+
# @return [Expr]
|
952
|
+
#
|
953
|
+
# @example
|
954
|
+
# df = Polars::DataFrame.new({"a": ["foo", "bar"]})
|
955
|
+
# df.select(Polars.col("a").str.explode)
|
956
|
+
# # =>
|
957
|
+
# # shape: (6, 1)
|
958
|
+
# # ┌─────┐
|
959
|
+
# # │ a │
|
960
|
+
# # │ --- │
|
961
|
+
# # │ str │
|
962
|
+
# # ╞═════╡
|
963
|
+
# # │ f │
|
964
|
+
# # │ o │
|
965
|
+
# # │ o │
|
966
|
+
# # │ b │
|
967
|
+
# # │ a │
|
968
|
+
# # │ r │
|
969
|
+
# # └─────┘
|
970
|
+
def explode
|
971
|
+
Utils.wrap_expr(_rbexpr.explode)
|
972
|
+
end
|
973
|
+
|
974
|
+
# Parse integers with base radix from strings.
|
975
|
+
#
|
976
|
+
# By default base 2. When strict is false, parse errors and overflows become null.
|
977
|
+
#
|
978
|
+
# @param radix [Integer]
|
979
|
+
# Positive integer which is the base of the string we are parsing.
|
980
|
+
# Default: 2.
|
981
|
+
# @param strict [Boolean]
|
982
|
+
# If true (the default), raise any parse error or overflow as a ComputeError.
|
983
|
+
# If false, silently convert such values to null.
|
984
|
+
#
|
985
|
+
# @return [Expr]
|
986
|
+
#
|
987
|
+
# @example
|
988
|
+
# df = Polars::DataFrame.new({"bin" => ["110", "101", "010", "invalid"]})
|
989
|
+
# df.select(Polars.col("bin").str.parse_int(2, strict: false))
|
990
|
+
# # =>
|
991
|
+
# # shape: (4, 1)
|
992
|
+
# # ┌──────┐
|
993
|
+
# # │ bin │
|
994
|
+
# # │ --- │
|
995
|
+
# # │ i32 │
|
996
|
+
# # ╞══════╡
|
997
|
+
# # │ 6 │
|
998
|
+
# # │ 5 │
|
999
|
+
# # │ 2 │
|
1000
|
+
# # │ null │
|
1001
|
+
# # └──────┘
|
1002
|
+
#
|
1003
|
+
# @example
|
1004
|
+
# df = Polars::DataFrame.new({"hex" => ["fa1e", "ff00", "cafe", nil]})
|
1005
|
+
# df.select(Polars.col("hex").str.parse_int(16, strict: true))
|
1006
|
+
# # =>
|
1007
|
+
# # shape: (4, 1)
|
1008
|
+
# # ┌───────┐
|
1009
|
+
# # │ hex │
|
1010
|
+
# # │ --- │
|
1011
|
+
# # │ i32 │
|
1012
|
+
# # ╞═══════╡
|
1013
|
+
# # │ 64030 │
|
1014
|
+
# # │ 65280 │
|
1015
|
+
# # │ 51966 │
|
1016
|
+
# # │ null │
|
1017
|
+
# # └───────┘
|
1018
|
+
def parse_int(radix = 2, strict: true)
|
1019
|
+
Utils.wrap_expr(_rbexpr.str_parse_int(radix, strict))
|
1020
|
+
end
|
915
1021
|
end
|
916
1022
|
end
|
@@ -38,12 +38,12 @@ module Polars
|
|
38
38
|
# )
|
39
39
|
# s.to_frame.with_column(
|
40
40
|
# Polars.col("date")
|
41
|
-
# .str.strptime(
|
41
|
+
# .str.strptime(Polars::Date, "%F", strict: false)
|
42
42
|
# .fill_null(
|
43
|
-
# Polars.col("date").str.strptime(
|
43
|
+
# Polars.col("date").str.strptime(Polars::Date, "%F %T", strict: false)
|
44
44
|
# )
|
45
|
-
# .fill_null(Polars.col("date").str.strptime(
|
46
|
-
# .fill_null(Polars.col("date").str.strptime(
|
45
|
+
# .fill_null(Polars.col("date").str.strptime(Polars::Date, "%D", strict: false))
|
46
|
+
# .fill_null(Polars.col("date").str.strptime(Polars::Date, "%c", strict: false))
|
47
47
|
# )
|
48
48
|
# # =>
|
49
49
|
# # shape: (4, 1)
|
@@ -60,5 +60,37 @@ module Polars
|
|
60
60
|
def rename_fields(names)
|
61
61
|
super
|
62
62
|
end
|
63
|
+
|
64
|
+
# Get the struct definition as a name/dtype schema hash.
|
65
|
+
#
|
66
|
+
# @return [Object]
|
67
|
+
def schema
|
68
|
+
if _s.nil?
|
69
|
+
{}
|
70
|
+
else
|
71
|
+
_s.dtype.to_schema
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
# Convert this struct Series to a DataFrame with a separate column for each field.
|
76
|
+
#
|
77
|
+
# @return [DataFrame]
|
78
|
+
#
|
79
|
+
# @example
|
80
|
+
# s = Polars::Series.new([{"a" => 1, "b" => 2}, {"a" => 3, "b" => 4}])
|
81
|
+
# s.struct.unnest
|
82
|
+
# # =>
|
83
|
+
# # shape: (2, 2)
|
84
|
+
# # ┌─────┬─────┐
|
85
|
+
# # │ a ┆ b │
|
86
|
+
# # │ --- ┆ --- │
|
87
|
+
# # │ i64 ┆ i64 │
|
88
|
+
# # ╞═════╪═════╡
|
89
|
+
# # │ 1 ┆ 2 │
|
90
|
+
# # │ 3 ┆ 4 │
|
91
|
+
# # └─────┴─────┘
|
92
|
+
def unnest
|
93
|
+
Utils.wrap_df(_s.struct_unnest)
|
94
|
+
end
|
63
95
|
end
|
64
96
|
end
|
data/lib/polars/utils.rb
CHANGED
@@ -23,24 +23,42 @@ module Polars
|
|
23
23
|
Polars.col(name)
|
24
24
|
end
|
25
25
|
|
26
|
+
def self.arrlen(obj)
|
27
|
+
if obj.is_a?(Range)
|
28
|
+
# size only works for numeric ranges
|
29
|
+
obj.to_a.length
|
30
|
+
elsif obj.is_a?(String)
|
31
|
+
nil
|
32
|
+
else
|
33
|
+
obj.length
|
34
|
+
end
|
35
|
+
rescue
|
36
|
+
nil
|
37
|
+
end
|
38
|
+
|
26
39
|
def self._timedelta_to_pl_duration(td)
|
27
40
|
td
|
28
41
|
end
|
29
42
|
|
30
43
|
def self._datetime_to_pl_timestamp(dt, tu)
|
31
44
|
if tu == "ns"
|
32
|
-
(dt.to_datetime.
|
45
|
+
(dt.to_datetime.to_time.to_f * 1e9).to_i
|
33
46
|
elsif tu == "us"
|
34
|
-
(dt.to_datetime.
|
47
|
+
(dt.to_datetime.to_time.to_f * 1e6).to_i
|
35
48
|
elsif tu == "ms"
|
36
|
-
(dt.to_datetime.
|
49
|
+
(dt.to_datetime.to_time.to_f * 1e3).to_i
|
37
50
|
elsif tu.nil?
|
38
|
-
(dt.to_datetime.
|
51
|
+
(dt.to_datetime.to_time.to_f * 1e6).to_i
|
39
52
|
else
|
40
53
|
raise ArgumentError, "tu must be one of {{'ns', 'us', 'ms'}}, got #{tu}"
|
41
54
|
end
|
42
55
|
end
|
43
56
|
|
57
|
+
def self._date_to_pl_date(d)
|
58
|
+
dt = d.to_datetime.to_time
|
59
|
+
dt.to_i / (3600 * 24)
|
60
|
+
end
|
61
|
+
|
44
62
|
def self._to_ruby_datetime(value, dtype, tu: "ns", tz: nil)
|
45
63
|
if dtype == :date || dtype == Date
|
46
64
|
# days to seconds
|
@@ -69,6 +87,18 @@ module Polars
|
|
69
87
|
end
|
70
88
|
end
|
71
89
|
|
90
|
+
def self._to_ruby_duration(value, tu = "ns")
|
91
|
+
if tu == "ns"
|
92
|
+
value / 1e9
|
93
|
+
elsif tu == "us"
|
94
|
+
value / 1e6
|
95
|
+
elsif tu == "ms"
|
96
|
+
value / 1e3
|
97
|
+
else
|
98
|
+
raise ArgumentError, "tu must be one of {{'ns', 'us', 'ms'}}, got #{tu}"
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
72
102
|
def self.selection_to_rbexpr_list(exprs)
|
73
103
|
if exprs.is_a?(String) || exprs.is_a?(Symbol) || exprs.is_a?(Expr) || exprs.is_a?(Series)
|
74
104
|
exprs = [exprs]
|
@@ -93,12 +123,19 @@ module Polars
|
|
93
123
|
Polars.lit(value)
|
94
124
|
end
|
95
125
|
|
96
|
-
def self.
|
97
|
-
File.expand_path(path)
|
126
|
+
def self.normalise_filepath(path, check_not_directory: true)
|
127
|
+
path = File.expand_path(path)
|
128
|
+
if check_not_directory && File.exist?(path) && Dir.exist?(path)
|
129
|
+
raise ArgumentError, "Expected a file path; #{path} is a directory"
|
130
|
+
end
|
131
|
+
path
|
98
132
|
end
|
99
133
|
|
100
134
|
# TODO fix
|
101
|
-
def self.is_polars_dtype(data_type)
|
135
|
+
def self.is_polars_dtype(data_type, include_unknown: false)
|
136
|
+
if data_type == Unknown
|
137
|
+
return include_unknown
|
138
|
+
end
|
102
139
|
data_type.is_a?(Symbol) || data_type.is_a?(String) || data_type.is_a?(DataType) || (data_type.is_a?(Class) && data_type < DataType)
|
103
140
|
end
|
104
141
|
|
@@ -109,7 +146,8 @@ module Polars
|
|
109
146
|
TrueClass => :bool,
|
110
147
|
FalseClass => :bool,
|
111
148
|
::Date => :date,
|
112
|
-
::DateTime => :datetime
|
149
|
+
::DateTime => :datetime,
|
150
|
+
::Time => :datetime
|
113
151
|
}
|
114
152
|
|
115
153
|
# TODO fix
|
@@ -174,7 +212,7 @@ module Polars
|
|
174
212
|
end
|
175
213
|
|
176
214
|
def self.bool?(value)
|
177
|
-
value
|
215
|
+
value.is_a?(TrueClass) || value.is_a?(FalseClass)
|
178
216
|
end
|
179
217
|
|
180
218
|
def self.strlike?(value)
|
@@ -216,5 +254,9 @@ module Polars
|
|
216
254
|
val.is_a?(Array) && _is_iterable_of(val, String)
|
217
255
|
end
|
218
256
|
end
|
257
|
+
|
258
|
+
def self.local_file?(file)
|
259
|
+
Dir.glob(file).any?
|
260
|
+
end
|
219
261
|
end
|
220
262
|
end
|
data/lib/polars/version.rb
CHANGED
data/lib/polars.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
# ext
|
2
2
|
begin
|
3
|
-
|
3
|
+
require "polars/#{RUBY_VERSION.to_f}/polars"
|
4
4
|
rescue LoadError
|
5
|
-
|
5
|
+
require "polars/polars"
|
6
6
|
end
|
7
7
|
|
8
8
|
# stdlib
|
@@ -12,6 +12,8 @@ require "stringio"
|
|
12
12
|
# modules
|
13
13
|
require_relative "polars/expr_dispatch"
|
14
14
|
require_relative "polars/batched_csv_reader"
|
15
|
+
require_relative "polars/binary_expr"
|
16
|
+
require_relative "polars/binary_name_space"
|
15
17
|
require_relative "polars/cat_expr"
|
16
18
|
require_relative "polars/cat_name_space"
|
17
19
|
require_relative "polars/convert"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: polars-df
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: x86_64-linux
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-05-16 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
14
|
email: andrew@ankane.org
|
@@ -29,6 +29,8 @@ files:
|
|
29
29
|
- lib/polars/3.1/polars.so
|
30
30
|
- lib/polars/3.2/polars.so
|
31
31
|
- lib/polars/batched_csv_reader.rb
|
32
|
+
- lib/polars/binary_expr.rb
|
33
|
+
- lib/polars/binary_name_space.rb
|
32
34
|
- lib/polars/cat_expr.rb
|
33
35
|
- lib/polars/cat_name_space.rb
|
34
36
|
- lib/polars/convert.rb
|