polars-df 0.10.0-x86_64-linux-musl
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +3 -0
- data/CHANGELOG.md +175 -0
- data/Cargo.lock +2536 -0
- data/Cargo.toml +6 -0
- data/LICENSE-THIRD-PARTY.txt +38726 -0
- data/LICENSE.txt +20 -0
- data/README.md +437 -0
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/array_expr.rb +537 -0
- data/lib/polars/array_name_space.rb +423 -0
- data/lib/polars/batched_csv_reader.rb +98 -0
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/cat_expr.rb +72 -0
- data/lib/polars/cat_name_space.rb +125 -0
- data/lib/polars/config.rb +530 -0
- data/lib/polars/convert.rb +93 -0
- data/lib/polars/data_frame.rb +5418 -0
- data/lib/polars/data_types.rb +466 -0
- data/lib/polars/date_time_expr.rb +1444 -0
- data/lib/polars/date_time_name_space.rb +1484 -0
- data/lib/polars/dynamic_group_by.rb +52 -0
- data/lib/polars/exceptions.rb +31 -0
- data/lib/polars/expr.rb +6105 -0
- data/lib/polars/expr_dispatch.rb +22 -0
- data/lib/polars/functions/aggregation/horizontal.rb +246 -0
- data/lib/polars/functions/aggregation/vertical.rb +282 -0
- data/lib/polars/functions/as_datatype.rb +248 -0
- data/lib/polars/functions/col.rb +47 -0
- data/lib/polars/functions/eager.rb +182 -0
- data/lib/polars/functions/lazy.rb +1280 -0
- data/lib/polars/functions/len.rb +49 -0
- data/lib/polars/functions/lit.rb +35 -0
- data/lib/polars/functions/random.rb +16 -0
- data/lib/polars/functions/range/date_range.rb +103 -0
- data/lib/polars/functions/range/int_range.rb +51 -0
- data/lib/polars/functions/repeat.rb +144 -0
- data/lib/polars/functions/whenthen.rb +96 -0
- data/lib/polars/functions.rb +57 -0
- data/lib/polars/group_by.rb +548 -0
- data/lib/polars/io.rb +890 -0
- data/lib/polars/lazy_frame.rb +2833 -0
- data/lib/polars/lazy_group_by.rb +84 -0
- data/lib/polars/list_expr.rb +791 -0
- data/lib/polars/list_name_space.rb +445 -0
- data/lib/polars/meta_expr.rb +222 -0
- data/lib/polars/name_expr.rb +198 -0
- data/lib/polars/plot.rb +109 -0
- data/lib/polars/rolling_group_by.rb +37 -0
- data/lib/polars/series.rb +4527 -0
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/sql_context.rb +194 -0
- data/lib/polars/string_cache.rb +75 -0
- data/lib/polars/string_expr.rb +1519 -0
- data/lib/polars/string_name_space.rb +810 -0
- data/lib/polars/struct_expr.rb +98 -0
- data/lib/polars/struct_name_space.rb +96 -0
- data/lib/polars/testing.rb +507 -0
- data/lib/polars/utils.rb +422 -0
- data/lib/polars/version.rb +4 -0
- data/lib/polars/whenthen.rb +83 -0
- data/lib/polars-df.rb +1 -0
- data/lib/polars.rb +72 -0
- metadata +125 -0
data/lib/polars/slice.rb
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
module Polars
  # Applies a Ruby range to a sliceable object, normalizing the range bounds
  # against the object's length and picking the cheapest operation available.
  #
  # The wrapped object is sent `length`, `is_empty`, `cleared`, `clone`,
  # `reverse` and `slice(offset, length)`; the multi-operation path additionally
  # uses `lazy` / `to_frame`.
  #
  # @private
  class Slice
    # @param obj [Object]
    #   The object to be sliced.
    def initialize(obj)
      @obj = obj
    end

    # Apply a slice operation, taking advantage of any potential fast paths.
    #
    # @param s [Range]
    #   Any object responding to `begin`, `end` and `exclude_end?`. Negative
    #   bounds count from the end; `nil` bounds are open.
    #
    # @return [Object]
    #   The result of `cleared`, `clone`, `reverse` or `slice` on the wrapped
    #   object, or the collected result of the lazy path.
    def apply(s)
      # normalize slice
      _slice_setup(s)

      # check for fast-paths / single-operation calls
      if @slice_length == 0
        @obj.cleared
      elsif @is_unbounded && [-1, 1].include?(@stride)
        @stride < 0 ? @obj.reverse : @obj.clone
      elsif @start >= 0 && @stop >= 0 && @stride == 1
        @obj.slice(@start, @slice_length)
      elsif @stride < 0 && @slice_length == 1
        @obj.slice(@stop + 1, 1)
      else
        # multi-operation calls; make lazy
        lazyobj = _lazify(@obj)
        sliced = @stride > 0 ? _slice_positive(lazyobj) : _slice_negative(lazyobj)
        _as_original(sliced, @obj)
      end
    end

    private

    # Return lazy variant back to its original type.
    def _as_original(lazy, original)
      frame = lazy.collect
      original.is_a?(DataFrame) ? frame : frame.to_series
    end

    # Make lazy to ensure efficient/consistent handling.
    def _lazify(obj)
      obj.is_a?(DataFrame) ? obj.lazy : obj.to_frame.lazy
    end

    # Logic for slices with positive stride.
    def _slice_positive(obj)
      # note: at this point stride is guaranteed to be > 1
      obj.slice(@start, @slice_length).take_every(@stride)
    end

    # Logic for slices with negative stride.
    def _slice_negative(obj)
      stride = @stride.abs
      lazyslice = obj.slice(@stop + 1, @slice_length).reverse
      stride > 1 ? lazyslice.take_every(stride) : lazyslice
    end

    # Normalize slice bounds, identify unbounded and/or zero-length slices.
    #
    # Sets `@start`, `@stop`, `@stride`, `@is_unbounded` and `@slice_length`.
    def _slice_setup(s)
      # can normalize slice indices as we know object size
      obj_len = @obj.length

      # negative bounds count back from the end; a missing begin means 0
      start = s.begin || 0
      start += obj_len if start < 0

      # `stop` is exclusive, so an inclusive range end is shifted by one;
      # a missing end means "through the last element"
      stop =
        if s.end
          bound = s.end + (s.exclude_end? ? 0 : 1)
          s.end < 0 ? bound + obj_len : bound
        else
          obj_len
        end

      # Clamp both bounds into [0, obj_len] so the computed slice length never
      # exceeds what is actually available, and out-of-range slices are
      # recognised as zero-length instead of being passed on to `slice`.
      start = start.clamp(0, obj_len)
      stop = stop.clamp(0, obj_len)

      # A range carries no step, so the stride is always 1 here; the
      # negative-stride branches in this class are unreachable until a caller
      # can supply one.
      stride = 1

      # check if slice is actually unbounded
      @is_unbounded =
        if stride >= 1
          start <= 0 && stop >= obj_len
        else
          stop == -1 && start >= obj_len - 1
        end

      # determine slice length
      @slice_length =
        if @obj.is_empty
          0
        elsif @is_unbounded
          obj_len
        elsif start == stop || (stride > 0 && start > stop) || (stride < 0 && start < stop)
          0
        else
          (stop - start).abs
        end

      @start = start
      @stop = stop
      @stride = stride
    end
  end
end
|
@@ -0,0 +1,194 @@
|
|
1
|
+
module Polars
  # Run SQL queries against DataFrame/LazyFrame data.
  class SQLContext
    # @private
    attr_accessor :_ctxt, :_eager_execution

    # Initialize a new `SQLContext`.
    #
    # @param frames [Hash, nil]
    #   A `{name: frame, ...}` mapping of frames to register as tables.
    # @param eager_execution [Boolean]
    #   Default used by {#execute} when its `eager` option is not given.
    # @param named_frames [Object]
    #   Named eager/lazy frames, provided as kwargs.
    def initialize(frames = nil, eager_execution: false, **named_frames)
      self._ctxt = RbSQLContext.new
      self._eager_execution = eager_execution

      frames = (frames || {}).to_h
      register_many(frames, **named_frames) if frames.any? || named_frames.any?
    end

    # Parse the given SQL query and execute it against the registered frame data.
    #
    # @param query [String]
    #   A valid string SQL query.
    # @param eager [Boolean, nil]
    #   Apply the query eagerly, returning `DataFrame` instead of `LazyFrame`.
    #   If unset (`nil`), the value of the init-time parameter "eager_execution"
    #   will be used; an explicit `true` or `false` always takes precedence.
    #   (Note that the query itself is always executed in lazy-mode; this
    #   parameter only impacts the type of the returned frame).
    #
    # @return [Object]
    #
    # @example Execute a SQL query against the registered frame data:
    #   df = Polars::DataFrame.new(
    #     [
    #       ["The Godfather", 1972, 6_000_000, 134_821_952, 9.2],
    #       ["The Dark Knight", 2008, 185_000_000, 533_316_061, 9.0],
    #       ["Schindler's List", 1993, 22_000_000, 96_067_179, 8.9],
    #       ["Pulp Fiction", 1994, 8_000_000, 107_930_000, 8.9],
    #       ["The Shawshank Redemption", 1994, 25_000_000, 28_341_469, 9.3],
    #     ],
    #     schema: ["title", "release_year", "budget", "gross", "imdb_score"]
    #   )
    #   ctx = Polars::SQLContext.new(films: df)
    #   ctx.execute(
    #     "
    #     SELECT title, release_year, imdb_score
    #     FROM films
    #     WHERE release_year > 1990
    #     ORDER BY imdb_score DESC
    #     ",
    #     eager: true
    #   )
    #   # =>
    #   # shape: (4, 3)
    #   # ┌──────────────────────────┬──────────────┬────────────┐
    #   # │ title                    ┆ release_year ┆ imdb_score │
    #   # │ ---                      ┆ ---          ┆ ---        │
    #   # │ str                      ┆ i64          ┆ f64        │
    #   # ╞══════════════════════════╪══════════════╪════════════╡
    #   # │ The Shawshank Redemption ┆ 1994         ┆ 9.3        │
    #   # │ The Dark Knight          ┆ 2008         ┆ 9.0        │
    #   # │ Schindler's List         ┆ 1993         ┆ 8.9        │
    #   # │ Pulp Fiction             ┆ 1994         ┆ 8.9        │
    #   # └──────────────────────────┴──────────────┴────────────┘
    #
    # @example Execute a GROUP BY query:
    #   ctx.execute(
    #     "
    #     SELECT
    #       MAX(release_year / 10) * 10 AS decade,
    #       SUM(gross) AS total_gross,
    #       COUNT(title) AS n_films,
    #     FROM films
    #     GROUP BY (release_year / 10) -- decade
    #     ORDER BY total_gross DESC
    #     ",
    #     eager: true
    #   )
    #   # =>
    #   # shape: (3, 3)
    #   # ┌────────┬─────────────┬─────────┐
    #   # │ decade ┆ total_gross ┆ n_films │
    #   # │ ---    ┆ ---         ┆ ---     │
    #   # │ i64    ┆ i64         ┆ u32     │
    #   # ╞════════╪═════════════╪═════════╡
    #   # │ 2000   ┆ 533316061   ┆ 1       │
    #   # │ 1990   ┆ 232338648   ┆ 3       │
    #   # │ 1970   ┆ 134821952   ┆ 1       │
    #   # └────────┴─────────────┴─────────┘
    def execute(query, eager: nil)
      res = Utils.wrap_ldf(_ctxt.execute(query))
      # Fall back to the init-time default only when `eager` was not supplied,
      # so that an explicit `eager: false` is honoured.
      eager = _eager_execution if eager.nil?
      eager ? res.collect : res
    end

    # Register a single frame as a table, using the given name.
    #
    # @param name [String]
    #   Name of the table (converted with `to_s`).
    # @param frame [Object]
    #   eager/lazy frame to associate with this table name.
    #
    # @return [SQLContext]
    #
    # @example
    #   df = Polars::DataFrame.new({"hello" => ["world"]})
    #   ctx = Polars::SQLContext.new
    #   ctx.register("frame_data", df).execute("SELECT * FROM frame_data").collect
    #   # =>
    #   # shape: (1, 1)
    #   # ┌───────┐
    #   # │ hello │
    #   # │ ---   │
    #   # │ str   │
    #   # ╞═══════╡
    #   # │ world │
    #   # └───────┘
    def register(name, frame)
      # eager frames are registered through their lazy counterpart
      frame = frame.lazy if frame.is_a?(DataFrame)
      _ctxt.register(name.to_s, frame._ldf)
      self
    end

    # Register multiple eager/lazy frames as tables, using the associated names.
    #
    # @param frames [Hash, nil]
    #   A `{name:frame, ...}` mapping.
    # @param named_frames [Object]
    #   Named eager/lazy frames, provided as kwargs. On a name clash these
    #   take precedence over entries in `frames`.
    #
    # @return [SQLContext]
    def register_many(frames = nil, **named_frames)
      (frames || {}).to_h.merge(named_frames).each do |name, frame|
        register(name, frame)
      end
      self
    end

    # Unregister one or more eager/lazy frames by name.
    #
    # @param names [String, Symbol, Array]
    #   Name, or collection of names, of the tables to unregister. Each name
    #   is converted with `to_s`, mirroring {#register}.
    #
    # @return [SQLContext]
    #
    # @example Register with a SQLContext object:
    #   df0 = Polars::DataFrame.new({"ints" => [9, 8, 7, 6, 5]})
    #   lf1 = Polars::LazyFrame.new({"text" => ["a", "b", "c"]})
    #   lf2 = Polars::LazyFrame.new({"misc" => ["testing1234"]})
    #   ctx = Polars::SQLContext.new(test1: df0, test2: lf1, test3: lf2)
    #   ctx.tables
    #   # => ["test1", "test2", "test3"]
    #
    # @example Unregister one or more of the tables:
    #   ctx.unregister(["test1", "test3"]).tables
    #   # => ["test2"]
    def unregister(names)
      # a lone name (string or symbol) is treated as a one-element list
      names = [names] if names.is_a?(::String) || names.is_a?(::Symbol)
      names.each do |nm|
        _ctxt.unregister(nm.to_s)
      end
      self
    end

    # Return a list of the registered table names.
    #
    # @return [Array]
    #   The table names, sorted.
    #
    # @example Executing as SQL:
    #   frame_data = Polars::DataFrame.new({"hello" => ["world"]})
    #   ctx = Polars::SQLContext.new(hello_world: frame_data)
    #   ctx.execute("SHOW TABLES", eager: true)
    #   # =>
    #   # shape: (1, 1)
    #   # ┌─────────────┐
    #   # │ name        │
    #   # │ ---         │
    #   # │ str         │
    #   # ╞═════════════╡
    #   # │ hello_world │
    #   # └─────────────┘
    #
    # @example Calling the method:
    #   ctx.tables
    #   # => ["hello_world"]
    def tables
      _ctxt.get_tables.sort
    end
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module Polars
  # Context manager for enabling and disabling the global string cache.
  #
  # The block given to `new` is handed to `RbStringCacheHolder.hold`.
  class StringCache
    def initialize(&block)
      RbStringCacheHolder.hold(&block)
    end
  end

  module Functions
    # Enable the global string cache.
    #
    # `Categorical` columns created under the same global string cache have
    # the same underlying physical value when string values are equal. This allows the
    # columns to be concatenated or used in a join operation, for example.
    #
    # @return [nil]
    #
    # @example Construct two Series using the same global string cache.
    #   Polars.enable_string_cache
    #   s1 = Polars::Series.new("color", ["red", "green", "red"], dtype: Polars::Categorical)
    #   s2 = Polars::Series.new("color", ["blue", "red", "green"], dtype: Polars::Categorical)
    #   Polars.disable_string_cache
    #
    # @example As both Series are constructed under the same global string cache, they can be concatenated.
    #   Polars.concat([s1, s2])
    #   # =>
    #   # shape: (6,)
    #   # Series: 'color' [cat]
    #   # [
    #   #         "red"
    #   #         "green"
    #   #         "red"
    #   #         "blue"
    #   #         "red"
    #   #         "green"
    #   # ]
    def enable_string_cache
      Plr.enable_string_cache
    end

    # Disable and clear the global string cache.
    #
    # @return [nil]
    #
    # @example Construct two Series using the same global string cache.
    #   Polars.enable_string_cache
    #   s1 = Polars::Series.new("color", ["red", "green", "red"], dtype: Polars::Categorical)
    #   s2 = Polars::Series.new("color", ["blue", "red", "green"], dtype: Polars::Categorical)
    #   Polars.disable_string_cache
    #
    # @example As both Series are constructed under the same global string cache, they can be concatenated.
    #   Polars.concat([s1, s2])
    #   # =>
    #   # shape: (6,)
    #   # Series: 'color' [cat]
    #   # [
    #   #         "red"
    #   #         "green"
    #   #         "red"
    #   #         "blue"
    #   #         "red"
    #   #         "green"
    #   # ]
    def disable_string_cache
      Plr.disable_string_cache
    end

    # Check whether the global string cache is enabled.
    #
    # @return [Boolean]
    def using_string_cache
      Plr.using_string_cache
    end
  end
end
|