polars-df 0.2.0-x86_64-linux
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +3 -0
- data/CHANGELOG.md +33 -0
- data/Cargo.lock +2230 -0
- data/Cargo.toml +10 -0
- data/LICENSE-THIRD-PARTY.txt +38828 -0
- data/LICENSE.txt +20 -0
- data/README.md +91 -0
- data/lib/polars/3.0/polars.so +0 -0
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/batched_csv_reader.rb +96 -0
- data/lib/polars/cat_expr.rb +52 -0
- data/lib/polars/cat_name_space.rb +54 -0
- data/lib/polars/convert.rb +100 -0
- data/lib/polars/data_frame.rb +4833 -0
- data/lib/polars/data_types.rb +122 -0
- data/lib/polars/date_time_expr.rb +1418 -0
- data/lib/polars/date_time_name_space.rb +1484 -0
- data/lib/polars/dynamic_group_by.rb +52 -0
- data/lib/polars/exceptions.rb +20 -0
- data/lib/polars/expr.rb +5307 -0
- data/lib/polars/expr_dispatch.rb +22 -0
- data/lib/polars/functions.rb +453 -0
- data/lib/polars/group_by.rb +558 -0
- data/lib/polars/io.rb +814 -0
- data/lib/polars/lazy_frame.rb +2442 -0
- data/lib/polars/lazy_functions.rb +1195 -0
- data/lib/polars/lazy_group_by.rb +93 -0
- data/lib/polars/list_expr.rb +610 -0
- data/lib/polars/list_name_space.rb +346 -0
- data/lib/polars/meta_expr.rb +54 -0
- data/lib/polars/rolling_group_by.rb +35 -0
- data/lib/polars/series.rb +3730 -0
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/string_expr.rb +972 -0
- data/lib/polars/string_name_space.rb +690 -0
- data/lib/polars/struct_expr.rb +100 -0
- data/lib/polars/struct_name_space.rb +64 -0
- data/lib/polars/utils.rb +192 -0
- data/lib/polars/version.rb +4 -0
- data/lib/polars/when.rb +16 -0
- data/lib/polars/when_then.rb +19 -0
- data/lib/polars-df.rb +1 -0
- data/lib/polars.rb +50 -0
- metadata +89 -0
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2020 Ritchie Vink
|
2
|
+
Copyright (c) 2022-2023 Andrew Kane
|
3
|
+
|
4
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
5
|
+
of this software and associated documentation files (the "Software"), to deal
|
6
|
+
in the Software without restriction, including without limitation the rights
|
7
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
8
|
+
copies of the Software, and to permit persons to whom the Software is
|
9
|
+
furnished to do so, subject to the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be included in all
|
12
|
+
copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
15
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
16
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
17
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
18
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
19
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
20
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
# Polars Ruby
|
2
|
+
|
3
|
+
:fire: Blazingly fast DataFrames for Ruby, powered by [Polars](https://github.com/pola-rs/polars)
|
4
|
+
|
5
|
+
[![Build Status](https://github.com/ankane/polars-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/polars-ruby/actions)
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application’s Gemfile:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
gem "polars-df"
|
13
|
+
```
|
14
|
+
|
15
|
+
## Getting Started
|
16
|
+
|
17
|
+
This library follows the [Polars Python API](https://pola-rs.github.io/polars/py-polars/html/reference/index.html).
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
Polars.read_csv("iris.csv")
|
21
|
+
.lazy
|
22
|
+
.filter(Polars.col("sepal_length") > 5)
|
23
|
+
.groupby("species")
|
24
|
+
.agg(Polars.all.sum)
|
25
|
+
.collect
|
26
|
+
```
|
27
|
+
|
28
|
+
You can follow [Polars tutorials](https://pola-rs.github.io/polars-book/user-guide/introduction.html) and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems. Some methods are missing at the moment.
|
29
|
+
|
30
|
+
## Examples
|
31
|
+
|
32
|
+
### Creating DataFrames
|
33
|
+
|
34
|
+
From a CSV
|
35
|
+
|
36
|
+
```ruby
|
37
|
+
Polars.read_csv("file.csv")
|
38
|
+
```
|
39
|
+
|
40
|
+
From Parquet
|
41
|
+
|
42
|
+
```ruby
|
43
|
+
Polars.read_parquet("file.parquet")
|
44
|
+
```
|
45
|
+
|
46
|
+
From Active Record
|
47
|
+
|
48
|
+
```ruby
|
49
|
+
Polars::DataFrame.new(User.all)
|
50
|
+
```
|
51
|
+
|
52
|
+
From a hash
|
53
|
+
|
54
|
+
```ruby
|
55
|
+
Polars::DataFrame.new({
|
56
|
+
a: [1, 2, 3],
|
57
|
+
b: ["one", "two", "three"]
|
58
|
+
})
|
59
|
+
```
|
60
|
+
|
61
|
+
From an array of series
|
62
|
+
|
63
|
+
```ruby
|
64
|
+
Polars::DataFrame.new([
|
65
|
+
Polars::Series.new("a", [1, 2, 3]),
|
66
|
+
Polars::Series.new("b", ["one", "two", "three"])
|
67
|
+
])
|
68
|
+
```
|
69
|
+
|
70
|
+
## History
|
71
|
+
|
72
|
+
View the [changelog](CHANGELOG.md)
|
73
|
+
|
74
|
+
## Contributing
|
75
|
+
|
76
|
+
Everyone is encouraged to help improve this project. Here are a few ways you can help:
|
77
|
+
|
78
|
+
- [Report bugs](https://github.com/ankane/polars-ruby/issues)
|
79
|
+
- Fix bugs and [submit pull requests](https://github.com/ankane/polars-ruby/pulls)
|
80
|
+
- Write, clarify, or fix documentation
|
81
|
+
- Suggest or add new features
|
82
|
+
|
83
|
+
To get started with development:
|
84
|
+
|
85
|
+
```sh
|
86
|
+
git clone https://github.com/ankane/polars-ruby.git
|
87
|
+
cd polars-ruby
|
88
|
+
bundle install
|
89
|
+
bundle exec rake compile
|
90
|
+
bundle exec rake test
|
91
|
+
```
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,96 @@
|
|
1
|
+
module Polars
  # Thin Ruby wrapper around the native `RbBatchedCsv` reader that hands back
  # CSV data one group of batches at a time.
  #
  # @private
  class BatchedCsvReader
    attr_accessor :_reader, :new_columns

    # Normalize the Ruby-side options and construct the native reader.
    #
    # Apart from the arguments described below, every keyword is forwarded to
    # `RbBatchedCsv.new` unchanged, in the positional order the native
    # constructor expects.
    #
    # @param file [String, Pathname]
    #   Location of the CSV file. String and Pathname values are passed through
    #   `Utils.format_path`; for any other object the path handed to the native
    #   reader is `nil`.
    # @param columns [Object]
    #   Passed through `Utils.handle_projection_columns`, which yields the
    #   `projection` and `columns` arguments given to the native reader.
    # @param dtypes [Hash, Array, nil]
    #   A Hash is turned into an array of `[key, dtype]` pairs, converting each
    #   value with `Utils.rb_type_to_dtype`. An Array is forwarded as-is.
    # @param null_values [Object]
    #   Passed through `Utils._process_null_values` before being forwarded.
    # @param row_count_name [Object]
    #   Combined with `row_count_offset` by `Utils._prepare_row_count_args`.
    # @param row_count_offset [Object]
    #   See `row_count_name`.
    # @param new_columns [Object]
    #   Stored on the reader. When truthy, every batch returned by
    #   {#next_batches} is passed through `Utils._update_columns` with it.
    #
    # @raise [ArgumentError] If `dtypes` is neither `nil`, a Hash, nor an Array.
    def initialize(
      file,
      has_header: true,
      columns: nil,
      sep: ",",
      comment_char: nil,
      quote_char: '"',
      skip_rows: 0,
      dtypes: nil,
      null_values: nil,
      ignore_errors: false,
      parse_dates: false,
      n_threads: nil,
      infer_schema_length: 100,
      batch_size: 50_000,
      n_rows: nil,
      encoding: "utf8",
      low_memory: false,
      rechunk: true,
      skip_rows_after_header: 0,
      row_count_name: nil,
      row_count_offset: 0,
      sample_size: 1024,
      eol_char: "\n",
      new_columns: nil
    )
      # `path` stays nil when `file` is neither a String nor a Pathname.
      path = Utils.format_path(file) if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))

      # Exactly one of the two dtype arguments is populated, depending on
      # whether the caller described dtypes by name (Hash) or by position (Array).
      dtype_list, dtype_slice =
        case dtypes
        when nil
          [nil, nil]
        when Hash
          [dtypes.map { |name, dtype| [name, Utils.rb_type_to_dtype(dtype)] }, nil]
        when Array
          [nil, dtypes]
        else
          raise ArgumentError, "dtype arg should be list or dict"
        end

      processed_null_values = Utils._process_null_values(null_values)
      projection, columns = Utils.handle_projection_columns(columns)

      self._reader = RbBatchedCsv.new(
        infer_schema_length,
        batch_size,
        has_header,
        ignore_errors,
        n_rows,
        skip_rows,
        projection,
        sep,
        rechunk,
        columns,
        encoding,
        n_threads,
        path,
        dtype_list,
        dtype_slice,
        low_memory,
        comment_char,
        quote_char,
        processed_null_values,
        parse_dates,
        skip_rows_after_header,
        Utils._prepare_row_count_args(row_count_name, row_count_offset),
        sample_size,
        eol_char
      )
      self.new_columns = new_columns
    end

    # Fetch the next group of batches from the native reader.
    #
    # @param n [Object]
    #   Forwarded unchanged to the native reader's `next_batches`.
    #
    # @return [Array, nil]
    #   Each native batch wrapped with `Utils.wrap_df` (and passed through
    #   `Utils._update_columns` when `new_columns` is set), or `nil` when the
    #   native reader returns `nil`.
    def next_batches(n)
      batches = _reader.next_batches(n)
      return nil if batches.nil?

      batches.map do |rbdf|
        df = Utils.wrap_df(rbdf)
        new_columns ? Utils._update_columns(df, new_columns) : df
      end
    end
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module Polars
  # Namespace for categorical related expressions.
  class CatExpr
    # @private
    attr_accessor :_rbexpr

    # Capture the native expression handle of the wrapped expression.
    #
    # @private
    def initialize(expr)
      self._rbexpr = expr._rbexpr
    end

    # Determine how this categorical series should be sorted.
    #
    # @param ordering ["physical", "lexical"]
    #   Ordering type:
    #
    #   - 'physical' -> Use the physical representation of the categories to determine the order (default).
    #   - 'lexical' -> Use the string values to determine the ordering.
    #
    # @return [Expr]
    #
    # @example
    #   df = Polars::DataFrame.new(
    #     {"cats" => ["z", "z", "k", "a", "b"], "vals" => [3, 1, 2, 2, 3]}
    #   ).with_columns(
    #     [
    #       Polars.col("cats").cast(:cat).cat.set_ordering("lexical")
    #     ]
    #   )
    #   df.sort(["cats", "vals"])
    #   # =>
    #   # shape: (5, 2)
    #   # ┌──────┬──────┐
    #   # │ cats ┆ vals │
    #   # │ ---  ┆ ---  │
    #   # │ cat  ┆ i64  │
    #   # ╞══════╪══════╡
    #   # │ a    ┆ 2    │
    #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    #   # │ b    ┆ 3    │
    #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    #   # │ k    ┆ 2    │
    #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    #   # │ z    ┆ 1    │
    #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    #   # │ z    ┆ 3    │
    #   # └──────┴──────┘
    def set_ordering(ordering)
      # Delegate to the native expression and re-wrap the result.
      Utils.wrap_expr(_rbexpr.cat_set_ordering(ordering))
    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module Polars
  # Series.cat namespace.
  class CatNameSpace
    include ExprDispatch

    # Accessor name registered with ExprDispatch.
    # NOTE(review): presumably used by ExprDispatch to route calls through the
    # matching expression namespace — confirm against expr_dispatch.rb.
    self._accessor = "cat"

    # Capture the native series handle of the wrapped series.
    #
    # @private
    def initialize(series)
      self._s = series._s
    end

    # Determine how this categorical series should be sorted.
    #
    # @param ordering ["physical", "lexical"]
    #   Ordering type:
    #
    #   - 'physical' -> Use the physical representation of the categories to
    #     determine the order (default).
    #   - 'lexical' -> Use the string values to determine the ordering.
    #
    # @return [Series]
    #
    # @example
    #   df = Polars::DataFrame.new(
    #     {"cats" => ["z", "z", "k", "a", "b"], "vals" => [3, 1, 2, 2, 3]}
    #   ).with_columns(
    #     [
    #       Polars.col("cats").cast(:cat).cat.set_ordering("lexical")
    #     ]
    #   )
    #   df.sort(["cats", "vals"])
    #   # =>
    #   # shape: (5, 2)
    #   # ┌──────┬──────┐
    #   # │ cats ┆ vals │
    #   # │ ---  ┆ ---  │
    #   # │ cat  ┆ i64  │
    #   # ╞══════╪══════╡
    #   # │ a    ┆ 2    │
    #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    #   # │ b    ┆ 3    │
    #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    #   # │ k    ┆ 2    │
    #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    #   # │ z    ┆ 1    │
    #   # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    #   # │ z    ┆ 3    │
    #   # └──────┴──────┘
    def set_ordering(ordering)
      # The implementation comes from the ancestor chain (ExprDispatch is mixed
      # in above); this override exists to carry the documentation.
      super
    end
  end
end
|
@@ -0,0 +1,100 @@
|
|
1
|
+
module Polars
  # Constructors that build a DataFrame from plain Ruby data structures.
  module Convert
    # Construct a DataFrame from a dictionary of sequences.
    #
    # This operation clones data, unless you pass in a `Hash<String, Series>`.
    #
    # @param data [Hash]
    #   Two-dimensional data represented as a hash. Hash must contain
    #   arrays.
    # @param columns [Array]
    #   Column labels to use for resulting DataFrame. If specified, overrides any
    #   labels already present in the data. Must match data dimensions.
    #
    # @return [DataFrame]
    #
    # @example
    #   data = {"a" => [1, 2], "b" => [3, 4]}
    #   Polars.from_hash(data)
    #   # =>
    #   # shape: (2, 2)
    #   # ┌─────┬─────┐
    #   # │ a   ┆ b   │
    #   # │ --- ┆ --- │
    #   # │ i64 ┆ i64 │
    #   # ╞═════╪═════╡
    #   # │ 1   ┆ 3   │
    #   # ├╌╌╌╌╌┼╌╌╌╌╌┤
    #   # │ 2   ┆ 4   │
    #   # └─────┴─────┘
    def from_hash(data, columns: nil)
      DataFrame._from_hash(data, columns: columns)
    end

    # NOTE(review): `from_hashes` and `from_records` below are commented out,
    # so neither method is defined by this module yet; the documentation is
    # kept for when they are enabled.
    #
    # Construct a DataFrame from a sequence of dictionaries. This operation clones data.
    #
    # @param hashes [Array]
    #   Array with hashes mapping column name to value.
    # @param infer_schema_length [Integer]
    #   How many hashes/rows to scan to determine the data types
    #   if set to `nil` all rows are scanned. This will be slow.
    # @param schema [Object]
    #   Schema that (partially) overwrites the inferred schema.
    #
    # @return [DataFrame]
    #
    # @example
    #   data = [{"a" => 1, "b" => 4}, {"a" => 2, "b" => 5}, {"a" => 3, "b" => 6}]
    #   Polars.from_hashes(data)
    #   # =>
    #   # shape: (3, 2)
    #   # ┌─────┬─────┐
    #   # │ a   ┆ b   │
    #   # │ --- ┆ --- │
    #   # │ i64 ┆ i64 │
    #   # ╞═════╪═════╡
    #   # │ 1   ┆ 4   │
    #   # ├╌╌╌╌╌┼╌╌╌╌╌┤
    #   # │ 2   ┆ 5   │
    #   # ├╌╌╌╌╌┼╌╌╌╌╌┤
    #   # │ 3   ┆ 6   │
    #   # └─────┴─────┘
    #
    # @example Overwrite first column name and dtype
    #   Polars.from_hashes(data, schema: {"c" => :i32})
    #   # =>
    #   # shape: (3, 2)
    #   # ┌─────┬─────┐
    #   # │ c   ┆ b   │
    #   # │ --- ┆ --- │
    #   # │ i32 ┆ i64 │
    #   # ╞═════╪═════╡
    #   # │ 1   ┆ 4   │
    #   # ├╌╌╌╌╌┼╌╌╌╌╌┤
    #   # │ 2   ┆ 5   │
    #   # ├╌╌╌╌╌┼╌╌╌╌╌┤
    #   # │ 3   ┆ 6   │
    #   # └─────┴─────┘
    #
    # @example Let polars infer the dtypes but inform about a 3rd column
    #   Polars.from_hashes(data, schema: {"a" => :unknown, "b" => :unknown, "c" => :i32})
    #   # =>
    #   # shape: (3, 3)
    #   # ┌─────┬─────┬──────┐
    #   # │ a   ┆ b   ┆ c    │
    #   # │ --- ┆ --- ┆ ---  │
    #   # │ i64 ┆ i64 ┆ i32  │
    #   # ╞═════╪═════╪══════╡
    #   # │ 1   ┆ 4   ┆ null │
    #   # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┤
    #   # │ 2   ┆ 5   ┆ null │
    #   # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┤
    #   # │ 3   ┆ 6   ┆ null │
    #   # └─────┴─────┴──────┘
    # def from_hashes(hashes, infer_schema_length: 50, schema: nil)
    #   DataFrame._from_hashes(hashes, infer_schema_length: infer_schema_length, schema: schema)
    # end

    # def from_records
    # end
  end
end
|