polder 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polder-0.1.0/PKG-INFO +121 -0
- polder-0.1.0/README.md +104 -0
- polder-0.1.0/pyproject.toml +48 -0
- polder-0.1.0/src/polder/__init__.py +110 -0
- polder-0.1.0/src/polder/config/__init__.py +134 -0
- polder-0.1.0/src/polder/eager/__init__.py +0 -0
- polder-0.1.0/src/polder/eager/_narwhals_df_equals.py +26 -0
- polder-0.1.0/src/polder/eager/align.py +188 -0
- polder-0.1.0/src/polder/eager/array.py +248 -0
- polder-0.1.0/src/polder/eager/binary.py +140 -0
- polder-0.1.0/src/polder/eager/labels.py +33 -0
- polder-0.1.0/src/polder/eager/pivot.py +270 -0
- polder-0.1.0/src/polder/eager/unary.py +134 -0
- polder-0.1.0/src/polder/eager/value_array.py +105 -0
- polder-0.1.0/src/polder/lazy/__init__.py +0 -0
- polder-0.1.0/src/polder/lazy/align.py +188 -0
- polder-0.1.0/src/polder/lazy/array.py +339 -0
- polder-0.1.0/src/polder/lazy/binary.py +154 -0
- polder-0.1.0/src/polder/lazy/pivot.py +366 -0
- polder-0.1.0/src/polder/lazy/unary.py +90 -0
- polder-0.1.0/src/polder/operations/__init__.py +0 -0
- polder-0.1.0/src/polder/operations/align.py +59 -0
- polder-0.1.0/src/polder/operations/conversion.py +123 -0
- polder-0.1.0/src/polder/operations/creation.py +166 -0
- polder-0.1.0/src/polder/operations/unary.py +102 -0
- polder-0.1.0/src/polder/protocols/__init__.py +0 -0
- polder-0.1.0/src/polder/protocols/array.py +221 -0
- polder-0.1.0/src/polder/protocols/descriptor.py +10 -0
- polder-0.1.0/src/polder/protocols/implementations.py +16 -0
- polder-0.1.0/src/polder/py.typed +0 -0
- polder-0.1.0/src/polder/utils/__init__.py +0 -0
- polder-0.1.0/src/polder/utils/indexer.py +54 -0
- polder-0.1.0/src/polder/utils/orderedset.py +12 -0
polder-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: polder
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Arrays with DataFrame axis labels
|
|
5
|
+
Author: Nardi Lam
|
|
6
|
+
Author-email: Nardi Lam <mail@nardilam.nl>
|
|
7
|
+
Requires-Dist: array-api-compat==1.11.2
|
|
8
|
+
Requires-Dist: array-api-extra>=0.7.2
|
|
9
|
+
Requires-Dist: immutabledict>=4.3.1
|
|
10
|
+
Requires-Dist: narwhals>=2.18.1
|
|
11
|
+
Requires-Dist: numpy>=2.4.4
|
|
12
|
+
Requires-Dist: ordered-set>=4.1.0
|
|
13
|
+
Requires-Dist: types-array-api>=1.1.4
|
|
14
|
+
Requires-Dist: typing-extensions>=4.15.0
|
|
15
|
+
Requires-Python: >=3.11
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
<div align="center">
|
|
19
|
+
|
|
20
|
+
<h2 align="center">
|
|
21
|
+
<code>polder</code> - Finding the middle-ground between array and relational data models
|
|
22
|
+
</h2>
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
</div>
|
|
27
|
+
|
|
28
|
+
## Introduction
|
|
29
|
+
|
|
30
|
+
`polder` is a Python package that provides a simple composite data container type:
|
|
31
|
+
|
|
32
|
+
1. a Numpy-style ([array API](https://data-apis.org/array-api/latest/)) array,
|
|
33
|
+
2. whose entries are labeled using Polars-compatible ([Narwhals](https://narwhals-dev.github.io/narwhals/)) DataFrames.
|
|
34
|
+
|
|
35
|
+
In other words, it lets you bundle your *data* and your *metadata* into a single container, using an array-based library to handle the data, and a relational library to handle the metadata.
|
|
36
|
+
|
|
37
|
+
Install with pip:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install polder
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
and [read the documentation here](https://nardi.github.io/polder).
|
|
44
|
+
|
|
45
|
+
### Why?
|
|
46
|
+
|
|
47
|
+
The common situation in which this is useful is when you have a Numpy(-like) array, but find yourself having difficulties keeping track of what the values represent. You might find yourself creating an auxiliary data structure for this, where you keep some list of labels next to your array, so that you avoid for example combining two arrays with the same values, but in a different order. Then, you might end up having to write extra code keeping track of these labels throughout your computation. `polder` aims to provide this data structure in a more general way, that you can adapt to the needs of your application.
|
|
48
|
+
|
|
49
|
+
This is similar to the [pandas](https://pandas.pydata.org/) idea of a DataFrame, as a Numpy array with labels. With pandas you can take a 2D Numpy array and with its `Index`es apply arbitrarily complex labels on both axes (rows and columns). However it falls short in a number of ways:
|
|
50
|
+
|
|
51
|
+
1. Its API is quite large and, while intuitive for simple tasks, provides a lot of surprises as well.
|
|
52
|
+
2. While its `MultiIndex` technically supports very complex labeling, this is not the most intuitive part of the API and its functionality is often not effectively used.
|
|
53
|
+
3. It only supports 2-dimensional data.
|
|
54
|
+
4. It is not very performance-minded: many of its operations (especially around the index) perform "magic" such as automatically reordering data or extending it with missing values. This also makes it hard to predictably interact with other libraries without a full conversion step.
|
|
55
|
+
|
|
56
|
+
We are of the opinion the relational model of the DataFrame as promoted by Polars and similar libraries is the better fit, and as such this library aims to maintain a clear separation between the array part and the relational part.
|
|
57
|
+
|
|
58
|
+
It is also quite similar to [Xarray](https://docs.xarray.dev/en/stable/), which provides a very flexible "labeled array" container, but there are some conceptual differences. These lie mostly in the intended scope of the library, which leads to some different design decisions:
|
|
59
|
+
|
|
60
|
+
1. `polder` aims to be purely an orchestration layer over existing array and data processing libraries, so that all significant processing work is dispatched to those, in order to make use of their efficient implementations.
|
|
61
|
+
2. `polder` aims to provide a simple data model, which allows converting array-based code without having to reconsider your data modelling approach. In other words, you should be able to take an unlabeled array, and it should be "obvious" from context which labels to assign to the entries.
|
|
62
|
+
3. `polder` aims to support a larger range of array/DataFrame backends, in order to enable the use of the library in various contexts (for example, as part of adding structure to a GPU-backed numerical algorithm, or to perform automatic differentiation through a labeled computation).
|
|
63
|
+
4. `polder` aims to not have any performance surprises compared to the backend libraries. For example, if you have a Numpy-backed array, doing `array[3:5]` should return a view, doing `array[[5, 7]]` should return a copy. The only exception to this is that many operations perform *alignment*: `arr1 + arr2` will ensure that the labels for both are aligned, otherwise the result is not meaningful. This auto-alignment can be disabled, in case you want to avoid all unexpected performance regressions.
|
|
64
|
+
5. `polder` doesn't try to provide a universal data model: if you have data that doesn't nicely fit into an array structure or you want to perform certain operations that are not well defined in terms of array shapes (such as binning), you might be better off just using a DataFrame library.
|
|
65
|
+
|
|
66
|
+
## Status
|
|
67
|
+
|
|
68
|
+
At the center of the library is the `FrameLabeledArray` protocol, which is what user code should be written against. This protocol currently supports a number of basic functionalities:
|
|
69
|
+
- Decomposing into `values` (array-like) and `labels` (sequence of DataFrame-likes).
|
|
70
|
+
- Indexing (NumPy-style), which also supports indexing using Narwhals expressions.
|
|
71
|
+
- Pivoting: splitting a single axis into multiple, by orthogonally decomposing the labels into separate dimensions corresponding to different label columns (and the reverse, "unpivoting").
|
|
72
|
+
- All special (dunder) operations as defined in the [array API](https://data-apis.org/array-api/latest/).
|
|
73
|
+
|
|
74
|
+
This protocol is supported by a number of generic operations:
|
|
75
|
+
- Creation: `pld.from_values_and_labels`, `pld.from_frame`
|
|
76
|
+
- Alignment: reordering multiple arrays so that their labels match up (this usually happens automatically when performing other operations).
|
|
77
|
+
- Unary elementwise operations as defined in the [array API](https://data-apis.org/array-api/latest/) (e.g. `pld.sin(arr)`).
|
|
78
|
+
|
|
79
|
+
Then, there are currently two implementations of the protocol:
|
|
80
|
+
|
|
81
|
+
1. An implementation that is eager (every operation is fully resolved immediately) and in principle supports any array that follows the [array API](https://data-apis.org/array-api/latest/). Currently there is "real" support (i.e. with tests and proper typing) for NumPy and JAX arrays. Note that JAX is an optional dependency, so if you are using the library you have to install it manually, but this is usually already the case (how else would you pass in a JAX array?). This implementation supports all functionality defined in the protocol.
|
|
82
|
+
2. An implementation that is entirely backed by Narwhals LazyFrames, and evaluates all operations lazily. This is useful when you only want to express your operations in an array-style and don't need any special interop with array-based libraries, but either don't care about the way the values are stored, or already have your data in a DataFrame format supported by Narwhals and would like to keep all operations within that format. Using this implementation you can keep your data in the same backend it already is, and use whatever query engine that backend has to optimize large computations. This implementation is in early development and does not yet support all functionality.
|
|
83
|
+
|
|
84
|
+
The advantage of having multiple implementations is also that you can convert easily between them. It might make sense to first perform some processing fully in the relational context, so more of the computation can be efficiently handled by a single query engine, and then switch over to the hybrid model when some interop with array-libraries is important.
|
|
85
|
+
|
|
86
|
+
Ideas for future implementations are:
|
|
87
|
+
|
|
88
|
+
1. An implementation that acts eagerly on arrays, but performs lazy indexing/reordering of values. This will make use of the Narwhals lazy API for the labels, allowing you to e.g. perform multiple slicing/reshaping operations without making excess copies of the array values. Since a lot of code has a "transformation → computation → transformation" flow, where the transformation steps are mostly reordering data but not changing the values, and the computation wants to keep the data in its most efficient shape, this handoff can be a natural transition point between the relational operations on the metadata and the array-based operations in the computational core.
|
|
89
|
+
|
|
90
|
+
## Development guidelines
|
|
91
|
+
|
|
92
|
+
There are a few structural guidelines for the development of this library.
|
|
93
|
+
|
|
94
|
+
### The labeled array protocol
|
|
95
|
+
|
|
96
|
+
The main data container is `FrameLabeledArray`, which is provided as a protocol. The idea is that users can write code against this protocol, and then this code can be run against various implementations of the protocol.
|
|
97
|
+
|
|
98
|
+
To be more flexible in code reuse vs specialization for efficiency, the various implementations don't inherit from a common ancestor. Instead they all implement the protocol, but are free to vary in implementation arbitrarily. As such the decision whether something should be in the protocol or not is quite significant, as it should generalize to all implementations, and it will also mean that users will write code against it. For those reasons stability of the protocol is quite important.
|
|
99
|
+
|
|
100
|
+
The protocol should mostly follow the [array API](https://data-apis.org/array-api/latest/), since the labeled array objects are arrays first. Additional functionality (such as `align` or `pivot`) is defined if it makes sense for the format. Some array API functionality is also extended, such as indexing with an expression (that will filter the labels) or reshaping along label dimensions instead of giving an explicit shape.
|
|
101
|
+
|
|
102
|
+
To support all kinds of backend libraries, the protocol is limited to only those functionalities that make sense in every paradigm. Particularly, all arrays are immutable, with the expectation that for high performance code an accelerator library is used. In addition, because the degree of "laziness" may vary between implementations, this cannot easily be made explicit in the protocol. Instead you should expect computations to be resolved as soon as the array values are converted to an eager form, for example when you call `array.values()`.
|
|
103
|
+
|
|
104
|
+
Because of the heavy reliance on protocols/structural typing, all library code should be fully typed.
|
|
105
|
+
|
|
106
|
+
### Code structure
|
|
107
|
+
|
|
108
|
+
The structure of the repo is roughly as follows:
|
|
109
|
+
|
|
110
|
+
* The generic protocols are defined in the `protocols` subpackage.
|
|
111
|
+
* Implementations get their own subpackage, for example anything related to the eager implementation is stored in the `eager` subpackage.
|
|
112
|
+
* Generic operations defined for `FrameLabeledArray` (that either have a generic implementation or dispatch to more specialized ones based on type) are defined in the `operations` subpackage.
|
|
113
|
+
* Common operations are re-exported in the top-level `polder` package for ease of use.
|
|
114
|
+
|
|
115
|
+
### Style
|
|
116
|
+
|
|
117
|
+
* All code is autoformatted using Ruff.
|
|
118
|
+
* Tests are written using pytest.
|
|
119
|
+
* Specific backend libraries should not be requirements for end-users, they should always be dynamically imported, with a few exceptions (currently `narwhals` and `numpy`).
|
|
120
|
+
* The recommended usage is `import polder as pld` (similar to other libraries).
|
|
121
|
+
* When writing comments, please use full sentences and always end with a period. Comments that are a single phrase don't need a period (think section header). Always put comments on a separate line, not after code.
|
polder-0.1.0/README.md
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
<h2 align="center">
|
|
4
|
+
<code>polder</code> - Finding the middle-ground between array and relational data models
|
|
5
|
+
</h2>
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
</div>
|
|
10
|
+
|
|
11
|
+
## Introduction
|
|
12
|
+
|
|
13
|
+
`polder` is a Python package that provides a simple composite data container type:
|
|
14
|
+
|
|
15
|
+
1. a Numpy-style ([array API](https://data-apis.org/array-api/latest/)) array,
|
|
16
|
+
2. whose entries are labeled using Polars-compatible ([Narwhals](https://narwhals-dev.github.io/narwhals/)) DataFrames.
|
|
17
|
+
|
|
18
|
+
In other words, it lets you bundle your *data* and your *metadata* into a single container, using an array-based library to handle the data, and a relational library to handle the metadata.
|
|
19
|
+
|
|
20
|
+
Install with pip:
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
pip install polder
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
and [read the documentation here](https://nardi.github.io/polder).
|
|
27
|
+
|
|
28
|
+
### Why?
|
|
29
|
+
|
|
30
|
+
The common situation in which this is useful is when you have a Numpy(-like) array, but find yourself having difficulties keeping track of what the values represent. You might find yourself creating an auxiliary data structure for this, where you keep some list of labels next to your array, so that you avoid for example combining two arrays with the same values, but in a different order. Then, you might end up having to write extra code keeping track of these labels throughout your computation. `polder` aims to provide this data structure in a more general way, that you can adapt to the needs of your application.
|
|
31
|
+
|
|
32
|
+
This is similar to the [pandas](https://pandas.pydata.org/) idea of a DataFrame, as a Numpy array with labels. With pandas you can take a 2D Numpy array and with its `Index`es apply arbitrarily complex labels on both axes (rows and columns). However it falls short in a number of ways:
|
|
33
|
+
|
|
34
|
+
1. Its API is quite large and, while intuitive for simple tasks, provides a lot of surprises as well.
|
|
35
|
+
2. While its `MultiIndex` technically supports very complex labeling, this is not the most intuitive part of the API and its functionality is often not effectively used.
|
|
36
|
+
3. It only supports 2-dimensional data.
|
|
37
|
+
4. It is not very performance-minded: many of its operations (especially around the index) perform "magic" such as automatically reordering data or extending it with missing values. This also makes it hard to predictably interact with other libraries without a full conversion step.
|
|
38
|
+
|
|
39
|
+
We are of the opinion the relational model of the DataFrame as promoted by Polars and similar libraries is the better fit, and as such this library aims to maintain a clear separation between the array part and the relational part.
|
|
40
|
+
|
|
41
|
+
It is also quite similar to [Xarray](https://docs.xarray.dev/en/stable/), which provides a very flexible "labeled array" container, but there are some conceptual differences. These lie mostly in the intended scope of the library, which leads to some different design decisions:
|
|
42
|
+
|
|
43
|
+
1. `polder` aims to be purely an orchestration layer over existing array and data processing libraries, so that all significant processing work is dispatched to those, in order to make use of their efficient implementations.
|
|
44
|
+
2. `polder` aims to provide a simple data model, which allows converting array-based code without having to reconsider your data modelling approach. In other words, you should be able to take an unlabeled array, and it should be "obvious" from context which labels to assign to the entries.
|
|
45
|
+
3. `polder` aims to support a larger range of array/DataFrame backends, in order to enable the use of the library in various contexts (for example, as part of adding structure to a GPU-backed numerical algorithm, or to perform automatic differentiation through a labeled computation).
|
|
46
|
+
4. `polder` aims to not have any performance surprises compared to the backend libraries. For example, if you have a Numpy-backed array, doing `array[3:5]` should return a view, doing `array[[5, 7]]` should return a copy. The only exception to this is that many operations perform *alignment*: `arr1 + arr2` will ensure that the labels for both are aligned, otherwise the result is not meaningful. This auto-alignment can be disabled, in case you want to avoid all unexpected performance regressions.
|
|
47
|
+
5. `polder` doesn't try to provide a universal data model: if you have data that doesn't nicely fit into an array structure or you want to perform certain operations that are not well defined in terms of array shapes (such as binning), you might be better off just using a DataFrame library.
|
|
48
|
+
|
|
49
|
+
## Status
|
|
50
|
+
|
|
51
|
+
At the center of the library is the `FrameLabeledArray` protocol, which is what user code should be written against. This protocol currently supports a number of basic functionalities:
|
|
52
|
+
- Decomposing into `values` (array-like) and `labels` (sequence of DataFrame-likes).
|
|
53
|
+
- Indexing (NumPy-style), which also supports indexing using Narwhals expressions.
|
|
54
|
+
- Pivoting: splitting a single axis into multiple, by orthogonally decomposing the labels into separate dimensions corresponding to different label columns (and the reverse, "unpivoting").
|
|
55
|
+
- All special (dunder) operations as defined in the [array API](https://data-apis.org/array-api/latest/).
|
|
56
|
+
|
|
57
|
+
This protocol is supported by a number of generic operations:
|
|
58
|
+
- Creation: `pld.from_values_and_labels`, `pld.from_frame`
|
|
59
|
+
- Alignment: reordering multiple arrays so that their labels match up (this usually happens automatically when performing other operations).
|
|
60
|
+
- Unary elementwise operations as defined in the [array API](https://data-apis.org/array-api/latest/) (e.g. `pld.sin(arr)`).
|
|
61
|
+
|
|
62
|
+
Then, there are currently two implementations of the protocol:
|
|
63
|
+
|
|
64
|
+
1. An implementation that is eager (every operation is fully resolved immediately) and in principle supports any array that follows the [array API](https://data-apis.org/array-api/latest/). Currently there is "real" support (i.e. with tests and proper typing) for NumPy and JAX arrays. Note that JAX is an optional dependency, so if you are using the library you have to install it manually, but this is usually already the case (how else would you pass in a JAX array?). This implementation supports all functionality defined in the protocol.
|
|
65
|
+
2. An implementation that is entirely backed by Narwhals LazyFrames, and evaluates all operations lazily. This is useful when you only want to express your operations in an array-style and don't need any special interop with array-based libraries, but either don't care about the way the values are stored, or already have your data in a DataFrame format supported by Narwhals and would like to keep all operations within that format. Using this implementation you can keep your data in the same backend it already is, and use whatever query engine that backend has to optimize large computations. This implementation is in early development and does not yet support all functionality.
|
|
66
|
+
|
|
67
|
+
The advantage of having multiple implementations is also that you can convert easily between them. It might make sense to first perform some processing fully in the relational context, so more of the computation can be efficiently handled by a single query engine, and then switch over to the hybrid model when some interop with array-libraries is important.
|
|
68
|
+
|
|
69
|
+
Ideas for future implementations are:
|
|
70
|
+
|
|
71
|
+
1. An implementation that acts eagerly on arrays, but performs lazy indexing/reordering of values. This will make use of the Narwhals lazy API for the labels, allowing you to e.g. perform multiple slicing/reshaping operations without making excess copies of the array values. Since a lot of code has a "transformation → computation → transformation" flow, where the transformation steps are mostly reordering data but not changing the values, and the computation wants to keep the data in its most efficient shape, this handoff can be a natural transition point between the relational operations on the metadata and the array-based operations in the computational core.
|
|
72
|
+
|
|
73
|
+
## Development guidelines
|
|
74
|
+
|
|
75
|
+
There are a few structural guidelines for the development of this library.
|
|
76
|
+
|
|
77
|
+
### The labeled array protocol
|
|
78
|
+
|
|
79
|
+
The main data container is `FrameLabeledArray`, which is provided as a protocol. The idea is that users can write code against this protocol, and then this code can be run against various implementations of the protocol.
|
|
80
|
+
|
|
81
|
+
To be more flexible in code reuse vs specialization for efficiency, the various implementations don't inherit from a common ancestor. Instead they all implement the protocol, but are free to vary in implementation arbitrarily. As such the decision whether something should be in the protocol or not is quite significant, as it should generalize to all implementations, and it will also mean that users will write code against it. For those reasons stability of the protocol is quite important.
|
|
82
|
+
|
|
83
|
+
The protocol should mostly follow the [array API](https://data-apis.org/array-api/latest/), since the labeled array objects are arrays first. Additional functionality (such as `align` or `pivot`) is defined if it makes sense for the format. Some array API functionality is also extended, such as indexing with an expression (that will filter the labels) or reshaping along label dimensions instead of giving an explicit shape.
|
|
84
|
+
|
|
85
|
+
To support all kinds of backend libraries, the protocol is limited to only those functionalities that make sense in every paradigm. Particularly, all arrays are immutable, with the expectation that for high performance code an accelerator library is used. In addition, because the degree of "laziness" may vary between implementations, this cannot easily be made explicit in the protocol. Instead you should expect computations to be resolved as soon as the array values are converted to an eager form, for example when you call `array.values()`.
|
|
86
|
+
|
|
87
|
+
Because of the heavy reliance on protocols/structural typing, all library code should be fully typed.
|
|
88
|
+
|
|
89
|
+
### Code structure
|
|
90
|
+
|
|
91
|
+
The structure of the repo is roughly as follows:
|
|
92
|
+
|
|
93
|
+
* The generic protocols are defined in the `protocols` subpackage.
|
|
94
|
+
* Implementations get their own subpackage, for example anything related to the eager implementation is stored in the `eager` subpackage.
|
|
95
|
+
* Generic operations defined for `FrameLabeledArray` (that either have a generic implementation or dispatch to more specialized ones based on type) are defined in the `operations` subpackage.
|
|
96
|
+
* Common operations are re-exported in the top-level `polder` package for ease of use.
|
|
97
|
+
|
|
98
|
+
### Style
|
|
99
|
+
|
|
100
|
+
* All code is autoformatted using Ruff.
|
|
101
|
+
* Tests are written using pytest.
|
|
102
|
+
* Specific backend libraries should not be requirements for end-users, they should always be dynamically imported, with a few exceptions (currently `narwhals` and `numpy`).
|
|
103
|
+
* The recommended usage is `import polder as pld` (similar to other libraries).
|
|
104
|
+
* When writing comments, please use full sentences and always end with a period. Comments that are a single phrase don't need a period (think section header). Always put comments on a separate line, not after code.
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "polder"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Arrays with DataFrame axis labels"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [{ name = "Nardi Lam", email = "mail@nardilam.nl" }]
|
|
7
|
+
requires-python = ">=3.11"
|
|
8
|
+
dependencies = [
|
|
9
|
+
"array-api-compat==1.11.2",
|
|
10
|
+
"array-api-extra>=0.7.2",
|
|
11
|
+
"immutabledict>=4.3.1",
|
|
12
|
+
"narwhals>=2.18.1",
|
|
13
|
+
"numpy>=2.4.4",
|
|
14
|
+
"ordered-set>=4.1.0",
|
|
15
|
+
"types-array-api>=1.1.4",
|
|
16
|
+
"typing-extensions>=4.15.0",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
[build-system]
|
|
20
|
+
requires = ["uv_build>=0.11.2,<0.12"]
|
|
21
|
+
build-backend = "uv_build"
|
|
22
|
+
|
|
23
|
+
[dependency-groups]
|
|
24
|
+
docs = ["zensical>=0.0.46", "mkdocstrings-python>=2.0.5"]
|
|
25
|
+
dev = [
|
|
26
|
+
"polars>=1.39.3",
|
|
27
|
+
"pytest>=9.0.2",
|
|
28
|
+
"jax>=0.10.0",
|
|
29
|
+
"pytest-markdown-docs>=0.9.2",
|
|
30
|
+
{ include-group = "docs" },
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[tool.ruff.format]
|
|
34
|
+
preview = true
|
|
35
|
+
|
|
36
|
+
[tool.ruff.lint]
|
|
37
|
+
select = ["E4", "E7", "E9", "F", "I"]
|
|
38
|
+
ignore = ["E731"]
|
|
39
|
+
|
|
40
|
+
[tool.pytest.ini_options]
|
|
41
|
+
# The docs pages are executable: every Python code block is run as a test by
|
|
42
|
+
# pytest-markdown-docs, so the examples stay reproducible.
|
|
43
|
+
addopts = [
|
|
44
|
+
"--import-mode=importlib",
|
|
45
|
+
"--markdown-docs",
|
|
46
|
+
"--markdown-docs-syntax=superfences",
|
|
47
|
+
]
|
|
48
|
+
testpaths = ["tests", "docs"]
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
from polder import config
|
|
2
|
+
from polder.eager.array import EagerFrameLabeledArray
|
|
3
|
+
from polder.lazy.array import LazyFrameLabeledArray
|
|
4
|
+
from polder.operations.conversion import convert
|
|
5
|
+
from polder.operations.creation import from_frame, from_values_and_labels
|
|
6
|
+
from polder.operations.unary import (
|
|
7
|
+
abs_,
|
|
8
|
+
acos,
|
|
9
|
+
acosh,
|
|
10
|
+
asin,
|
|
11
|
+
asinh,
|
|
12
|
+
atan,
|
|
13
|
+
atanh,
|
|
14
|
+
bitwise_invert,
|
|
15
|
+
ceil,
|
|
16
|
+
conj,
|
|
17
|
+
cos,
|
|
18
|
+
cosh,
|
|
19
|
+
exp,
|
|
20
|
+
expm1,
|
|
21
|
+
floor,
|
|
22
|
+
imag,
|
|
23
|
+
invert,
|
|
24
|
+
isfinite,
|
|
25
|
+
isinf,
|
|
26
|
+
isnan,
|
|
27
|
+
log,
|
|
28
|
+
log1p,
|
|
29
|
+
log2,
|
|
30
|
+
log10,
|
|
31
|
+
logical_not,
|
|
32
|
+
neg,
|
|
33
|
+
pos,
|
|
34
|
+
real,
|
|
35
|
+
reciprocal,
|
|
36
|
+
round_,
|
|
37
|
+
sign,
|
|
38
|
+
signbit,
|
|
39
|
+
sin,
|
|
40
|
+
sinh,
|
|
41
|
+
sqrt,
|
|
42
|
+
square,
|
|
43
|
+
tan,
|
|
44
|
+
tanh,
|
|
45
|
+
trunc,
|
|
46
|
+
)
|
|
47
|
+
from polder.protocols.array import FrameLabeledArray
|
|
48
|
+
from polder.protocols.implementations import (
|
|
49
|
+
EAGER,
|
|
50
|
+
LAZY,
|
|
51
|
+
FrameLabeledArrayImplementation,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
__all__ = [
|
|
55
|
+
# Config interface
|
|
56
|
+
"config",
|
|
57
|
+
# Implementations
|
|
58
|
+
"EagerFrameLabeledArray",
|
|
59
|
+
"LazyFrameLabeledArray",
|
|
60
|
+
# Creation operations
|
|
61
|
+
"from_frame",
|
|
62
|
+
"from_values_and_labels",
|
|
63
|
+
# Conversion operations
|
|
64
|
+
"convert",
|
|
65
|
+
# Unary operations
|
|
66
|
+
"pos",
|
|
67
|
+
"neg",
|
|
68
|
+
"abs_",
|
|
69
|
+
"invert",
|
|
70
|
+
"acos",
|
|
71
|
+
"acosh",
|
|
72
|
+
"asin",
|
|
73
|
+
"asinh",
|
|
74
|
+
"atan",
|
|
75
|
+
"atanh",
|
|
76
|
+
"bitwise_invert",
|
|
77
|
+
"ceil",
|
|
78
|
+
"conj",
|
|
79
|
+
"cos",
|
|
80
|
+
"cosh",
|
|
81
|
+
"exp",
|
|
82
|
+
"expm1",
|
|
83
|
+
"floor",
|
|
84
|
+
"imag",
|
|
85
|
+
"isfinite",
|
|
86
|
+
"isinf",
|
|
87
|
+
"isnan",
|
|
88
|
+
"log",
|
|
89
|
+
"log1p",
|
|
90
|
+
"log2",
|
|
91
|
+
"log10",
|
|
92
|
+
"logical_not",
|
|
93
|
+
"real",
|
|
94
|
+
"reciprocal",
|
|
95
|
+
"round_",
|
|
96
|
+
"sign",
|
|
97
|
+
"signbit",
|
|
98
|
+
"sin",
|
|
99
|
+
"sinh",
|
|
100
|
+
"square",
|
|
101
|
+
"sqrt",
|
|
102
|
+
"tan",
|
|
103
|
+
"tanh",
|
|
104
|
+
"trunc",
|
|
105
|
+
# Protocol-related objects
|
|
106
|
+
"FrameLabeledArray",
|
|
107
|
+
"EAGER",
|
|
108
|
+
"LAZY",
|
|
109
|
+
"FrameLabeledArrayImplementation",
|
|
110
|
+
]
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""Global configuration settings for polder."""
|
|
2
|
+
|
|
3
|
+
from contextlib import _GeneratorContextManager, contextmanager
|
|
4
|
+
from contextvars import ContextVar
|
|
5
|
+
from typing import Generator, overload
|
|
6
|
+
|
|
7
|
+
_auto_align: ContextVar[bool] = ContextVar("auto_align", default=True)
|
|
8
|
+
"""When performing operations with eager arrays, alignment is performed
|
|
9
|
+
automatically if this setting is true."""
|
|
10
|
+
|
|
11
|
+
_use_eager_evaluation_for_lazy_arrays: ContextVar[bool] = ContextVar(
|
|
12
|
+
"use_eager_evaluation_for_lazy_arrays", default=False
|
|
13
|
+
)
|
|
14
|
+
"""Use DataFrames instead of LazyFrames for lazy arrays. This can be useful for
|
|
15
|
+
testing purposes, because errors will surface more quickly and closer to where
|
|
16
|
+
they originate."""
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@overload
|
|
20
|
+
def auto_align() -> bool: ...
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@overload
|
|
24
|
+
def auto_align(enable: bool) -> _GeneratorContextManager[None, None, None]: ...
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@overload
|
|
28
|
+
def use_eager_evaluation_for_lazy_arrays() -> bool: ...
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@overload
|
|
32
|
+
def use_eager_evaluation_for_lazy_arrays(
|
|
33
|
+
enable: bool,
|
|
34
|
+
) -> _GeneratorContextManager[None, None, None]: ...
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def auto_align(
|
|
38
|
+
enable: bool | None = None,
|
|
39
|
+
) -> bool | _GeneratorContextManager[None, None, None]:
|
|
40
|
+
"""When performing operations with eager arrays, alignment is performed
|
|
41
|
+
automatically if this setting is true.
|
|
42
|
+
|
|
43
|
+
When called without arguments, returns the current auto_align value.
|
|
44
|
+
When called with an argument, returns a context manager during which the
|
|
45
|
+
setting has the provided value.
|
|
46
|
+
|
|
47
|
+
Args:
|
|
48
|
+
enable: Whether to enable or disable automatic alignment of arrays in binary
|
|
49
|
+
operations. When False, alignment is only checked but not performed. When
|
|
50
|
+
None (the default), the current value is returned instead.
|
|
51
|
+
|
|
52
|
+
Returns:
|
|
53
|
+
The current setting as a bool when enable is None, otherwise a context manager
|
|
54
|
+
during which the setting has the provided value.
|
|
55
|
+
|
|
56
|
+
Example:
|
|
57
|
+
```python
|
|
58
|
+
import polder as pld
|
|
59
|
+
|
|
60
|
+
# Get current setting
|
|
61
|
+
current = pld.config.auto_align()
|
|
62
|
+
|
|
63
|
+
# Disable auto-alignment temporarily
|
|
64
|
+
with pld.config.auto_align(False):
|
|
65
|
+
result = arr1 + arr2 # Only checks if alignment is needed
|
|
66
|
+
```
|
|
67
|
+
"""
|
|
68
|
+
if enable is None:
|
|
69
|
+
# Get current value
|
|
70
|
+
return _auto_align.get()
|
|
71
|
+
else:
|
|
72
|
+
# Return context manager
|
|
73
|
+
@contextmanager
|
|
74
|
+
def _context_manager() -> Generator[None, None, None]:
|
|
75
|
+
token = _auto_align.set(enable)
|
|
76
|
+
try:
|
|
77
|
+
yield
|
|
78
|
+
finally:
|
|
79
|
+
_auto_align.reset(token)
|
|
80
|
+
|
|
81
|
+
return _context_manager()
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def use_eager_evaluation_for_lazy_arrays(
|
|
85
|
+
enable: bool | None = None,
|
|
86
|
+
) -> bool | _GeneratorContextManager[None, None, None]:
|
|
87
|
+
"""Use DataFrames instead of LazyFrames for lazy arrays. This can be useful for
|
|
88
|
+
testing purposes, because errors will surface more quickly and closer to where
|
|
89
|
+
they originate.
|
|
90
|
+
|
|
91
|
+
Note that some errors will still only surface lazily. For example an
|
|
92
|
+
"invalid shape" error may only arise when the shape is extracted, not on the
|
|
93
|
+
operation that produces the invalid shape.
|
|
94
|
+
|
|
95
|
+
When called without arguments, returns the current setting value.
|
|
96
|
+
When called with an argument, returns a context manager during which the
|
|
97
|
+
setting has the provided value.
|
|
98
|
+
|
|
99
|
+
Args:
|
|
100
|
+
enable: Whether to enable eager evaluation for lazy arrays. When None (the
|
|
101
|
+
default), the current value is returned instead.
|
|
102
|
+
|
|
103
|
+
Returns:
|
|
104
|
+
The current setting as a bool when enable is None, otherwise a context manager
|
|
105
|
+
during which the setting has the provided value.
|
|
106
|
+
|
|
107
|
+
Example:
|
|
108
|
+
```python
|
|
109
|
+
import polder as pld
|
|
110
|
+
|
|
111
|
+
with pld.config.use_eager_evaluation_for_lazy_arrays(True):
|
|
112
|
+
lazy_array = pld.from_values_and_labels(values, labels, implementation=LAZY)
|
|
113
|
+
```
|
|
114
|
+
"""
|
|
115
|
+
if enable is None:
|
|
116
|
+
# Get current value
|
|
117
|
+
return _use_eager_evaluation_for_lazy_arrays.get()
|
|
118
|
+
else:
|
|
119
|
+
# Return context manager
|
|
120
|
+
@contextmanager
|
|
121
|
+
def _context_manager() -> Generator[None, None, None]:
|
|
122
|
+
token = _use_eager_evaluation_for_lazy_arrays.set(enable)
|
|
123
|
+
try:
|
|
124
|
+
yield
|
|
125
|
+
finally:
|
|
126
|
+
_use_eager_evaluation_for_lazy_arrays.reset(token)
|
|
127
|
+
|
|
128
|
+
return _context_manager()
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# Public names of this module: the two setting accessors. The underlying
# ContextVar objects are private and intentionally not listed.
__all__ = [
    "auto_align",
    "use_eager_evaluation_for_lazy_arrays",
]
|
|
File without changes
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import narwhals as nw
|
|
2
|
+
import narwhals.typing as nwt
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def narwhals_df_equals(l1: nwt.DataFrameT, l2: nwt.DataFrameT) -> bool:
    """Return whether two DataFrames are equal.

    Two frames count as equal when they have the same type, the same columns in
    the same order, and the same rows in the same order.

    Args:
        l1: The first DataFrame. Must not contain a column named "__index".
        l2: The second DataFrame.

    Returns:
        True if both frames are equal in the sense described above.
    """
    # Cheap structural checks first: differing type, columns or row count
    # already rule out equality.
    if type(l1) is not type(l2):
        return False
    if l1.columns != l2.columns:
        return False
    if len(l1) != len(l2):
        return False

    # "__index" is used as a temporary row-number column below, so it must not
    # clash with an existing column.
    assert "__index" not in l1.columns

    # Tag every row with its position and full-join the two frames on all
    # columns plus that position. Rows that agree pair up one-to-one; any row
    # that differs stays unmatched and adds an extra row to the join result.
    # NOTE(review): null values in join keys usually do not match each other,
    # so frames containing nulls may compare as unequal - confirm this is intended.
    left = l1.with_row_index("__index").lazy()
    right = l2.with_row_index("__index").lazy()
    joined = left.join(right, on=[*l1.columns, "__index"], how="full")

    # Rows coming only from the right side have a null "__index"; filling the
    # nulls first makes count() include them.
    joined_row_count = (
        joined.select(nw.col("__index").fill_null(-1).count()).collect().item()
    )
    return len(l1) == joined_row_count
|