polder 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. polder-0.1.0/PKG-INFO +121 -0
  2. polder-0.1.0/README.md +104 -0
  3. polder-0.1.0/pyproject.toml +48 -0
  4. polder-0.1.0/src/polder/__init__.py +110 -0
  5. polder-0.1.0/src/polder/config/__init__.py +134 -0
  6. polder-0.1.0/src/polder/eager/__init__.py +0 -0
  7. polder-0.1.0/src/polder/eager/_narwhals_df_equals.py +26 -0
  8. polder-0.1.0/src/polder/eager/align.py +188 -0
  9. polder-0.1.0/src/polder/eager/array.py +248 -0
  10. polder-0.1.0/src/polder/eager/binary.py +140 -0
  11. polder-0.1.0/src/polder/eager/labels.py +33 -0
  12. polder-0.1.0/src/polder/eager/pivot.py +270 -0
  13. polder-0.1.0/src/polder/eager/unary.py +134 -0
  14. polder-0.1.0/src/polder/eager/value_array.py +105 -0
  15. polder-0.1.0/src/polder/lazy/__init__.py +0 -0
  16. polder-0.1.0/src/polder/lazy/align.py +188 -0
  17. polder-0.1.0/src/polder/lazy/array.py +339 -0
  18. polder-0.1.0/src/polder/lazy/binary.py +154 -0
  19. polder-0.1.0/src/polder/lazy/pivot.py +366 -0
  20. polder-0.1.0/src/polder/lazy/unary.py +90 -0
  21. polder-0.1.0/src/polder/operations/__init__.py +0 -0
  22. polder-0.1.0/src/polder/operations/align.py +59 -0
  23. polder-0.1.0/src/polder/operations/conversion.py +123 -0
  24. polder-0.1.0/src/polder/operations/creation.py +166 -0
  25. polder-0.1.0/src/polder/operations/unary.py +102 -0
  26. polder-0.1.0/src/polder/protocols/__init__.py +0 -0
  27. polder-0.1.0/src/polder/protocols/array.py +221 -0
  28. polder-0.1.0/src/polder/protocols/descriptor.py +10 -0
  29. polder-0.1.0/src/polder/protocols/implementations.py +16 -0
  30. polder-0.1.0/src/polder/py.typed +0 -0
  31. polder-0.1.0/src/polder/utils/__init__.py +0 -0
  32. polder-0.1.0/src/polder/utils/indexer.py +54 -0
  33. polder-0.1.0/src/polder/utils/orderedset.py +12 -0
polder-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,121 @@
1
+ Metadata-Version: 2.3
2
+ Name: polder
3
+ Version: 0.1.0
4
+ Summary: Arrays with DataFrame axis labels
5
+ Author: Nardi Lam
6
+ Author-email: Nardi Lam <mail@nardilam.nl>
7
+ Requires-Dist: array-api-compat==1.11.2
8
+ Requires-Dist: array-api-extra>=0.7.2
9
+ Requires-Dist: immutabledict>=4.3.1
10
+ Requires-Dist: narwhals>=2.18.1
11
+ Requires-Dist: numpy>=2.4.4
12
+ Requires-Dist: ordered-set>=4.1.0
13
+ Requires-Dist: types-array-api>=1.1.4
14
+ Requires-Dist: typing-extensions>=4.15.0
15
+ Requires-Python: >=3.11
16
+ Description-Content-Type: text/markdown
17
+
18
+ <div align="center">
19
+
20
+ <h2 align="center">
21
+ <code>polder</code> - Finding the middle-ground between array and relational data models
22
+ </h2>
23
+
24
+
25
+
26
+ </div>
27
+
28
+ ## Introduction
29
+
30
+ `polder` is a Python package that provides a simple composite data container type:
31
+
32
+ 1. a Numpy-style ([array API](https://data-apis.org/array-api/latest/)) array,
33
+ 2. whose entries are labeled using Polars-compatible ([Narwhals](https://narwhals-dev.github.io/narwhals/)) DataFrames.
34
+
35
+ In other words, it lets you bundle your *data* and your *metadata* into a single container, using an array-based library to handle the data, and a relational library to handle the metadata.
36
+
37
+ Install with pip:
38
+
39
+ ```bash
40
+ pip install polder
41
+ ```
42
+
43
+ and [read the documentation here](https://nardi.github.io/polder).
44
+
45
+ ### Why?
46
+
47
+ The common situation in which this is useful is when you have a Numpy(-like) array, but find yourself having difficulties keeping track of what the values represent. You might find yourself creating an auxiliary data structure for this, where you keep some list of labels next to your array, so that you avoid, for example, combining two arrays with the same values but in a different order. Then, you might end up having to write extra code keeping track of these labels throughout your computation. `polder` aims to provide this data structure in a more general way that you can adapt to the needs of your application.
48
+
49
+ This is similar to the [pandas](https://pandas.pydata.org/) idea of a DataFrame, as a Numpy array with labels. With pandas you can take a 2D Numpy array and with its `Index`es apply arbitrarily complex labels on both axes (rows and columns). However, it falls short in a number of ways:
50
+
51
+ 1. Its API is quite large and, while intuitive for simple tasks, provides a lot of surprises as well.
52
+ 2. While its `MultiIndex` technically supports very complex labeling, this is not the most intuitive part of the API and its functionality is often not effectively used.
53
+ 3. It only supports 2-dimensional data.
54
+ 4. It is not very performance-minded: many of its operations (especially around the index) perform "magic" such as automatically reordering data or extending it with missing values. This also makes it hard to predictably interact with other libraries without a full conversion step.
55
+
56
+ We are of the opinion the relational model of the DataFrame as promoted by Polars and similar libraries is the better fit, and as such this library aims to maintain a clear separation between the array part and the relational part.
57
+
58
+ It is also quite similar to [Xarray](https://docs.xarray.dev/en/stable/), which provides a very flexible "labeled array" container, but there are some conceptual differences. These lie mostly in the intended scope of the library, which leads to some different design decisions:
59
+
60
+ 1. `polder` aims to be purely an orchestration layer over existing array and data processing libraries, so that all significant processing work is dispatched to those, in order to make use of their efficient implementations.
61
+ 2. `polder` aims to provide a simple data model, which allows converting array-based code without having to reconsider your data modelling approach. In other words, you should be able to take an unlabeled array, and it should be "obvious" from context which labels to assign to the entries.
62
+ 3. `polder` aims to support a larger range of array/DataFrame backends, in order to enable the use of the library in various contexts (for example, as part of adding structure to a GPU-backed numerical algorithm, or to perform automatic differentiation through a labeled computation).
63
+ 4. `polder` aims to not have any performance surprises compared to the backend libraries. For example, if you have a Numpy-backed array, doing `array[3:5]` should return a view, doing `array[[5, 7]]` should return a copy. The only exception to this is that many operations perform *alignment*: `arr1 + arr2` will ensure that the labels for both are aligned, otherwise the result is not meaningful. This auto-alignment can be disabled, in case you want to avoid all unexpected performance regressions.
64
+ 5. `polder` doesn't try to provide a universal data model: if you have data that doesn't nicely fit into an array structure or you want to perform certain operations that are not well defined in terms of array shapes (such as binning), you might be better off just using a DataFrame library.
65
+
66
+ ## Status
67
+
68
+ At the center of the library is the `FrameLabeledArray` protocol, which is what user code should be written against. This protocol currently supports a number of basic functionalities:
69
+ - Decomposing into `values` (array-like) and `labels` (sequence of DataFrame-likes).
70
+ - Indexing (NumPy-style), which also supports indexing using Narwhals expressions.
71
+ - Pivoting: splitting a single axis into multiple, by orthogonally decomposing the labels into separate dimensions corresponding to different label columns (and the reverse, "unpivoting").
72
+ - All special (dunder) operations as defined in the [array API](https://data-apis.org/array-api/latest/).
73
+
74
+ This protocol is supported by a number of generic operations:
75
+ - Creation: `pld.from_values_and_labels`, `pld.from_frame`
76
+ - Alignment: reordering multiple arrays so that their labels match up (this usually happens automatically when performing other operations).
77
+ - Unary elementwise operations as defined in the [array API](https://data-apis.org/array-api/latest/) (e.g. `pld.sin(arr)`).
78
+
79
+ Then, there are currently two implementations of the protocol:
80
+
81
+ 1. An implementation that is eager (every operation is fully resolved immediately) and in principle supports any array that follows the [array API](https://data-apis.org/array-api/latest/). Currently there is "real" support (i.e. with tests and proper typing) for NumPy and JAX arrays. Note that JAX is an optional dependency, so if you are using the library you have to install it manually, but this is usually already the case (how else would you pass in a JAX array?). This implementation supports all functionality defined in the protocol.
82
+ 2. An implementation that is entirely backed by Narwhals LazyFrames, and evaluates all operations lazily. This is useful when you only want to express your operations in an array-style and don't need any special interop with array-based libraries, but either don't care about the way the values are stored, or already have your data in a DataFrame format supported by Narwhals and would like to keep all operations within that format. Using this implementation you can keep your data in the same backend it already is, and use whatever query engine that backend has to optimize large computations. This implementation is in early development and does not yet support all functionality.
83
+
84
+ The advantage of having multiple implementations is also that you can convert easily between them. It might make sense to first perform some processing fully in the relational context, so more of the computation can be efficiently handled by a single query engine, and then switch over to the hybrid model when some interop with array-libraries is important.
85
+
86
+ Ideas for future implementations are:
87
+
88
+ 1. An implementation that acts eagerly on arrays, but performs lazy indexing/reordering of values. This will make use of Narwhals lazy API for the labels, allowing to e.g. perform multiple slicing/reshaping operations without making excess copies of the array values. Since a lot of code has a "transformation → computation → transformation" flow, where the transformation steps are mostly reordering data but not changing the values, and the computation wants to keep the data in its most efficient shape, this handoff can be a natural transition point between the relational operations on the metadata and the array-based operations in the computational core.
89
+
90
+ ## Development guidelines
91
+
92
+ There are a few structural guidelines for the development of this library.
93
+
94
+ ### The labeled array protocol
95
+
96
+ The main data container is `FrameLabeledArray`, which is provided as a protocol. The idea is that users can write code against this protocol, and then this code can be run against various implementations of the protocol.
97
+
98
+ To be more flexible in code reuse vs specialization for efficiency, the various implementations don't inherit from a common ancestor. Instead they all implement the protocol, but are free to vary in implementation arbitrarily. As such the decision whether something should be in the protocol or not is quite significant, as it should generalize to all implementations, and it will also mean that users will write code against it. For those reasons stability of the protocol is quite important.
99
+
100
+ The protocol should mostly follow the [array API](https://data-apis.org/array-api/latest/), since the labeled array objects are arrays first. Additional functionality (such as `align` or `pivot`) is defined if it makes sense for the format. Some array API functionality is also extended, such as indexing with an expression (that will filter the labels) or reshaping along label dimensions instead of giving an explicit shape.
101
+
102
+ To support all kinds of backend libraries, the protocol is limited to only those functionalities that make sense in every paradigm. Particularly, all arrays are immutable, with the expectation that for high performance code an accelerator library is used. In addition, because the degree of "laziness" may vary between implementations, this cannot easily be made explicit in the protocol. Instead you should expect computations to be resolved as soon as the array values are converted to an eager form, for example when you call `array.values()`.
103
+
104
+ Because of the heavy reliance on protocols/structural typing, all library code should be fully typed.
105
+
106
+ ### Code structure
107
+
108
+ The structure of the repo is roughly as follows:
109
+
110
+ * The generic protocols are defined in the `protocols` subpackage.
111
+ * Implementations get their own subpackage, for example anything related to the eager implementation is stored in the `eager` subpackage.
112
+ * Generic operations defined for `FrameLabeledArray` (that either have a generic implementation or dispatch to more specialized ones based on type) are defined in the `operations` subpackage.
113
+ * Common operations are re-exported in the top-level `polder` package for ease of use.
114
+
115
+ ### Style
116
+
117
+ * All code is autoformatted using Ruff.
118
+ * Tests are written using pytest.
119
+ * Specific backend libraries should not be requirements for end-users, they should always be dynamically imported, with a few exceptions (currently `narwhals` and `numpy`).
120
+ * The recommended usage is `import polder as pld` (similar to other libraries).
121
+ * When writing comments, please use full sentences and always end with a period. Comments that are a single phrase don't need a period (think section header). Always put comments on a separate line, not after code.
polder-0.1.0/README.md ADDED
@@ -0,0 +1,104 @@
1
+ <div align="center">
2
+
3
+ <h2 align="center">
4
+ <code>polder</code> - Finding the middle-ground between array and relational data models
5
+ </h2>
6
+
7
+
8
+
9
+ </div>
10
+
11
+ ## Introduction
12
+
13
+ `polder` is a Python package that provides a simple composite data container type:
14
+
15
+ 1. a Numpy-style ([array API](https://data-apis.org/array-api/latest/)) array,
16
+ 2. whose entries are labeled using Polars-compatible ([Narwhals](https://narwhals-dev.github.io/narwhals/)) DataFrames.
17
+
18
+ In other words, it lets you bundle your *data* and your *metadata* into a single container, using an array-based library to handle the data, and a relational library to handle the metadata.
19
+
20
+ Install with pip:
21
+
22
+ ```bash
23
+ pip install polder
24
+ ```
25
+
26
+ and [read the documentation here](https://nardi.github.io/polder).
27
+
28
+ ### Why?
29
+
30
+ The common situation in which this is useful is when you have a Numpy(-like) array, but find yourself having difficulties keeping track of what the values represent. You might find yourself creating an auxiliary data structure for this, where you keep some list of labels next to your array, so that you avoid, for example, combining two arrays with the same values but in a different order. Then, you might end up having to write extra code keeping track of these labels throughout your computation. `polder` aims to provide this data structure in a more general way that you can adapt to the needs of your application.
31
+
32
+ This is similar to the [pandas](https://pandas.pydata.org/) idea of a DataFrame, as a Numpy array with labels. With pandas you can take a 2D Numpy array and with its `Index`es apply arbitrarily complex labels on both axes (rows and columns). However, it falls short in a number of ways:
33
+
34
+ 1. Its API is quite large and, while intuitive for simple tasks, provides a lot of surprises as well.
35
+ 2. While its `MultiIndex` technically supports very complex labeling, this is not the most intuitive part of the API and its functionality is often not effectively used.
36
+ 3. It only supports 2-dimensional data.
37
+ 4. It is not very performance-minded: many of its operations (especially around the index) perform "magic" such as automatically reordering data or extending it with missing values. This also makes it hard to predictably interact with other libraries without a full conversion step.
38
+
39
+ We are of the opinion the relational model of the DataFrame as promoted by Polars and similar libraries is the better fit, and as such this library aims to maintain a clear separation between the array part and the relational part.
40
+
41
+ It is also quite similar to [Xarray](https://docs.xarray.dev/en/stable/), which provides a very flexible "labeled array" container, but there are some conceptual differences. These lie mostly in the intended scope of the library, which leads to some different design decisions:
42
+
43
+ 1. `polder` aims to be purely an orchestration layer over existing array and data processing libraries, so that all significant processing work is dispatched to those, in order to make use of their efficient implementations.
44
+ 2. `polder` aims to provide a simple data model, which allows converting array-based code without having to reconsider your data modelling approach. In other words, you should be able to take an unlabeled array, and it should be "obvious" from context which labels to assign to the entries.
45
+ 3. `polder` aims to support a larger range of array/DataFrame backends, in order to enable the use of the library in various contexts (for example, as part of adding structure to a GPU-backed numerical algorithm, or to perform automatic differentiation through a labeled computation).
46
+ 4. `polder` aims to not have any performance surprises compared to the backend libraries. For example, if you have a Numpy-backed array, doing `array[3:5]` should return a view, doing `array[[5, 7]]` should return a copy. The only exception to this is that many operations perform *alignment*: `arr1 + arr2` will ensure that the labels for both are aligned, otherwise the result is not meaningful. This auto-alignment can be disabled, in case you want to avoid all unexpected performance regressions.
47
+ 5. `polder` doesn't try to provide a universal data model: if you have data that doesn't nicely fit into an array structure or you want to perform certain operations that are not well defined in terms of array shapes (such as binning), you might be better off just using a DataFrame library.
48
+
49
+ ## Status
50
+
51
+ At the center of the library is the `FrameLabeledArray` protocol, which is what user code should be written against. This protocol currently supports a number of basic functionalities:
52
+ - Decomposing into `values` (array-like) and `labels` (sequence of DataFrame-likes).
53
+ - Indexing (NumPy-style), which also supports indexing using Narwhals expressions.
54
+ - Pivoting: splitting a single axis into multiple, by orthogonally decomposing the labels into separate dimensions corresponding to different label columns (and the reverse, "unpivoting").
55
+ - All special (dunder) operations as defined in the [array API](https://data-apis.org/array-api/latest/).
56
+
57
+ This protocol is supported by a number of generic operations:
58
+ - Creation: `pld.from_values_and_labels`, `pld.from_frame`
59
+ - Alignment: reordering multiple arrays so that their labels match up (this usually happens automatically when performing other operations).
60
+ - Unary elementwise operations as defined in the [array API](https://data-apis.org/array-api/latest/) (e.g. `pld.sin(arr)`).
61
+
62
+ Then, there are currently two implementations of the protocol:
63
+
64
+ 1. An implementation that is eager (every operation is fully resolved immediately) and in principle supports any array that follows the [array API](https://data-apis.org/array-api/latest/). Currently there is "real" support (i.e. with tests and proper typing) for NumPy and JAX arrays. Note that JAX is an optional dependency, so if you are using the library you have to install it manually, but this is usually already the case (how else would you pass in a JAX array?). This implementation supports all functionality defined in the protocol.
65
+ 2. An implementation that is entirely backed by Narwhals LazyFrames, and evaluates all operations lazily. This is useful when you only want to express your operations in an array-style and don't need any special interop with array-based libraries, but either don't care about the way the values are stored, or already have your data in a DataFrame format supported by Narwhals and would like to keep all operations within that format. Using this implementation you can keep your data in the same backend it already is, and use whatever query engine that backend has to optimize large computations. This implementation is in early development and does not yet support all functionality.
66
+
67
+ The advantage of having multiple implementations is also that you can convert easily between them. It might make sense to first perform some processing fully in the relational context, so more of the computation can be efficiently handled by a single query engine, and then switch over to the hybrid model when some interop with array-libraries is important.
68
+
69
+ Ideas for future implementations are:
70
+
71
+ 1. An implementation that acts eagerly on arrays, but performs lazy indexing/reordering of values. This will make use of Narwhals lazy API for the labels, allowing to e.g. perform multiple slicing/reshaping operations without making excess copies of the array values. Since a lot of code has a "transformation → computation → transformation" flow, where the transformation steps are mostly reordering data but not changing the values, and the computation wants to keep the data in its most efficient shape, this handoff can be a natural transition point between the relational operations on the metadata and the array-based operations in the computational core.
72
+
73
+ ## Development guidelines
74
+
75
+ There are a few structural guidelines for the development of this library.
76
+
77
+ ### The labeled array protocol
78
+
79
+ The main data container is `FrameLabeledArray`, which is provided as a protocol. The idea is that users can write code against this protocol, and then this code can be run against various implementations of the protocol.
80
+
81
+ To be more flexible in code reuse vs specialization for efficiency, the various implementations don't inherit from a common ancestor. Instead they all implement the protocol, but are free to vary in implementation arbitrarily. As such the decision whether something should be in the protocol or not is quite significant, as it should generalize to all implementations, and it will also mean that users will write code against it. For those reasons stability of the protocol is quite important.
82
+
83
+ The protocol should mostly follow the [array API](https://data-apis.org/array-api/latest/), since the labeled array objects are arrays first. Additional functionality (such as `align` or `pivot`) is defined if it makes sense for the format. Some array API functionality is also extended, such as indexing with an expression (that will filter the labels) or reshaping along label dimensions instead of giving an explicit shape.
84
+
85
+ To support all kinds of backend libraries, the protocol is limited to only those functionalities that make sense in every paradigm. Particularly, all arrays are immutable, with the expectation that for high performance code an accelerator library is used. In addition, because the degree of "laziness" may vary between implementations, this cannot easily be made explicit in the protocol. Instead you should expect computations to be resolved as soon as the array values are converted to an eager form, for example when you call `array.values()`.
86
+
87
+ Because of the heavy reliance on protocols/structural typing, all library code should be fully typed.
88
+
89
+ ### Code structure
90
+
91
+ The structure of the repo is roughly as follows:
92
+
93
+ * The generic protocols are defined in the `protocols` subpackage.
94
+ * Implementations get their own subpackage, for example anything related to the eager implementation is stored in the `eager` subpackage.
95
+ * Generic operations defined for `FrameLabeledArray` (that either have a generic implementation or dispatch to more specialized ones based on type) are defined in the `operations` subpackage.
96
+ * Common operations are re-exported in the top-level `polder` package for ease of use.
97
+
98
+ ### Style
99
+
100
+ * All code is autoformatted using Ruff.
101
+ * Tests are written using pytest.
102
+ * Specific backend libraries should not be requirements for end-users, they should always be dynamically imported, with a few exceptions (currently `narwhals` and `numpy`).
103
+ * The recommended usage is `import polder as pld` (similar to other libraries).
104
+ * When writing comments, please use full sentences and always end with a period. Comments that are a single phrase don't need a period (think section header). Always put comments on a separate line, not after code.
@@ -0,0 +1,48 @@
1
+ [project]
2
+ name = "polder"
3
+ version = "0.1.0"
4
+ description = "Arrays with DataFrame axis labels"
5
+ readme = "README.md"
6
+ authors = [{ name = "Nardi Lam", email = "mail@nardilam.nl" }]
7
+ requires-python = ">=3.11"
8
+ dependencies = [
9
+ "array-api-compat==1.11.2",
10
+ "array-api-extra>=0.7.2",
11
+ "immutabledict>=4.3.1",
12
+ "narwhals>=2.18.1",
13
+ "numpy>=2.4.4",
14
+ "ordered-set>=4.1.0",
15
+ "types-array-api>=1.1.4",
16
+ "typing-extensions>=4.15.0",
17
+ ]
18
+
19
+ [build-system]
20
+ requires = ["uv_build>=0.11.2,<0.12"]
21
+ build-backend = "uv_build"
22
+
23
+ [dependency-groups]
24
+ docs = ["zensical>=0.0.46", "mkdocstrings-python>=2.0.5"]
25
+ dev = [
26
+ "polars>=1.39.3",
27
+ "pytest>=9.0.2",
28
+ "jax>=0.10.0",
29
+ "pytest-markdown-docs>=0.9.2",
30
+ { include-group = "docs" },
31
+ ]
32
+
33
+ [tool.ruff.format]
34
+ preview = true
35
+
36
+ [tool.ruff.lint]
37
+ select = ["E4", "E7", "E9", "F", "I"]
38
+ ignore = ["E731"]
39
+
40
+ [tool.pytest.ini_options]
41
+ # The docs pages are executable: every Python code block is run as a test by
42
+ # pytest-markdown-docs, so the examples stay reproducible.
43
+ addopts = [
44
+ "--import-mode=importlib",
45
+ "--markdown-docs",
46
+ "--markdown-docs-syntax=superfences",
47
+ ]
48
+ testpaths = ["tests", "docs"]
@@ -0,0 +1,110 @@
1
+ from polder import config
2
+ from polder.eager.array import EagerFrameLabeledArray
3
+ from polder.lazy.array import LazyFrameLabeledArray
4
+ from polder.operations.conversion import convert
5
+ from polder.operations.creation import from_frame, from_values_and_labels
6
+ from polder.operations.unary import (
7
+ abs_,
8
+ acos,
9
+ acosh,
10
+ asin,
11
+ asinh,
12
+ atan,
13
+ atanh,
14
+ bitwise_invert,
15
+ ceil,
16
+ conj,
17
+ cos,
18
+ cosh,
19
+ exp,
20
+ expm1,
21
+ floor,
22
+ imag,
23
+ invert,
24
+ isfinite,
25
+ isinf,
26
+ isnan,
27
+ log,
28
+ log1p,
29
+ log2,
30
+ log10,
31
+ logical_not,
32
+ neg,
33
+ pos,
34
+ real,
35
+ reciprocal,
36
+ round_,
37
+ sign,
38
+ signbit,
39
+ sin,
40
+ sinh,
41
+ sqrt,
42
+ square,
43
+ tan,
44
+ tanh,
45
+ trunc,
46
+ )
47
+ from polder.protocols.array import FrameLabeledArray
48
+ from polder.protocols.implementations import (
49
+ EAGER,
50
+ LAZY,
51
+ FrameLabeledArrayImplementation,
52
+ )
53
+
54
+ __all__ = [
55
+ # Config interface
56
+ "config",
57
+ # Implementations
58
+ "EagerFrameLabeledArray",
59
+ "LazyFrameLabeledArray",
60
+ # Creation operations
61
+ "from_frame",
62
+ "from_values_and_labels",
63
+ # Conversion operations
64
+ "convert",
65
+ # Unary operations
66
+ "pos",
67
+ "neg",
68
+ "abs_",
69
+ "invert",
70
+ "acos",
71
+ "acosh",
72
+ "asin",
73
+ "asinh",
74
+ "atan",
75
+ "atanh",
76
+ "bitwise_invert",
77
+ "ceil",
78
+ "conj",
79
+ "cos",
80
+ "cosh",
81
+ "exp",
82
+ "expm1",
83
+ "floor",
84
+ "imag",
85
+ "isfinite",
86
+ "isinf",
87
+ "isnan",
88
+ "log",
89
+ "log1p",
90
+ "log2",
91
+ "log10",
92
+ "logical_not",
93
+ "real",
94
+ "reciprocal",
95
+ "round_",
96
+ "sign",
97
+ "signbit",
98
+ "sin",
99
+ "sinh",
100
+ "square",
101
+ "sqrt",
102
+ "tan",
103
+ "tanh",
104
+ "trunc",
105
+ # Protocol-related objects
106
+ "FrameLabeledArray",
107
+ "EAGER",
108
+ "LAZY",
109
+ "FrameLabeledArrayImplementation",
110
+ ]
@@ -0,0 +1,134 @@
1
+ """Global configuration settings for polder."""
2
+
3
+ from contextlib import _GeneratorContextManager, contextmanager
4
+ from contextvars import ContextVar
5
+ from typing import Generator, overload
6
+
7
+ _auto_align: ContextVar[bool] = ContextVar("auto_align", default=True)
8
+ """When performing operations with eager arrays, alignment is performed
9
+ automatically if this setting is true."""
10
+
11
+ _use_eager_evaluation_for_lazy_arrays: ContextVar[bool] = ContextVar(
12
+ "use_eager_evaluation_for_lazy_arrays", default=False
13
+ )
14
+ """Use DataFrames instead of LazyFrames for lazy arrays. This can be useful for
15
+ testing purposes, because errors will surface more quickly and closer to where
16
+ they originate."""
17
+
18
+
19
+ @overload
20
+ def auto_align() -> bool: ...
21
+
22
+
23
+ @overload
24
+ def auto_align(enable: bool) -> _GeneratorContextManager[None, None, None]: ...
25
+
26
+
27
+ @overload
28
+ def use_eager_evaluation_for_lazy_arrays() -> bool: ...
29
+
30
+
31
+ @overload
32
+ def use_eager_evaluation_for_lazy_arrays(
33
+ enable: bool,
34
+ ) -> _GeneratorContextManager[None, None, None]: ...
35
+
36
+
37
+ def auto_align(
38
+ enable: bool | None = None,
39
+ ) -> bool | _GeneratorContextManager[None, None, None]:
40
+ """When performing operations with eager arrays, alignment is performed
41
+ automatically if this setting is true.
42
+
43
+ When called without arguments, returns the current auto_align value.
44
+ When called with an argument, returns a context manager during which the
45
+ setting has the provided value.
46
+
47
+ Args:
48
+ enable: Whether to enable or disable automatic alignment of arrays in binary
49
+ operations. When False, alignment is only checked but not performed. When
50
+ None (the default), the current value is returned instead.
51
+
52
+ Returns:
53
+ The current setting as a bool when enable is None, otherwise a context manager
54
+ during which the setting has the provided value.
55
+
56
+ Example:
57
+ ```python
58
+ import polder as pld
59
+
60
+ # Get current setting
61
+ current = pld.config.auto_align()
62
+
63
+ # Disable auto-alignment temporarily
64
+ with pld.config.auto_align(False):
65
+ result = arr1 + arr2 # Only checks if alignment is needed
66
+ ```
67
+ """
68
+ if enable is None:
69
+ # Get current value
70
+ return _auto_align.get()
71
+ else:
72
+ # Return context manager
73
+ @contextmanager
74
+ def _context_manager() -> Generator[None, None, None]:
75
+ token = _auto_align.set(enable)
76
+ try:
77
+ yield
78
+ finally:
79
+ _auto_align.reset(token)
80
+
81
+ return _context_manager()
82
+
83
+
84
+ def use_eager_evaluation_for_lazy_arrays(
85
+ enable: bool | None = None,
86
+ ) -> bool | _GeneratorContextManager[None, None, None]:
87
+ """Use DataFrames instead of LazyFrames for lazy arrays. This can be useful for
88
+ testing purposes, because errors will surface more quickly and closer to where
89
+ they originate.
90
+
91
+ Note that some errors will still only surface lazily. For example an
92
+ "invalid shape" error may only arise when the shape is extracted, not on the
93
+ operation that produces the invalid shape.
94
+
95
+ When called without arguments, returns the current setting value.
96
+ When called with an argument, returns a context manager during which the
97
+ setting has the provided value.
98
+
99
+ Args:
100
+ enable: Whether to enable eager evaluation for lazy arrays. When None (the
101
+ default), the current value is returned instead.
102
+
103
+ Returns:
104
+ The current setting as a bool when enable is None, otherwise a context manager
105
+ during which the setting has the provided value.
106
+
107
+ Example:
108
+ ```python
109
+ import polder as pld
110
+
111
+ with pld.config.use_eager_evaluation_for_lazy_arrays(True):
112
+ lazy_array = pld.from_values_and_labels(values, labels, implementation=LAZY)
113
+ ```
114
+ """
115
+ if enable is None:
116
+ # Get current value
117
+ return _use_eager_evaluation_for_lazy_arrays.get()
118
+ else:
119
+ # Return context manager
120
+ @contextmanager
121
+ def _context_manager() -> Generator[None, None, None]:
122
+ token = _use_eager_evaluation_for_lazy_arrays.set(enable)
123
+ try:
124
+ yield
125
+ finally:
126
+ _use_eager_evaluation_for_lazy_arrays.reset(token)
127
+
128
+ return _context_manager()
129
+
130
+
131
+ __all__ = [
132
+ "auto_align",
133
+ "use_eager_evaluation_for_lazy_arrays",
134
+ ]
File without changes
@@ -0,0 +1,26 @@
1
+ import narwhals as nw
2
+ import narwhals.typing as nwt
3
+
4
+
5
def narwhals_df_equals(l1: nwt.DataFrameT, l2: nwt.DataFrameT) -> bool:
    """Check whether two DataFrames are identical.

    Two frames count as equal only when they are of the same type and have the same
    columns and the same rows, with columns and rows appearing in the same order.

    Args:
        l1: The first DataFrame. It must not contain a column named "__index".
        l2: The DataFrame to compare against.

    Returns:
        True if both frames are equal in the sense described above, otherwise False.
    """
    # Cheap checks first: a mismatch in type, columns or row count settles it.
    if not (
        type(l1) is type(l2) and l1.columns == l2.columns and len(l1) == len(l2)
    ):
        return False

    # The row position is added as an extra join key, so that rows only match when
    # they also appear at the same position in both frames.
    index_name = "__index"
    assert index_name not in l1.columns
    expected_rows = len(l1)

    left = l1.with_row_index(index_name).lazy()
    right = l2.with_row_index(index_name).lazy()
    joined = left.join(right, on=[*l1.columns, index_name], how="full")

    # A full join keeps unmatched rows from both sides, so the result only has as
    # many rows as the inputs when every row found its counterpart. The index is
    # filled before counting, presumably so that rows coming only from the right
    # frame (whose left index is null) are counted as well.
    # NOTE(review): rows containing null values presumably never match in the join,
    # which would make such frames compare unequal even to themselves. Confirm.
    joined_rows = (
        joined.select(nw.col(index_name).fill_null(-1).count()).collect().item()
    )
    return expected_rows == joined_rows