typol 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 PDT PARTNERS
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
typol-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,11 @@
1
+ Metadata-Version: 2.4
2
+ Name: typol
3
+ Version: 0.0.1
4
+ Requires-Python: >=3.12
5
+ License-File: LICENSE.txt
6
+ Requires-Dist: more-itertools>=11.0.2
7
+ Requires-Dist: polars>=1.40.1
8
+ Requires-Dist: pytest>=9.0.3
9
+ Requires-Dist: ruff>=0.15.12
10
+ Requires-Dist: ty>=0.0.35
11
+ Dynamic: license-file
typol-0.0.1/README.md ADDED
@@ -0,0 +1,234 @@
1
+ # Typol
2
+
3
+ A typed wrapper around Polars, for statically enforcing shape types for dataframes. Get the speed and algebra of dataframes with the guarantees and maintainability of static typing. Follow how it works below, or scroll down to [see a full example](#full-example)
4
+
5
+ * Built around [Polars](https://github.com/pola-rs/polars/) – a thin layer to keep it simple and unsurprising; Polars docs will mostly apply outside of core type concepts
6
+ * Statically typed dataframes:
7
+ * Build confidence before running code: points out errors before they happen
8
+ * Allows tooling to guide you: language servers know what columns are available and what their types are, and therefore which operations are valid
9
+ * Keeping structure also makes it easier to dive into long-untouched code, and can enforce consistency across systems
10
+ * *[See full reasoning in our FAQs](#faqs)*
11
+ * [Ty](https://github.com/astral-sh/ty) type checking — built to take advantage of the latest features like intersection types
12
+
13
+
14
+ ### How it works
15
+
16
+ Define your `Shape`s with the same goal as when you'd define a `Schema` in Polars. Let's say you have a dataframe that has 4 columns:
17
+
18
+ ```python
19
+ class Account(tp.Shape):
20
+ name = tp.dimension(str)
21
+ website = tp.dimension(str)
22
+ account_age = tp.dimension(datetime.timedelta)
23
+ uid = tp.dimension(int)
24
+
25
+ accounts = tp.DataFrame(Account, [...]) # type: tp.DataFrame[Account]
26
+ ```
27
+
28
+ You can write well-typed expressions to operate on this data:
29
+
30
+ ```python
31
+ email_address = accounts.s.name.str.to_lowercase() + "@" + accounts.s.website # type: Expr[Account, Account, str]
32
+ ```
33
+
34
+ The above says it's an expression that requires an `Account` dataframe, can write to an `Account` dataframe, and results in a `str` value. If you make a type error it'll tell you:
35
+
36
+ ```python
37
+ # Unsupported `+` operation ty(unsupported-operator)
38
+ # example.py(_, 17): Has type `BoundDimension[Account, int]`
39
+ # example.py(_, 42): Has type `Literal["@"]`
40
+ email_address = accounts.s.uid + "@" + accounts.s.website
41
+ ```
42
+
43
+ because you're trying to add a string and an int!
44
+
45
+ You can create new dataframes with these transformations:
46
+
47
+ ```python
48
+ class Contact(tp.Shape):
49
+ uid = tp.dimension(int)
50
+ email = tp.dimension(str)
51
+
52
+ accounts.transform(Contact, email_address.to(Contact.email))
53
+ ```
54
+
55
+ but it will catch you if you don't put it in an appropriate column for the new shape:
56
+
57
+ ```python
58
+ # (1) Expected `Expr[Account, Contact, Any]`, found `Expr[Account, Account, str]`
59
+ accounts.transform(Contact, email_address)
60
+ # (2) Argument to bound method `Expr.to` is incorrect: Expected `BoundDimension[Contact, str]`, found `BoundDimension[Contact, int]`
61
+ accounts.transform(Contact, email_address.to(Contact.uid))
62
+ ```
63
+
64
+ since for (1) email address is still based off `Account.name`, and we haven't assigned it to something valid for `Contact`, and for (2) we can't assign a `str` to `Contact.uid`, which has to be an int.
65
+
66
+ You can also update columns in the same shape:
67
+
68
+ ```python
69
+ accounts.with_columns(accounts.s.name.str.to_lowercase())
70
+ ```
71
+
72
+ which will have lowercased names. If you change type, it won't be assignable to the same column:
73
+
74
+ ```python
75
+ # `.uid.cast_out(str)` can't be left in `.uid` if cast to a str
76
+ # Argument to bound method `DataFrame.with_columns` is incorrect:
77
+ # Expected `Expr[Account, Account, _] | BoundSeries[Account, _]`, found `Expr[Account, Never, str]`
78
+ accounts.with_columns(accounts.s.uid.cast_out(str))
79
+ ```
80
+
81
+ so you have to assign it using `.to` to one that makes sense:
82
+
83
+ ```python
84
+ # This works fine, because `str`s make sense for `.name`
85
+ accounts.with_columns(accounts.s.uid.cast_out(str).to(accounts.s.name))
86
+ ```
87
+ Need to filter out only interesting rows? No problem:
88
+
89
+ ```python
90
+ # Note: `accounts.s.website == "interesting.com"` has type `Expr[Account, _, bool]`
91
+ accounts.filter(accounts.s.website == "interesting.com")
92
+ ```
93
+
94
+ and it catches it if your filter makes no sense:
95
+
96
+ ```python
97
+ # Argument to bound method `DataFrame.filter` is incorrect: Expected `Expr[Account, _, bool]`, found `Expr[Account, _, str]`
98
+ accounts.filter(accounts.s.website + "interesting.com")
99
+ # Argument to bound method `DataFrame.filter` is incorrect: Expected `Expr[Account, _, bool]`, found `Expr[Contact, _, bool]`
100
+ accounts.filter(contacts.s.email.str.ends_with("interesting.com"))
101
+ ```
102
+
103
+ The first isn't a boolean expression to filter on (just a `str`), and the second tries to filter on a column of a different shape we don't have.
104
+
105
+ It even tracks combined shapes when you join data together!
106
+
107
+ ```python
108
+ full_data = accounts.join(contacts, accounts.s.uid.on(contacts.s.uid)) # type: tp.DataFrame[Account & Contact]
109
+ ```
110
+
111
+ And when you extract data back into Python types, it knows what it should be:
112
+
113
+ ```python
114
+ full_data[full_data.s.email].to_list() # type: list[str]
115
+ full_data[full_data.s.uid].to_list() # type: list[int]
116
+
117
+ combined_ages = datetime.timedelta()
118
+ for row in full_data.iter_rows():
119
+ combined_ages += row[full_data.s.account_age] # type: datetime.timedelta
120
+ ```
121
+
122
+ [See full example](#full-example)
123
+
124
+ As much of the above is statically enforced as is possible, giving much greater guarantees for
125
+ dataframe code. Where static enforcement is not possible, dynamic enforcement is used to ensure
126
+ the static types are always correct
127
+
128
+ ## FAQs
129
+
130
+ #### Why do you need static checking of dataframes?
131
+
132
+ Dataframe code can be difficult to maintain, as the shape is often quite implicit. Columns can be added and removed ad-hoc, with different sections of code having different expectations, and no way to enforce consistency statically
133
+
134
+ #### Why not use Pandera/other dynamic dataframe checking?
135
+
136
+ Existing tools, like Pandera, do provide dynamic verification of dataframe shapes. Whilst this can be good, it bites you at runtime which is well after a problem should be caught. It also doesn't provide any tooling benefit: types guide development by enabling autocomplete and the language server to direct you to what should and shouldn't work interactively. This hasn't been the direction the dataframes community has headed since it's data science focused, where the shapes are much more ad-hoc and transient, but fits well for code that is already well structured for dataclasses, like application level code
137
+
138
+ Simply, without static checking:
139
+ * there's no enforcement around dataframe expressions, only the shapes
140
+ * there's no enforcement going between dataframe and non-dataframe code
141
+ * you have to write enough tests to cover all the cases, since the shape enforcement requires the code to be run (such as in a test)
142
+ * your tooling doesn't help guide you as you develop
143
+
144
+ #### Why not just write code that uses dataclasses?
145
+
146
+ - Pure Python dataclass code isn't particularly performant, so isn't fast with large amounts of data
147
+ - Dataframes provide a whole algebra to deal with a lot of common transformations: joins, aggregations, pivots, etc. that you get for free
148
+
149
+ #### Why not write code in a faster statically typed language?
150
+
151
+ - If you have an existing codebase in Python, converting some parts to use Polars requires little activation energy; existing logic can play with it well enough
152
+ - Dataframes in Python are pretty fast, and Python can act as a quick-to-develop-in glue language for the underlying efficient logic
153
+ - Dataframes might be the right solution regardless: they provide more than just speed—they provide the right algebraic primitives for aggregating, combining, and transforming data
154
+
155
+ #### Why Polars?
156
+
157
+ Static typing only works if the underlying data's shape is immutable, otherwise the type would no longer match the shape. Operations on the shape creating new shapes is the way to go, rather than mutating types/shapes in place, and this is the approach that Polars is designed for. This allows us to make the library a light layer on top of Polars, rather than a significant implementation in its own right
158
+
159
+ #### How is this meant to work for data science where shapes are so ad-hoc?
160
+
161
+ In short, it's not: if Polars DataFrames are `dict`s, then Typol's are `TypedDict`s. If you have messy shapes, you don't need this, and you should stay in Polars (just like you should use `dict`s for variable keys). It's easy to go back and forth between Typol and Polars since it's a simple wrapper, so if you have some flexibly shaped code and some that's more rigidly shaped, use Typol and direct Polars appropriately and don't be scared of half-and-halfing it
162
+
163
+ #### Why Ty?
164
+
165
+ Ty supports intersection types, which make writing joins a lot less involved: it can construct the joint shape on the fly for you. If other type checkers start supporting this then there's no fundamental reason it can't work with them too.
166
+
167
+
168
+ ### Full example
169
+
170
+
171
+ ```python
172
+ import typol as tp
173
+
174
+ class Account(tp.Shape):
175
+ name = tp.dimension(str)
176
+ website = tp.dimension(str)
177
+ account_age = tp.dimension(tp.UINT_8)
178
+ phone = tp.dimension(str)
179
+
180
+
181
+ class Contact(tp.Shape):
182
+ email = tp.dimension(str)
183
+ known_since = tp.dimension(tp.UINT_16)
184
+ phone = tp.dimension(str)
185
+
186
+ # Let's say I have some account data
187
+ accounts = tp.DataFrame(Account, ...)
188
+ # Maybe a year has gone past
189
+ accounts.with_columns(
190
+ # This is type checked so the `+` operator must be on a number, and the used and produced
191
+ # dimensions must all be in `Account`
192
+ accounts.s.account_age + 1
193
+ )
194
+
195
+ # Let's create contacts out of the name and the website
196
+ contacts = accounts.transform(
197
+ Contact,
198
+ # This operation must only use dimensions that are available in `Account`, and must end up at
199
+ # a `Contact` dimension. All expression types are also checked to be `str`. These are all static checks
200
+ (accounts.s.name.str.to_lowercase() + "@" + accounts.s.website).to(Contact.email),
201
+ # Similar to the above, except with `int`s
202
+ (tp.lit(2026) - accounts.s.account_age).to(Contact.known_since),
203
+ # `phone` is in both shapes so we can leave it alone
204
+ )
205
+
206
+ emails = contacts[Contact.email].to_list()
207
+ reveal_type(emails) # list[str], Contact.known_since would reveal to `list[int]`
208
+ print("All emails found:", emails)
209
+
210
+ class PhoneAddress(tp.Shape):
211
+ number = tp.dimension(str)
212
+ street = tp.dimension(str)
213
+
214
+ # We have some data about the street addresses of phone lines
215
+ phone_addresses = tp.DataFrame(PhoneAddress, ...)
216
+
217
+ # Now let's join our contacts with the phone address data to work out what home addresses we already know
218
+ full_details = contacts.join(
219
+ phone_addresses,
220
+ contacts.s.phone.on(phone_addresses.s.number)
221
+ )
222
+ reveal_type(full_details) # tp.DataFrame[Contact & PhoneAddress]
223
+
224
+ reveal_type(full_details.s.street.is_null()) # Expr[PhoneAddress, _, bool]
225
+
226
+ # We still need to ask some of our friends for their home address so we can send out RSVPs
227
+ still_need_to_ask_for_address = full_details.filter(full_details.s.street.is_null())[full_details.s.email]
228
+ reveal_type(still_need_to_ask_for_address) # tp.Series[str]
229
+
230
+ # Send out an email asking if they can let us know where to send the RSVPs
231
+ send_email(still_need_to_ask_for_address.to_list(), "Send me your mailing address for birthday RSVPs!")
232
+ ```
233
+
234
+ More examples and snippets are available [in the `tests`](./tests)
@@ -0,0 +1,110 @@
1
+
2
+ [project]
3
+ name = "typol"
4
+ version = "0.0.1"
5
+ requires-python = ">=3.12"
6
+ dependencies = [
7
+ "more-itertools>=11.0.2",
8
+ "polars>=1.40.1",
9
+ "pytest>=9.0.3",
10
+ "ruff>=0.15.12",
11
+ "ty>=0.0.35",
12
+ ]
13
+
14
+ [tool.pytest.ini_options]
15
+ addopts = "--log-level=INFO -vvv"
16
+ testpaths = ["tests"]
17
+ filterwarnings = "ignore"
18
+
19
+ [tool.ty.rules]
20
+ unused-ignore-comment = "error"
21
+
22
+ [tool.ruff]
23
+ line-length = 100
24
+ target-version = "py312"
25
+
26
+ src = ["."]
27
+
28
+ # Enable explicit preview rules if we want to opt-in to them
29
+ lint.preview = true
30
+ lint.explicit-preview-rules = true
31
+
32
+ lint.select = [
33
+ "ANN",
34
+ "B",
35
+ "C4",
36
+ "D", # pydocstyle, configured with convention = "numpy" below
37
+ "E",
38
+ "F",
39
+ "FA",
40
+ "FBT003",
41
+ "FURB",
42
+ "G",
43
+ "I",
44
+ "ICN",
45
+ "LOG",
46
+ "N",
47
+ "PERF",
48
+ "PIE",
49
+ "PLC",
50
+ "PLE",
51
+ "PLR",
52
+ "PLW",
53
+ "PTH",
54
+ "PYI",
55
+ "RSE",
56
+ "RUF",
57
+ "SIM",
58
+ "T10",
59
+ "UP",
60
+ "ISC004", # implicit-string-concatenation-in-collection-literal, these are particularly bad since newlines can look like different collection elements
61
+ "RET501", # unnecessary-return-none, just use `return` if only `None` is possible (void functions), note other RET rules can conflict with mypy unreachability
62
+ "RET502", # implicit-return-value, use `return None` if other return values are possible (non-void functions)
63
+ "RET504", # unnecessary-assign, don't do `x = f(...); return x`, just do `return f(...)`
64
+ "TD002", # missing-todo-author (strict only), make it easy to find things you forgot to do
65
+ "TID251", # banned-api, allows us to declare banned imports below
66
+ "TID253", # banned-module-level-imports (strict only), imports only banned at top level in new code to force transition
67
+ "W605", # invalid-escape-sequence, use raw strings or \\
68
+ ]
69
+ # Resetting select seems to reset ignore
70
+ lint.ignore = [
71
+ "B905", # zip-without-explicit-strict, these are often useful, e.g. zip(items, items[1:])
72
+ "E741", # ambiguous-variable-name
73
+ "E501", # line-too-long, rely on ruff to format these
74
+ "PYI025", # unaliased-collections-abc-set-import, we haven't used `Set` for `set`, this rule seems to be for more legacy users, and just causes us to use an old name `AbstractSet` (and not `AbstractMapping` etc)
75
+ "PYI041", # redundant-numeric-union, if we have special logic for ints this can cause issues, I'm not sure we need to rely on type promotions
76
+ "PLR09", # too-many-arguments, too-many-statements, etc., it's not clear we want to enforce such stringent rewriting given other design choices
77
+ "RUF021", # parenthesize-chained-operators, this can lead to excessive brackets which reformats into excessive lines
78
+ "SIM118", # in-dict-keys, isn't good about handling non-dicts
79
+ "UP040", # non-pep695-type-alias, PEP695 type aliases are not equivalent, they can't be used for isinstance checks or for static/class methods
80
+ "UP046", # non-pep695-generic-class, PEP695 compatible class ...[T] only allows for autovariance, which doesn't handle some cases well
81
+ # pydocstyle (D) ignores — aligned with omitted numpydoc_validation checks below
82
+ "D1", # undocumented-*, a goal but we're not there yet
83
+ "D205", # blank-line-after-summary, not reliably auto-fixable
84
+ "D400", # ends-in-period (SS03)
85
+ "D401", # imperative-mood
86
+ "D404", # docstring-starts-with-this, not checked by numpydoc_validation
87
+ ]
88
+
89
+ format.skip-magic-trailing-comma = true
90
+
91
+
92
+ [tool.ruff.lint.flake8-tidy-imports]
93
+ banned-module-level-imports = ["ty_extensions"]
94
+
95
+ [tool.ruff.lint.flake8-tidy-imports.banned-api]
96
+ "path".msg = "Use `pathlib` instead."
97
+ "pytz".msg = "Use built-in `zoneinfo` instead."
98
+
99
+ [tool.ruff.lint.flake8-comprehensions]
100
+ allow-dict-calls-with-keyword-arguments = true
101
+
102
+ [tool.ruff.lint.flake8-import-conventions.aliases]
103
+ "typol" = "tp"
104
+ "polars" = "pl"
105
+
106
+ [tool.ruff.lint.isort]
107
+ split-on-trailing-comma = false # Ruff wants this set to align with format.skip-magic-trailing-comma
108
+
109
+ [tool.ruff.lint.pydocstyle]
110
+ convention = "numpy"
typol-0.0.1/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,104 @@
1
+ import math
2
+ from typing import TYPE_CHECKING, Final
3
+
4
+ import typol as tp
5
+
6
+ if TYPE_CHECKING:
7
+ pass
8
+
9
+
10
class Int(tp.Shape):
    # Single-dimension shape used by the tests in this module: `value` is declared as `int`.
    value = tp.dimension(int)
12
+
13
+
14
class Float(tp.Shape):
    # Single-dimension shape used by the tests in this module: `value` is declared as
    # `tp.FLOAT_64`.
    value = tp.dimension(tp.FLOAT_64)
16
+
17
+
18
class String(tp.Shape):
    # Single-dimension shape used by the tests in this module: `value` is declared as `str`.
    value = tp.dimension(str)
20
+
21
+
22
+ def _single_col_df[S: tp.Shape, T](
23
+ dimension: tp.BoundDimension[S, T], *values: T | None
24
+ ) -> tp.DataFrame[S]:
25
+ return tp.DataFrame(
26
+ dimension.shape, [tp.Entry.of(dimension.set_or_null(v)) for v in values]
27
+ )
28
+
29
+
30
# Shared fixture frames, one per shape above; the tests below filter or transform them and
# compare the result against a freshly built expected frame.
_INTS: Final = _single_col_df(Int.value, 1, 5, 2, 4, 3)
_FLOATS: Final = _single_col_df(Float.value, 1.2, 4.8, 2.5, 4.0, math.pi)
_STRS: Final = _single_col_df(String.value, "spam", "eggs", "foo", "bar")
33
+
34
+
35
def test_comparsions() -> None:
    """Filter each fixture frame with gt/lt/ge/le/eq and compare against the expected rows."""
    # Integers, compared against 3
    ints_kept = _INTS.filter(Int.value.gt(3))
    assert ints_kept.equals(_single_col_df(Int.value, 5, 4))
    ints_kept = _INTS.filter(Int.value.lt(3))
    assert ints_kept.equals(_single_col_df(Int.value, 1, 2))
    ints_kept = _INTS.filter(Int.value.ge(3))
    assert ints_kept.equals(_single_col_df(Int.value, 5, 4, 3))
    ints_kept = _INTS.filter(Int.value.le(3))
    assert ints_kept.equals(_single_col_df(Int.value, 1, 2, 3))
    ints_kept = _INTS.filter(Int.value.eq(3))
    assert ints_kept.equals(_single_col_df(Int.value, 3))

    # Floats, compared against math.pi (which is itself one of the fixture values)
    floats_kept = _FLOATS.filter(Float.value.gt(math.pi))
    assert floats_kept.equals(_single_col_df(Float.value, 4.8, 4.0))
    floats_kept = _FLOATS.filter(Float.value.lt(math.pi))
    assert floats_kept.equals(_single_col_df(Float.value, 1.2, 2.5))
    floats_kept = _FLOATS.filter(Float.value.ge(math.pi))
    assert floats_kept.equals(_single_col_df(Float.value, 4.8, 4.0, math.pi))
    floats_kept = _FLOATS.filter(Float.value.le(math.pi))
    assert floats_kept.equals(_single_col_df(Float.value, 1.2, 2.5, math.pi))
    floats_kept = _FLOATS.filter(Float.value.eq(math.pi))
    assert floats_kept.equals(_single_col_df(Float.value, math.pi))

    # Strings, compared against "foo"
    strs_kept = _STRS.filter(String.value.gt("foo"))
    assert strs_kept.equals(_single_col_df(String.value, "spam"))
    strs_kept = _STRS.filter(String.value.lt("foo"))
    assert strs_kept.equals(_single_col_df(String.value, "eggs", "bar"))
    strs_kept = _STRS.filter(String.value.ge("foo"))
    assert strs_kept.equals(_single_col_df(String.value, "spam", "foo"))
    strs_kept = _STRS.filter(String.value.le("foo"))
    assert strs_kept.equals(_single_col_df(String.value, "eggs", "foo", "bar"))
    strs_kept = _STRS.filter(String.value.eq("foo"))
    assert strs_kept.equals(_single_col_df(String.value, "foo"))
73
+
74
+
75
def test_arithmetic() -> None:
    """Apply operator expressions to each fixture column via ``with_columns`` and compare."""
    scaled_ints = _INTS.with_columns(Int.value * 2 + 5)
    assert scaled_ints.equals(_single_col_df(Int.value, 7, 15, 9, 13, 11))

    halved_squares = _FLOATS.with_columns(Float.value**2 / 2)
    assert halved_squares.equals(
        _single_col_df(Float.value, 0.72, 11.52, 3.125, 8.0, math.pi**2 / 2)
    )

    suffixed = _STRS.with_columns(String.value + "!")
    assert suffixed.equals(_single_col_df(String.value, "spam!", "eggs!", "foo!", "bar!"))
85
+
86
+
87
def test_lit() -> None:
    """Combine ``tp.lit`` literals with column expressions and write them back with ``.to``."""
    shifted = _INTS.with_columns((tp.lit(5) + Int.value * 2).to(Int.value))
    assert shifted.equals(_single_col_df(Int.value, 7, 15, 9, 13, 11))

    # The literal text is deliberately "value", the same as the column's name: the expected
    # rows show it must be used as a plain string, not resolved as a column reference.
    prefixed = _STRS.with_columns((tp.lit("value") + String.value).to(String.value))
    assert prefixed.equals(
        _single_col_df(String.value, "valuespam", "valueeggs", "valuefoo", "valuebar")
    )
95
+
96
+
97
def test_when() -> None:
    """Exercise ``tp.when(...).then(...)`` both with and without an ``otherwise`` branch."""
    # NOTE(review): the expected ints imply `5 / 2` ends up as 2 in the int column — confirm
    # whether that comes from floor division or from a cast back to int.
    branched = _INTS.with_columns(
        tp.when(Int.value.gt(3)).then(Int.value / 2).otherwise(Int.value + 1)
    )
    assert branched.equals(_single_col_df(Int.value, 2, 2, 3, 2, 4))

    # No `otherwise` branch here: the expected frame holds `None` for the strings that fail
    # the length condition ("foo" and "bar").
    long_only = _STRS.with_columns(
        tp.when(String.value.str.len_chars().gt(3)).then(String.value)
    )
    assert long_only.equals(_single_col_df(String.value, "spam", "eggs", None, None))