PyPI - typol - Versions diffs - 0.0.1__tar.gz - Mend

typol 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

typol-0.0.1/LICENSE.txt +21 -0
typol-0.0.1/PKG-INFO +11 -0
typol-0.0.1/README.md +234 -0
typol-0.0.1/pyproject.toml +110 -0
typol-0.0.1/setup.cfg +4 -0
typol-0.0.1/tests/test_expr.py +104 -0
typol-0.0.1/tests/test_frame.py +322 -0
typol-0.0.1/typol/__init__.py +179 -0
typol-0.0.1/typol/expr.py +1745 -0
typol-0.0.1/typol/frame.py +547 -0
typol-0.0.1/typol/lazy.py +429 -0
typol-0.0.1/typol/row.py +164 -0
typol-0.0.1/typol/series.py +243 -0
typol-0.0.1/typol/types.py +210 -0
typol-0.0.1/typol.egg-info/PKG-INFO +11 -0
typol-0.0.1/typol.egg-info/SOURCES.txt +17 -0
typol-0.0.1/typol.egg-info/dependency_links.txt +1 -0
typol-0.0.1/typol.egg-info/requires.txt +5 -0
typol-0.0.1/typol.egg-info/top_level.txt +1 -0

typol-0.0.1/LICENSE.txt ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 PDT PARTNERS
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

typol-0.0.1/PKG-INFO ADDED Viewed

@@ -0,0 +1,11 @@
+Metadata-Version: 2.4
+Name: typol
+Version: 0.0.1
+Requires-Python: >=3.12
+License-File: LICENSE.txt
+Requires-Dist: more-itertools>=11.0.2
+Requires-Dist: polars>=1.40.1
+Requires-Dist: pytest>=9.0.3
+Requires-Dist: ruff>=0.15.12
+Requires-Dist: ty>=0.0.35
+Dynamic: license-file

typol-0.0.1/README.md ADDED Viewed

@@ -0,0 +1,234 @@
+# Typol
+A typed wrapper around Polars, for statically enforcing shape types for dataframes. Get the speed and algebra of dataframes with the guarantees and maintainability of static typing. Follow how it works below, or scroll down to [see a full example](#full-example)
+* Built around [Polars](https://github.com/pola-rs/polars/) – a thin layer to keep it simple and unsurprising; Polars docs will mostly apply outside of core type concepts
+* Statically typed dataframes:
+  * Build confidence before running code: points out errors before they happen
+  * Allows tooling to guide you: language servers know what columns are available and what their types are, so what operations can happen
+  * Keeping structure also makes it easier to dive into long-untouched code, and can enforce consistency across systems
+  * *[See full reasoning in our FAQs](#faqs)*
+* [Ty](https://github.com/astral-sh/ty) type checking — built to take advantage of the latest features like intersection types
+### How it works
+Define your `Shape`s with the same goal as when you'd define a `Schema` in Polars. Let's say you have a dataframe that has 4 columns:
+```python
+class Account(tp.Shape):
+    name = tp.dimension(str)
+    website = tp.dimension(str)
+    account_age = tp.dimension(datetime.timedelta)
+    uid = tp.dimension(int)
+accounts = tp.DataFrame(Account, [...])  # type: tp.DataFrame[Account]
+```
+You can write well-typed expressions to operate on this data:
+```python
+email_address = accounts.s.name.str.to_lowercase() + "@" + accounts.s.website  # type: Expr[Account, Account, str]
+```
+The above says it's an expression that requires an `Account` dataframe, can write to an `Account` dataframe, and results in a `str` value. If you make a type error it'll tell you:
+```python
+# Unsupported `+` operation ty(unsupported-operator)
+# example.py(_, 17): Has type `BoundDimension[Account, int]`
+# example.py(_, 42): Has type `Literal["@"]`
+email_address = accounts.s.uid + "@" + accounts.s.website
+```
+because you're trying to add a string and an int!
+You can create new dataframes with these transformations:
+```python
+class Contact(tp.Shape):
+    uid = tp.dimension(int)
+    email = tp.dimension(str)
+accounts.transform(Contact, email_address.to(Contact.email))
+```
+but it will catch you if you don't put it in an appropriate column for the new shape:
+```python
+# (1) Expected `Expr[Account, Contact, Any]`, found `Expr[Account, Account, str]`
+accounts.transform(Contact, email_address)
+# (2) Argument to bound method `Expr.to` is incorrect: Expected `BoundDimension[Contact, str]`, found `BoundDimension[Contact, int]`
+accounts.transform(Contact, email_address.to(Contact.uid))
+```
+since for (1) email address is still based off `Account.name`, and we haven't assigned it to something valid for `Contact`, and for (2) we can't assign a `str` to `Contact.uid`, which has to be an int.
+You can also update columns in the same shape:
+```python
+accounts.with_columns(accounts.s.name.str.to_lowercase())
+```
+which will have lowercased names. If you change type, it won't be assignable to the same column:
+```python
+# `.uid.cast_out(str)` can't be left in `.uid` if cast to a str
+# Argument to bound method `DataFrame.with_columns` is incorrect:
+#  Expected `Expr[Account, Account, _] | BoundSeries[Account, _]`, found `Expr[Account, Never, str]`
+accounts.with_columns(accounts.s.uid.cast_out(str))
+```
+so you have to assign it using `.to` to one that makes sense:
+```python
+# This works fine, because `str`s make sense for `.name`
+accounts.with_columns(accounts.s.uid.cast_out(str).to(accounts.s.name))
+```
+Need to filter out only interesting rows? No problem:
+```python
+# Note: `accounts.s.website == "interesting.com"` has type `Expr[Account, _, bool]`
+accounts.filter(accounts.s.website == "interesting.com")
+```
+and it catches it if your filter makes no sense:
+```python
+# Argument to bound method `DataFrame.filter` is incorrect: Expected `Expr[Account, _, bool]`, found `Expr[Account, _, str]`
+accounts.filter(accounts.s.website + "interesting.com")
+# Argument to bound method `DataFrame.filter` is incorrect: Expected `Expr[Account, _, bool]`, found `Expr[Contact, _, bool]`
+accounts.filter(contacts.s.email.str.ends_with("interesting.com"))
+```
+The first isn't a boolean expression to filter on (just a `str`), and the second tries to filter on a column of a different shape we don't have.
+It even tracks combined shapes when you join data together!
+```python
+full_data = accounts.join(contacts, accounts.s.uid.on(contacts.s.uid))  # type: tp.DataFrame[Account & Contact]
+```
+And when you extract data back into Python types, it knows what it should be:
+```python
+full_data[full_data.s.email].to_list()  # type: list[str]
+full_data[full_data.s.uid].to_list()  # type: list[int]
+combined_ages = datetime.timedelta()
+for row in full_data.iter_rows():
+    combined_ages += row[full_data.s.account_age]  # type: datetime.timedelta
+```
+[See full example](#full-example)
+As much of the above is statically enforced as is possible, giving much greater guarantees for
+dataframe code. Where static enforcement is not possible, dynamic enforcement is used to ensure
+the static types are always correct
+## FAQs
+#### Why do you need static checking of dataframes?
+Dataframe code can be difficult to maintain, as the shape is often quite implicit. Columns can be added and removed ad-hoc, with different sections of code having different expectations, and no way to enforce consistency statically
+#### Why not use Pandera/other dynamic dataframe checking?
+Existing tools, like Pandera, do provide dynamic verification of dataframe shapes. Whilst this can be good, it bites you at runtime which is well after a problem should be caught. It also doesn't provide any tooling benefit: types guide development by enabling autocomplete and the language server to direct you to what should and shouldn't work interactively. This hasn't been the direction the dataframes community has headed since it's data science focused, where the shapes are much more ad-hoc and transient, but fits well for code that is already well structured for dataclasses, like application level code
+Simply, without static checking:
+  * there's no enforcement around dataframe expressions, only the shapes
+  * there's no enforcement going between dataframe and non-dataframe code
+  * you have to write enough tests to cover all the cases, since the shape enforcement requires the code to be run (such as in a test)
+  * your tooling doesn't help guide you as you develop
+#### Why not just write code that uses dataclasses?
+- Pure Python dataclass code isn't particularly performant, so isn't fast with large amounts of data
+- Dataframes provide a whole algebra to deal with a lot of common transformations: joins, aggregations, pivots, etc. that you get for free
+#### Why not write code in a faster statically typed language?
+- If you have an existing codebase in Python, converting some parts to use Polars requires little activation energy; existing logic can play with it well enough
+- Dataframes in Python are pretty fast, and Python can act as a quick-to-develop-in glue language for the underlying efficient logic
+- Dataframes might be the right solution regardless: they provide more than just speed—they provide the right algebraic primitives for aggregating, combining, and transforming data
+#### Why Polars?
+Static typing only works if the underlying data's shape is immutable, otherwise the type would no longer match the shape. Operations on the shape creating new shapes is the way to go, rather than mutating types/shapes in place, and this is the approach that Polars is designed for. This allows us to make the library a light layer on top of Polars, rather than a significant implementation in its own right
+#### How is this meant to work for data science where shapes are so ad-hoc?
+In short, it's not: if Polars DataFrames are `dict`s, then Typol's are `TypedDict`s. If you have messy shapes, you don't need this, and you should stay in Polars (just like you should use `dict`s for variable keys). It's easy to go back and forth between Typol and Polars since it's a simple wrapper, so if you have some flexibly shaped code and some that's more rigidly shaped, use Typol and direct Polars appropriately and don't be scared of half-and-halfing it
+#### Why Ty?
+Ty supports intersection types, which make writing joins a lot less involved: it can construct the joint shape on the fly for you. If other type checkers start supporting this then there's no fundamental reason it can't work with them too.
+### Full example
+```python
+import typol as tp
+class Account(tp.Shape):
+    name = tp.dimension(str)
+    website = tp.dimension(str)
+    account_age = tp.dimension(tp.UINT_8)
+    phone = tp.dimension(str)
+class Contact(tp.Shape):
+    email = tp.dimension(str)
+    known_since = tp.dimension(tp.UINT_16)
+    phone = tp.dimension(str)
+# Let's say I have some account data
+accounts = tp.DataFrame(Account, ...)
+# Maybe a year has gone past
+accounts.with_columns(
+    # This is type checked so the `+` operator must be on a number, and the used and produced
+    # dimensions must all be in `Account`
+    accounts.s.account_age + 1
+)
+# Let's create contacts out of the name and the website
+contacts = accounts.transform(
+    Contact,
+    # This operation must only use dimensions that are available in `Account`, and must end up at
+    # a `Contact` dimension. All expression types are also checked to be `str`. These are all static checks
+    (accounts.s.name.str.to_lowercase() + "@" + accounts.s.website).to(Contact.email),
+    # Similar to the above, except with `int`s
+    (tp.lit(2026) - accounts.s.account_age).to(Contact.known_since),
+    # `phone` is in both shapes so we can leave it alone
+)
+emails = contacts[Contact.email].to_list()
+reveal_type(emails)  # list[str], Contact.known_since would reveal to `list[int]`
+print("All emails found:", emails)
+class PhoneAddress(tp.Shape):
+    number = tp.dimension(str)
+    street = tp.dimension(str)
+# We have some data about the street addresses of phone lines
+phone_addresses = tp.DataFrame(PhoneAddress, ...)
+# Now let's join our contacts with the phone address data to work out what home addresses we already know
+full_details = contacts.join(
+    phone_addresses,
+    contacts.s.phone.on(phone_addresses.s.number)
+)
+reveal_type(full_details)  # tp.DataFrame[Contact & PhoneAddress]
+reveal_type(full_details.s.street.is_null())  # Expr[PhoneAddress, _, bool]
+# We still need to ask some of our friends for their home address so we can send out RSVPs
+still_need_to_ask_for_address = full_details.filter(full_details.s.street.is_null())[full_details.s.email]
+reveal_type(still_need_to_ask_for_address)  # tp.Series[str]
+# Send out an email asking if they can let us know where to send the RSVPs
+send_email(still_need_to_ask_for_address.to_list(), "Send me your mailing address for birthday RSVPs!")
+```
+More examples and snippets are available [in the `tests`](./tests)

typol-0.0.1/pyproject.toml ADDED Viewed

@@ -0,0 +1,110 @@
+[project]
+name = "typol"
+version = "0.0.1"
+requires-python = ">=3.12"
+dependencies = [
+    "more-itertools>=11.0.2",
+    "polars>=1.40.1",
+    "pytest>=9.0.3",
+    "ruff>=0.15.12",
+    "ty>=0.0.35",
+]
+[tool.pytest.ini_options]
+addopts = "--log-level=INFO -vvv"
+testpaths = ["tests"]
+filterwarnings = "ignore"
+[tool.ty.rules]
+unused-ignore-comment = "error"
+[tool.ruff]
+line-length = 100
+target-version = "py312"
+src = ["."]
+# Enable explicit preview rules if we want to opt-in to them
+lint.preview = true
+lint.explicit-preview-rules = true
+lint.select = [
+    "ANN",
+    "B",
+    "C4",
+    "D",     # pydocstyle, configured with convention = "numpy" below
+    "E",
+    "F",
+    "FA",
+    "FBT003",
+    "FURB",
+    "G",
+    "I",
+    "ICN",
+    "LOG",
+    "N",
+    "PERF",
+    "PIE",
+    "PLC",
+    "PLE",
+    "PLR",
+    "PLW",
+    "PTH",
+    "PYI",
+    "RSE",
+    "RUF",
+    "SIM",
+    "T10",
+    "UP",
+    "ISC004",  # implicit-string-concatenation-in-collection-literal, these are particularly bad since newlines can look like different collection elements
+    "RET501",  # unnecessary-return-none, just use `return` if only `None` is possible (void functions), note other RET rules can conflict with mypy unreachability
+    "RET502",  # implicit-return-value, use `return None` if other return values are possible (non-void functions)
+    "RET504",  # unnecessary-assign, don't do `x = f(...); return x`, just do `return f(...)`
+    "TD002",   # missing-todo-author (strict only), make it easy to find things you forgot to do
+    "TID251",  # banned-api, allows us to declare banned imports below
+    "TID253",  # banned-module-level-imports (strict only), imports only banned at top level in new code to force transition
+    "W605",    # invalid-escape-sequence, use raw strings or \\
+]
+# Resetting select seems to reset ignore
+lint.ignore = [
+    "B905",   # zip-without-explicit-strict, these are often useful, e.g. zip(items, items[1:])
+    "E741",   # ambiguous-variable-name
+    "E501",   # line-too-long, rely on ruff to format these
+    "PYI025", # unaliased-collections-abc-set-import, we haven't used `Set` for `set`, this rule seems to be for more legacy users, and just causes us to use an old name `AbstractSet` (and not `AbstractMapping` etc)
+    "PYI041", # redundant-numeric-union, if we have special logic for ints this can cause issues, I'm not sure we need to rely on type promotions
+    "PLR09",  # too-many-arguments, too-many-statements, etc., it's not clear we want to enforce such stringent rewriting given other design choices
+    "RUF021", # parenthesize-chained-operators, this can lead to excessive brackets which reformats into excessive lines
+    "SIM118", # in-dict-keys, isn't good about handling non-dicts
+    "UP040",  # non-pep695-type-alias, PEP695 type aliases are not equivalent, they can't be used for isinstance checks or for static/class methods
+    "UP046",  # non-pep695-generic-class, PEP695 compatible class ...[T] only allows for autovariance, which doesn't handle some cases well
+    # pydocstyle (D) ignores — aligned with omitted numpydoc_validation checks below
+    "D1",     # undocumented-*, a goal but we're not there yet
+    "D205",   # blank-line-after-summary, not reliably auto-fixable
+    "D400",   # ends-in-period (SS03)
+    "D401",   # imperative-mood
+    "D404",   # docstring-starts-with-this, not checked by numpydoc_validation
+]
+format.skip-magic-trailing-comma = true
+[tool.ruff.lint.flake8-tidy-imports]
+banned-module-level-imports = ["ty_extensions"]
+[tool.ruff.lint.flake8-tidy-imports.banned-api]
+"path".msg = "Use `pathlib` instead."
+"pytz".msg = "Use built-in `zoneinfo` instead."
+[tool.ruff.lint.flake8-comprehensions]
+allow-dict-calls-with-keyword-arguments = true
+[tool.ruff.lint.flake8-import-conventions.aliases]
+"typol" = "tp"
+"polars" = "pl"
+[tool.ruff.lint.isort]
+split-on-trailing-comma = false  # Ruff wants this set to align with format.skip-magic-trailing-comma
+[tool.ruff.lint.pydocstyle]
+convention = "numpy"

typol-0.0.1/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

typol-0.0.1/tests/test_expr.py ADDED Viewed

@@ -0,0 +1,104 @@
+import math
+from typing import TYPE_CHECKING, Final
+import typol as tp
+if TYPE_CHECKING:
+    pass
+class Int(tp.Shape):  # Test shape with a single dimension named `value`.
+    value = tp.dimension(int)  # Declared with the builtin `int` type.
+class Float(tp.Shape):  # Test shape with a single dimension named `value`.
+    value = tp.dimension(tp.FLOAT_64)  # Declared with `tp.FLOAT_64` rather than the builtin `float`.
+class String(tp.Shape):  # Test shape with a single dimension named `value`.
+    value = tp.dimension(str)  # Declared with the builtin `str` type.
+def _single_col_df[S: tp.Shape, T](  # Test helper: build a `tp.DataFrame` of `dimension.shape` from the given values.
+    dimension: tp.BoundDimension[S, T], *values: T | None  # Each value may be `None`; it is passed to `set_or_null`.
+) -> tp.DataFrame[S]:
+    return tp.DataFrame(
+        dimension.shape, [tp.Entry.of(dimension.set_or_null(v)) for v in values]  # One `tp.Entry` per value, in argument order.
+    )
+_INTS: Final = _single_col_df(Int.value, 1, 5, 2, 4, 3)  # Shared fixture: five unsorted ints, 1 through 5.
+_FLOATS: Final = _single_col_df(Float.value, 1.2, 4.8, 2.5, 4.0, math.pi)  # Shared fixture: five floats; `math.pi` is the pivot used by the comparison tests.
+_STRS: Final = _single_col_df(String.value, "spam", "eggs", "foo", "bar")  # Shared fixture: four strings; "foo" is the pivot used by the comparison tests.
+def test_comparsions() -> None:  # Filters each fixture with gt/lt/ge/le/eq and compares with the expected frame. NOTE(review): name is misspelled ("comparisons").
+    assert _INTS.filter(Int.value.gt(3)).equals(_single_col_df(Int.value, 5, 4))  # Ints against the pivot 3; expected rows keep the fixture's order.
+    assert _INTS.filter(Int.value.lt(3)).equals(_single_col_df(Int.value, 1, 2))
+    assert _INTS.filter(Int.value.ge(3)).equals(_single_col_df(Int.value, 5, 4, 3))
+    assert _INTS.filter(Int.value.le(3)).equals(_single_col_df(Int.value, 1, 2, 3))
+    assert _INTS.filter(Int.value.eq(3)).equals(_single_col_df(Int.value, 3))
+    assert _FLOATS.filter(Float.value.gt(math.pi)).equals(  # Floats against the pivot `math.pi`, which is itself in the fixture.
+        _single_col_df(Float.value, 4.8, 4.0)
+    )
+    assert _FLOATS.filter(Float.value.lt(math.pi)).equals(
+        _single_col_df(Float.value, 1.2, 2.5)
+    )
+    assert _FLOATS.filter(Float.value.ge(math.pi)).equals(
+        _single_col_df(Float.value, 4.8, 4.0, math.pi)
+    )
+    assert _FLOATS.filter(Float.value.le(math.pi)).equals(
+        _single_col_df(Float.value, 1.2, 2.5, math.pi)
+    )
+    assert _FLOATS.filter(Float.value.eq(math.pi)).equals(
+        _single_col_df(Float.value, math.pi)
+    )
+    assert _STRS.filter(String.value.gt("foo")).equals(  # Strings against the pivot "foo"; expected results follow lexicographic ordering.
+        _single_col_df(String.value, "spam")
+    )
+    assert _STRS.filter(String.value.lt("foo")).equals(
+        _single_col_df(String.value, "eggs", "bar")
+    )
+    assert _STRS.filter(String.value.ge("foo")).equals(
+        _single_col_df(String.value, "spam", "foo")
+    )
+    assert _STRS.filter(String.value.le("foo")).equals(
+        _single_col_df(String.value, "eggs", "foo", "bar")
+    )
+    assert _STRS.filter(String.value.eq("foo")).equals(
+        _single_col_df(String.value, "foo")
+    )
+def test_arithmetic() -> None:  # Applies operator expressions through `with_columns` and compares with precomputed values.
+    assert _INTS.with_columns(Int.value * 2 + 5).equals(  # Expression is passed without `.to(...)`; the expected frame is still an `Int.value` column.
+        _single_col_df(Int.value, 7, 15, 9, 13, 11)
+    )
+    assert _FLOATS.with_columns(Float.value**2 / 2).equals(  # Float power and division; the last expected value mirrors the expression on `math.pi`.
+        _single_col_df(Float.value, 0.72, 11.52, 3.125, 8.0, math.pi**2 / 2)
+    )
+    assert _STRS.with_columns(String.value + "!").equals(  # `+` with a `str` operand appends the suffix to every value.
+        _single_col_df(String.value, "spam!", "eggs!", "foo!", "bar!")
+    )
+def test_lit() -> None:  # Uses `tp.lit(...)` as the left operand, then targets the column explicitly with `.to(...)`.
+    assert _INTS.with_columns((tp.lit(5) + Int.value * 2).to(Int.value)).equals(  # Same expected values as `Int.value * 2 + 5`.
+        _single_col_df(Int.value, 7, 15, 9, 13, 11)
+    )
+    # The literal text is the column's own name ("value"); the expected output shows it is used as a plain string prefix
+    assert _STRS.with_columns((tp.lit("value") + String.value).to(String.value)).equals(
+        _single_col_df(String.value, "valuespam", "valueeggs", "valuefoo", "valuebar")
+    )
+def test_when() -> None:
+    assert _INTS.with_columns(
+        tp.when(Int.value.gt(3)).then(Int.value / 2).otherwise(Int.value + 1)
+    ).equals(_single_col_df(Int.value, 2, 2, 3, 2, 4))
+    # Use a column name to make sure it doesn't pick that up
+    assert _STRS.with_columns(
+        tp.when(String.value.str.len_chars().gt(3)).then(String.value)
+    ).equals(_single_col_df(String.value, "spam", "eggs", None, None))