dataframely 2.8.0__tar.gz → 2.8.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataframely-2.8.2/.github/copilot-instructions.md +53 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/.github/dependabot.yml +2 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/.github/workflows/build.yml +2 -5
- dataframely-2.8.2/.github/workflows/chore-main.yml +21 -0
- dataframely-2.8.0/.github/workflows/chore.yml → dataframely-2.8.2/.github/workflows/chore-pr.yml +7 -14
- {dataframely-2.8.0 → dataframely-2.8.2}/.github/workflows/ci.yml +2 -2
- {dataframely-2.8.0 → dataframely-2.8.2}/.github/workflows/copilot-setup-steps.yml +1 -1
- {dataframely-2.8.0 → dataframely-2.8.2}/.github/workflows/scorecard.yml +1 -1
- {dataframely-2.8.0 → dataframely-2.8.2}/PKG-INFO +1 -1
- dataframely-2.8.2/SKILL.md +238 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/_base_schema.py +31 -1
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/conf.py +0 -1
- dataframely-2.8.2/docs/guides/coding-agents.md +75 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/guides/index.md +1 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/pyproject.toml +1 -1
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/schema/test_base.py +19 -0
- dataframely-2.8.0/.github/copilot-instructions.md +0 -237
- {dataframely-2.8.0 → dataframely-2.8.2}/.copier-answers.yml +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/.envrc +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/.gitattributes +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/.github/CODEOWNERS +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/.github/instructions/tests.instructions.md +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/.github/release-drafter.yml +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/.github/workflows/nightly.yml +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/.gitignore +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/.pre-commit-config.yaml +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/.prettierignore +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/.prettierrc +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/.readthedocs.yml +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/Cargo.lock +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/Cargo.toml +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/LICENSE +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/README.md +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/SECURITY.md +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/__init__.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/_compat.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/_deprecation.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/_filter.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/_match_to_schema.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/_native.pyi +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/_plugin.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/_polars.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/_pydantic.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/_rule.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/_serialization.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/_storage/__init__.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/_storage/_base.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/_storage/_exc.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/_storage/_fsspec.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/_storage/constants.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/_storage/delta.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/_storage/parquet.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/_typing.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/collection/__init__.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/collection/_base.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/collection/collection.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/collection/filter_result.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/columns/__init__.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/columns/_base.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/columns/_mixins.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/columns/_registry.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/columns/_utils.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/columns/any.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/columns/array.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/columns/binary.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/columns/bool.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/columns/categorical.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/columns/datetime.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/columns/decimal.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/columns/enum.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/columns/float.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/columns/integer.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/columns/list.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/columns/object.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/columns/string.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/columns/struct.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/config.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/exc.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/experimental/__init__.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/experimental/infer_schema.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/filter_result.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/functional.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/py.typed +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/random.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/schema.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/testing/__init__.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/testing/const.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/testing/factory.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/testing/mask.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/testing/rules.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/dataframely/testing/storage.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docker-compose.yml +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/_static/custom.css +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/_static/favicon.ico +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/_templates/autosummary/class.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/_templates/autosummary/method.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/_templates/classes/column.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/_templates/classes/error.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/_templates/classes/filter_result.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/api/collection/generation.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/api/collection/index.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/api/collection/io.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/api/collection/metadata.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/api/collection/operations.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/api/collection/validation.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/api/columns/index.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/api/errors/index.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/api/experimental/index.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/api/filter_result/failure_info.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/api/filter_result/index.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/api/index.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/api/misc/index.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/api/schema/conversion.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/api/schema/generation.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/api/schema/index.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/api/schema/io.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/api/schema/metadata.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/api/schema/validation.rst +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/css/custom.css +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/guides/development.md +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/guides/examples/index.md +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/guides/examples/real-world.ipynb +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/guides/faq.md +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/guides/features/column-metadata.md +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/guides/features/data-generation.md +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/guides/features/index.md +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/guides/features/lazy-validation.md +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/guides/features/primary-keys.md +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/guides/features/serialization.md +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/guides/features/sql-generation.md +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/guides/migration/index.md +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/guides/migration/v1-v2.md +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/guides/quickstart.md +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/docs/index.md +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/pixi.lock +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/pixi.toml +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/rust-toolchain.toml +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/src/lib.rs +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/src/polars_plugin/mod.rs +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/src/polars_plugin/rule_failure.rs +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/src/polars_plugin/utils.rs +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/src/polars_plugin/validation_error.rs +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/src/regex/errdefs.rs +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/src/regex/mod.rs +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/src/regex/repr.rs +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/benches/conftest.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/benches/test_collection.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/benches/test_failure.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/benches/test_schema.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/collection/test_base.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/collection/test_cast.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/collection/test_collection_future_annotations.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/collection/test_create_empty.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/collection/test_filter_one_to_n.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/collection/test_filter_validate.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/collection/test_ignore_in_filter.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/collection/test_implementation.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/collection/test_join.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/collection/test_matches.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/collection/test_optional_members.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/collection/test_propagate_row_failures.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/collection/test_repr.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/collection/test_sample.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/collection/test_serialization.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/collection/test_storage.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/collection/test_validate_input.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/column_types/__init__.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/column_types/test_any.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/column_types/test_array.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/column_types/test_binary.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/column_types/test_datetime.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/column_types/test_decimal.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/column_types/test_enum.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/column_types/test_float.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/column_types/test_integer.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/column_types/test_list.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/column_types/test_object.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/column_types/test_string.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/column_types/test_struct.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/columns/__init__.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/columns/test_alias.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/columns/test_base.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/columns/test_check.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/columns/test_default_dtypes.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/columns/test_matches.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/columns/test_metadata.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/columns/test_polars_schema.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/columns/test_pyarrow.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/columns/test_rules.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/columns/test_sample.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/columns/test_sqlalchemy_columns.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/columns/test_str.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/columns/test_utils.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/conftest.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/core_validation/__init__.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/core_validation/test_match_to_schema.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/core_validation/test_rule_evaluation.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/experimental/test_infer_schema.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/failure_info/test_storage.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/functional/test_concat.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/functional/test_relationships.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/schema/test_cast.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/schema/test_create_empty.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/schema/test_create_empty_if_none.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/schema/test_filter.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/schema/test_inheritance.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/schema/test_matches.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/schema/test_read_write_parquet.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/schema/test_repr.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/schema/test_rule_implementation.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/schema/test_sample.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/schema/test_serialization.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/schema/test_storage.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/schema/test_validate.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/storage/test_delta.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/storage/test_fsspec.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/test_compat.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/test_config.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/test_deprecation.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/test_factory.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/test_native_regex.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/test_pydantic.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/test_random.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/test_serialization.py +0 -0
- {dataframely-2.8.0 → dataframely-2.8.2}/tests/test_typing.py +0 -0
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Dataframely
|
|
2
|
+
|
|
3
|
+
## Package Management
|
|
4
|
+
|
|
5
|
+
This repository uses the Pixi package manager. When editing `pixi.toml`, run `pixi lock` afterwards.
|
|
6
|
+
|
|
7
|
+
When running any commands (like `pytest`), prepend them with `pixi run`.
|
|
8
|
+
|
|
9
|
+
## Code Style
|
|
10
|
+
|
|
11
|
+
### Documentation
|
|
12
|
+
|
|
13
|
+
- Document all public functions/methods and classes using docstrings
|
|
14
|
+
- For functions & methods, use Google Docstrings and include `Args` (if there are any arguments) and `Returns` (if
|
|
15
|
+
there is a return type).
|
|
16
|
+
- Do not include type hints in the docstrings
|
|
17
|
+
- Do not mention default values in the docstrings
|
|
18
|
+
- Do not write docstrings for private functions/methods unless the function is highly complex
|
|
19
|
+
|
|
20
|
+
### License Headers
|
|
21
|
+
|
|
22
|
+
Do not manually adjust or add license headers. A pre-commit hook will take care of this.
|
|
23
|
+
|
|
24
|
+
## Testing
|
|
25
|
+
|
|
26
|
+
- Never use classes for pytest, but only free functions
|
|
27
|
+
- Do not put `__init__.py` files into test directories
|
|
28
|
+
- Tests should not have docstrings unless they are very complicated or very specific, i.e. warrant a description beyond
|
|
29
|
+
the test's name
|
|
30
|
+
- All tests should follow the arrange-act-assert pattern. The respective logical blocks should be distinguished via
|
|
31
|
+
code comments as follows:
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
def test_method() -> None:
|
|
35
|
+
# Arrange
|
|
36
|
+
...
|
|
37
|
+
|
|
38
|
+
# Act
|
|
39
|
+
...
|
|
40
|
+
|
|
41
|
+
# Assert
|
|
42
|
+
...
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
- If two or more tests are structurally equivalent, they should be merged into a single test and parametrized with
|
|
46
|
+
`@pytest.mark.parametrize`
|
|
47
|
+
- If at least two tests share the same logic in the "arrange" step, the respective logic should be extracted into a
|
|
48
|
+
fixture
|
|
49
|
+
|
|
50
|
+
## Reviewing
|
|
51
|
+
|
|
52
|
+
When reviewing code changes, make sure that the `SKILL.md` is up-to-date and in line with the public API of this
|
|
53
|
+
package.
|
|
@@ -57,14 +57,11 @@ jobs:
|
|
|
57
57
|
environments: build
|
|
58
58
|
- name: Set version
|
|
59
59
|
run: pixi run -e build set-version
|
|
60
|
-
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
|
|
61
|
-
with:
|
|
62
|
-
python-version: "3.10"
|
|
63
60
|
- name: Build wheel
|
|
64
61
|
uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1.50.1
|
|
65
62
|
with:
|
|
66
63
|
command: build
|
|
67
|
-
args: --out dist --release
|
|
64
|
+
args: --out dist --release -i python3.10
|
|
68
65
|
manylinux: auto
|
|
69
66
|
sccache: true
|
|
70
67
|
- name: Check package
|
|
@@ -84,7 +81,7 @@ jobs:
|
|
|
84
81
|
id-token: write
|
|
85
82
|
environment: pypi
|
|
86
83
|
steps:
|
|
87
|
-
- uses: actions/download-artifact@
|
|
84
|
+
- uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
|
|
88
85
|
with:
|
|
89
86
|
path: dist
|
|
90
87
|
merge-multiple: true
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
name: Chore
|
|
2
|
+
on:
|
|
3
|
+
push:
|
|
4
|
+
branches: [main]
|
|
5
|
+
|
|
6
|
+
concurrency:
|
|
7
|
+
group: ${{ github.workflow }}-${{ github.ref }}
|
|
8
|
+
cancel-in-progress: true
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
draft-release:
|
|
12
|
+
name: Draft Release
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
permissions:
|
|
15
|
+
contents: write
|
|
16
|
+
pull-requests: read
|
|
17
|
+
steps:
|
|
18
|
+
- name: Update release draft
|
|
19
|
+
uses: release-drafter/release-drafter@139054aeaa9adc52ab36ddf67437541f039b88e2 # v7.1.1
|
|
20
|
+
with:
|
|
21
|
+
token: ${{ github.token }}
|
dataframely-2.8.0/.github/workflows/chore.yml → dataframely-2.8.2/.github/workflows/chore-pr.yml
RENAMED
|
@@ -3,8 +3,6 @@ on:
|
|
|
3
3
|
pull_request_target:
|
|
4
4
|
branches: [main]
|
|
5
5
|
types: [opened, reopened, edited, synchronize]
|
|
6
|
-
push:
|
|
7
|
-
branches: [main]
|
|
8
6
|
|
|
9
7
|
concurrency:
|
|
10
8
|
group: ${{ github.workflow }}-${{ github.ref }}
|
|
@@ -13,7 +11,6 @@ concurrency:
|
|
|
13
11
|
jobs:
|
|
14
12
|
check-pr-title:
|
|
15
13
|
name: Check PR Title
|
|
16
|
-
if: github.event_name == 'pull_request_target'
|
|
17
14
|
runs-on: ubuntu-latest
|
|
18
15
|
permissions:
|
|
19
16
|
contents: read
|
|
@@ -28,7 +25,7 @@ jobs:
|
|
|
28
25
|
GITHUB_TOKEN: ${{ github.token }}
|
|
29
26
|
- name: Post comment about invalid PR title
|
|
30
27
|
if: failure()
|
|
31
|
-
uses: marocchino/sticky-pull-request-comment@
|
|
28
|
+
uses: marocchino/sticky-pull-request-comment@70d2764d1a7d5d9560b100cbea0077fc8f633987 # v3.0.2
|
|
32
29
|
with:
|
|
33
30
|
header: conventional-commit-pr-title
|
|
34
31
|
message: |
|
|
@@ -45,22 +42,18 @@ jobs:
|
|
|
45
42
|
</details>
|
|
46
43
|
- name: Delete comment about invalid PR title
|
|
47
44
|
if: success()
|
|
48
|
-
uses: marocchino/sticky-pull-request-comment@
|
|
45
|
+
uses: marocchino/sticky-pull-request-comment@70d2764d1a7d5d9560b100cbea0077fc8f633987 # v3.0.2
|
|
49
46
|
with:
|
|
50
47
|
header: conventional-commit-pr-title
|
|
51
48
|
delete: true
|
|
52
49
|
|
|
53
|
-
|
|
54
|
-
name:
|
|
50
|
+
assign-labels:
|
|
51
|
+
name: Assign Labels
|
|
55
52
|
runs-on: ubuntu-latest
|
|
56
53
|
permissions:
|
|
57
|
-
contents: write
|
|
58
54
|
pull-requests: write
|
|
59
55
|
steps:
|
|
60
|
-
- name:
|
|
61
|
-
uses: release-drafter/release-drafter@
|
|
56
|
+
- name: Assign labels
|
|
57
|
+
uses: release-drafter/release-drafter/autolabeler@139054aeaa9adc52ab36ddf67437541f039b88e2 # v7.1.1
|
|
62
58
|
with:
|
|
63
|
-
|
|
64
|
-
disable-autolabeler: ${{ github.event_name == 'push' }}
|
|
65
|
-
env:
|
|
66
|
-
GITHUB_TOKEN: ${{ github.token }}
|
|
59
|
+
token: ${{ github.token }}
|
|
@@ -30,7 +30,7 @@ jobs:
|
|
|
30
30
|
- name: Install Rust
|
|
31
31
|
run: rustup show
|
|
32
32
|
- name: Cache Rust dependencies
|
|
33
|
-
uses: Swatinem/rust-cache@
|
|
33
|
+
uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1
|
|
34
34
|
- name: pre-commit
|
|
35
35
|
run: pixi run pre-commit-run --color=always --show-diff-on-failure
|
|
36
36
|
|
|
@@ -71,7 +71,7 @@ jobs:
|
|
|
71
71
|
- name: Install Rust
|
|
72
72
|
run: rustup show
|
|
73
73
|
- name: Cache Rust dependencies
|
|
74
|
-
uses: Swatinem/rust-cache@
|
|
74
|
+
uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1
|
|
75
75
|
- name: Install repository
|
|
76
76
|
run: pixi run -e ${{ matrix.environment }} postinstall
|
|
77
77
|
- name: Run pytest
|
|
@@ -21,6 +21,6 @@ jobs:
|
|
|
21
21
|
- name: Install Rust
|
|
22
22
|
run: rustup show
|
|
23
23
|
- name: Cache Rust dependencies
|
|
24
|
-
uses: Swatinem/rust-cache@
|
|
24
|
+
uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1
|
|
25
25
|
- name: Install repository
|
|
26
26
|
run: pixi run postinstall
|
|
@@ -74,6 +74,6 @@ jobs:
|
|
|
74
74
|
# Upload the results to GitHub's code scanning dashboard (optional).
|
|
75
75
|
# Commenting out will disable upload of results to your repo's Code Scanning dashboard
|
|
76
76
|
- name: "Upload to code-scanning"
|
|
77
|
-
uses: github/codeql-action/upload-sarif@89a39a4e59826350b863aa6b6252a07ad50cf83e #
|
|
77
|
+
uses: github/codeql-action/upload-sarif@89a39a4e59826350b863aa6b6252a07ad50cf83e # v4.32.4
|
|
78
78
|
with:
|
|
79
79
|
sarif_file: results.sarif
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: dataframely
|
|
3
|
+
description: Best practices for polars data processing with dataframely. Covers definitions of Schema and Collection, usage of
|
|
4
|
+
.validate() and .filter(), type hints, and testing. Use when writing or modifying code involving dataframely or
|
|
5
|
+
polars data frames.
|
|
6
|
+
license: BSD-3-Clause
|
|
7
|
+
user-invocable: false
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Overview
|
|
11
|
+
|
|
12
|
+
`dataframely` provides two types:
|
|
13
|
+
|
|
14
|
+
- `dy.Schema` documents and enforces the structure of a single data frame
|
|
15
|
+
- `dy.Collection` documents and enforces the relationships between multiple related data frames that each have their
|
|
16
|
+
own `dy.Schema`
|
|
17
|
+
|
|
18
|
+
## `dy.Schema`
|
|
19
|
+
|
|
20
|
+
A subclass of `dy.Schema` describes the structure of a single dataframe.
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
class MyHouseSchema(dy.Schema):
|
|
24
|
+
"""A schema for a dataframe describing houses."""
|
|
25
|
+
|
|
26
|
+
street = dy.String(primary_key=True)
|
|
27
|
+
number = dy.UInt16(primary_key=True)
|
|
28
|
+
#: Description on the number of rooms.
|
|
29
|
+
rooms = dy.UInt8()
|
|
30
|
+
#: Description on the area of the house.
|
|
31
|
+
area = dy.UInt16()
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
The schema can be used in type hints via `dy.DataFrame[MyHouseSchema]` and `dy.LazyFrame[MyHouseSchema]` to express
|
|
35
|
+
schema adherence statically. It can also be used to validate the structure and contents of a data frame at runtime
|
|
36
|
+
using validation and filtering.
|
|
37
|
+
|
|
38
|
+
`dy.DataFrame[...]` and `dy.LazyFrame[...]` are typically referred to as "typed data frames". They are typing-only
|
|
39
|
+
wrappers around `pl.DataFrame` and `pl.LazyFrame`, respectively, and only express intent. They are never initialized at
|
|
40
|
+
runtime.
|
|
41
|
+
|
|
42
|
+
### Defining Constraints
|
|
43
|
+
|
|
44
|
+
Persist all implicit assumptions on the data as constraints in the schema. Use docstrings purely to answer the "what"
|
|
45
|
+
about the column contents.
|
|
46
|
+
|
|
47
|
+
- Use the most specific type possible for each column (e.g. `dy.Enum` instead of `dy.String` when applicable).
|
|
48
|
+
- Use pre-defined arguments (e.g. `nullable`, `min`, `regex`) for column-level constraints if possible.
|
|
49
|
+
- Use the `check` argument for non-standard column-level constraints that cannot be expressed using pre-defined
|
|
50
|
+
arguments. Prefer defining the check as a dictionary with keys describing the type of check:
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
class MySchema(dy.Schema):
|
|
54
|
+
col = dy.UInt8(check={"divisible_by_two": lambda col: (col % 2) == 0})
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
- Use rules (i.e. methods decorated with `@dy.rule`) for cross-column constraints. Use expressive names for the rules
|
|
58
|
+
and use `cls` to refer to the schema:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
class MySchema(dy.Schema):
|
|
62
|
+
col1 = dy.UInt8()
|
|
63
|
+
col2 = dy.UInt8()
|
|
64
|
+
|
|
65
|
+
@dy.rule()
|
|
66
|
+
def col1_greater_col2(cls) -> pl.Expr:
|
|
67
|
+
return cls.col1.col > cls.col2.col
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
- Use group rules (i.e. methods decorated with `@dy.rule(group_by=...)`) for cross-row constraints beyond primary key
|
|
71
|
+
checks.
|
|
72
|
+
|
|
73
|
+
### Referencing Columns
|
|
74
|
+
|
|
75
|
+
When referencing columns of the schema anywhere in the code, always reference the column as an attribute of the schema class:
|
|
76
|
+
|
|
77
|
+
- Use `Schema.column.col` instead of `pl.col("column")` to obtain a `pl.Expr` referencing the column.
|
|
78
|
+
- Use `Schema.column.name` to reference the column name as a string.
|
|
79
|
+
|
|
80
|
+
This allows for easier refactorings and enables lookups on column definitions and constraints via LSP.
|
|
81
|
+
|
|
82
|
+
## `dy.Collection`
|
|
83
|
+
|
|
84
|
+
A subclass of `dy.Collection` describes a set of related data frames, each described by a `dy.Schema`. Data frames in a
|
|
85
|
+
collection should share at least a subset of their primary key.
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
class MyStreetSchema(dy.Schema):
|
|
89
|
+
"""A schema for a dataframe describing streets."""
|
|
90
|
+
|
|
91
|
+
# Shared primary key component with MyHouseSchema
|
|
92
|
+
street = dy.String(primary_key=True)
|
|
93
|
+
city = dy.String()
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class MyCollection(dy.Collection):
|
|
97
|
+
"""A collection of related dataframes."""
|
|
98
|
+
|
|
99
|
+
houses: dy.LazyFrame[MyHouseSchema]
|
|
100
|
+
streets: dy.LazyFrame[MyStreetSchema]
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
The collection can be used in a standalone manner (much like a dataclass). It can also be used to validate the
|
|
104
|
+
structure and contents of its members and their relationships at runtime using validation and filtering.
|
|
105
|
+
|
|
106
|
+
### Defining Constraints
|
|
107
|
+
|
|
108
|
+
Persist all implicit assumptions about the relationships between the collections' data frames as constraints in the
|
|
109
|
+
collection.
|
|
110
|
+
|
|
111
|
+
- Use filters (i.e. methods decorated with `@dy.filter`) to enforce assumptions about the relationships (e.g. 1:1, 1:N)
|
|
112
|
+
between the collections' data frames. Leverage `dy.functional` for writing filter logic.
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
class MyCollection(dy.Collection):
|
|
116
|
+
houses: dy.LazyFrame[MyHouseSchema]
|
|
117
|
+
streets: dy.LazyFrame[MyStreetSchema]
|
|
118
|
+
|
|
119
|
+
@dy.filter()
|
|
120
|
+
def all_houses_on_known_streets(cls) -> pl.LazyFrame:
|
|
121
|
+
return dy.functional.require_relationship_one_to_at_least_one(
|
|
122
|
+
cls.streets, cls.houses, on="street"
|
|
123
|
+
)
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
# Usage Conventions
|
|
127
|
+
|
|
128
|
+
## Clear Interfaces
|
|
129
|
+
|
|
130
|
+
Structure data processing code with clear interfaces documented using `dataframely` type hints:
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
def preprocess(raw: dy.LazyFrame[MyRawSchema]) -> dy.DataFrame[MyPreprocessedSchema]:
|
|
134
|
+
# Internal data frames do not require schemas
|
|
135
|
+
df: pl.LazyFrame = ...
|
|
136
|
+
return MyPreprocessedSchema.validate(df, cast=True)
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
- Use schemas for all input and output data frames in a function. Omit type hints if the function is a private helper
|
|
140
|
+
(prefixed with `_`) unless the schema critically improves readability or testability.
|
|
141
|
+
- Omit schemas for short-lived temporary data frames. Never define schemas for function-local data frames.
|
|
142
|
+
|
|
143
|
+
## Validation and Filtering
|
|
144
|
+
|
|
145
|
+
Both `.validate` and `.filter` enforce the schema at runtime. Pass `cast=True` for safe type-casting.
|
|
146
|
+
|
|
147
|
+
- **`Schema.validate`** — raises on failure. Use when failures are unexpected (e.g. transforming already-validated
|
|
148
|
+
data).
|
|
149
|
+
- **`Schema.filter`** — returns valid rows plus a `FailureInfo` describing filtered-out rows. Use when failures are
|
|
150
|
+
possible and should be handled gracefully. Failures should either be kept around or logged for introspection. The
|
|
151
|
+
`FailureInfo` object provides several utility methods to obtain information about the failures:
|
|
152
|
+
- `len(failure)` provides the total number of failures
|
|
153
|
+
- `failure.counts()` provides the number of violations by rule
|
|
154
|
+
- `failure.invalid()` provides the data frame of invalid rows
|
|
155
|
+
- `failure.details()` provides the data frame of invalid rows with additional columns providing information on which
|
|
156
|
+
rules were violated
|
|
157
|
+
|
|
158
|
+
When performing validation or filtering, prefer using `pipe` to clarify the flow of data:
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
result = df.pipe(MySchema.validate)
|
|
162
|
+
out, failures = df.pipe(MySchema.filter)
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
### Pure Casting
|
|
166
|
+
|
|
167
|
+
Use `Schema.cast` as an escape-hatch when it is already known that the data frame conforms to the schema and the
|
|
168
|
+
runtime cost of the validation should not be incurred. Generally, prefer using `Schema.validate` or `Schema.filter`.
|
|
169
|
+
|
|
170
|
+
## Testing
|
|
171
|
+
|
|
172
|
+
Unless otherwise specified by the user or the project context, add unit tests for all (non-private) methods performing
|
|
173
|
+
data transformations.
|
|
174
|
+
|
|
175
|
+
- Do not test properties already guaranteed by the schema (e.g. data types, nullability, value constraints).
|
|
176
|
+
|
|
177
|
+
### Test structure
|
|
178
|
+
|
|
179
|
+
Write tests with the following structure:
|
|
180
|
+
|
|
181
|
+
1. "Arrange": Define synthetic input data and expected output
|
|
182
|
+
2. "Act": Execute the transformation
|
|
183
|
+
3. "Assert": Compare expected and actual output using `assert_frame_equal` from `polars.testing`
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
from polars.testing import assert_frame_equal
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def test_grouped_sum():
|
|
190
|
+
df = pl.DataFrame({
|
|
191
|
+
"col1": [1, 2, 3],
|
|
192
|
+
"col2": ["a", "a", "b"],
|
|
193
|
+
}).pipe(MyInputSchema.validate, cast=True)
|
|
194
|
+
|
|
195
|
+
expected = pl.DataFrame({
|
|
196
|
+
"col1": ["a", "b"],
|
|
197
|
+
"col2": [3, 3],
|
|
198
|
+
})
|
|
199
|
+
|
|
200
|
+
result = my_code(df)
|
|
201
|
+
|
|
202
|
+
assert_frame_equal(expected, result)
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
### Generating Synthetic Test Data
|
|
206
|
+
|
|
207
|
+
Use `dataframely`'s synthetic data generation for creating inputs to functions requiring typed data frames in their
|
|
208
|
+
input. Generate synthetic data for schemas as follows:
|
|
209
|
+
|
|
210
|
+
- Use `MySchema.sample(num_rows=...)` to generate fully random data when exact contents don't matter.
|
|
211
|
+
- Use `MySchema.sample(overrides=...)` to generate random data with specific columns pinned to certain values for
|
|
212
|
+
testing specific functionality. Prefer using dicts of lists for overrides unless specifically prompted otherwise.
|
|
213
|
+
- When using dicts of lists: for providing overrides that are constant across all rows, provide scalar values instead
|
|
214
|
+
of lists of equal values.
|
|
215
|
+
- Always use `MySchema.create_empty()` instead of sampling with empty overrides when an empty data frame is needed.
|
|
216
|
+
|
|
217
|
+
Synthetic data for collections should be generated as follows:
|
|
218
|
+
|
|
219
|
+
- Use `MyCollection.sample(num_rows=...)` to generate fully random data when exact contents don't matter.
|
|
220
|
+
- Use `MyCollection.sample(overrides=...)` to generate random data where certain values of the collection members
|
|
221
|
+
matter. Use lists of dicts for providing overrides as "objects" spanning the collection members.
|
|
222
|
+
- Values for shared primary keys must be provided at the root of the dictionaries
|
|
223
|
+
- Values for individual collection members must be provided in nested dictionaries under the keys corresponding to
|
|
224
|
+
the collection member names.
|
|
225
|
+
- Always use `MyCollection.create_empty()` instead of sampling with empty overrides when an empty collection is needed.
|
|
226
|
+
|
|
227
|
+
## I/O Conventions
|
|
228
|
+
|
|
229
|
+
When writing typed data frames to disk, prefer using `MySchema.write_...` instead of using `write_...` directly on the
|
|
230
|
+
data frame. This ensures that schema metadata is persisted alongside the data and can be leveraged when reading the
|
|
231
|
+
data back in.
|
|
232
|
+
|
|
233
|
+
When reading typed data frames from disk, prefer using `MySchema.read_...` instead of using `pl.read_...` directly from `polars`. This allows leveraging the schema metadata persisted alongside the data.
|
|
234
|
+
|
|
235
|
+
# Getting more information
|
|
236
|
+
|
|
237
|
+
`dataframely` provides clear function signatures, type hints and docstrings for the full public API. For more
|
|
238
|
+
information, inspect the source code in the site packages. If available, always use the LSP tool to find documentation.
|
|
@@ -119,7 +119,9 @@ class SchemaMeta(ABCMeta):
|
|
|
119
119
|
result = Metadata()
|
|
120
120
|
for base in bases:
|
|
121
121
|
result.update(mcs._get_metadata_recursively(base))
|
|
122
|
-
|
|
122
|
+
namespace_metadata = mcs._get_metadata(namespace)
|
|
123
|
+
mcs._remove_overridden_columns(result, namespace, bases)
|
|
124
|
+
result.update(namespace_metadata)
|
|
123
125
|
namespace[_COLUMN_ATTR] = result.columns
|
|
124
126
|
cls = super().__new__(mcs, name, bases, namespace, *args, **kwargs)
|
|
125
127
|
|
|
@@ -207,6 +209,34 @@ class SchemaMeta(ABCMeta):
|
|
|
207
209
|
val._name = val.alias or name
|
|
208
210
|
return val
|
|
209
211
|
|
|
212
|
+
@staticmethod
|
|
213
|
+
def _remove_overridden_columns(
|
|
214
|
+
result: Metadata,
|
|
215
|
+
namespace: dict[str, Any],
|
|
216
|
+
bases: tuple[type[object], ...],
|
|
217
|
+
) -> None:
|
|
218
|
+
"""Remove inherited columns that the child namespace explicitly overrides.
|
|
219
|
+
|
|
220
|
+
Before merging the child namespace, we must drop any parent columns whose
|
|
221
|
+
attribute name is redefined in the child. This allows subclasses to redefine
|
|
222
|
+
inherited columns while still detecting genuine alias conflicts.
|
|
223
|
+
|
|
224
|
+
In multiple-inheritance scenarios, the same attribute name may appear in more
|
|
225
|
+
than one base with different aliases, so we walk all parent MROs and collect
|
|
226
|
+
every matching column key to remove.
|
|
227
|
+
"""
|
|
228
|
+
for attr, value in namespace.items():
|
|
229
|
+
if not isinstance(value, Column):
|
|
230
|
+
continue
|
|
231
|
+
keys_to_remove: set[str] = set()
|
|
232
|
+
for base in bases:
|
|
233
|
+
for parent_cls in base.__mro__:
|
|
234
|
+
parent_col = parent_cls.__dict__.get(attr)
|
|
235
|
+
if parent_col is not None and isinstance(parent_col, Column):
|
|
236
|
+
keys_to_remove.add(parent_col.alias or attr)
|
|
237
|
+
for parent_key in keys_to_remove:
|
|
238
|
+
result.columns.pop(parent_key, None)
|
|
239
|
+
|
|
210
240
|
@staticmethod
|
|
211
241
|
def _get_metadata_recursively(kls: type[object]) -> Metadata:
|
|
212
242
|
result = Metadata()
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# Using `dataframely` with coding agents
|
|
2
|
+
|
|
3
|
+
Coding agents like [Claude Code](https://code.claude.com/), [Codex](https://openai.com/codex/) and
|
|
4
|
+
[GitHub Copilot](https://github.com/features/copilot) are particularly powerful when two criteria are met:
|
|
5
|
+
|
|
6
|
+
1. The agent has access to the full context required to solve the problem, i.e. does not have to guess.
|
|
7
|
+
2. The results of the agent's work can be easily verified.
|
|
8
|
+
|
|
9
|
+
When writing data processing logic, `dataframely` helps to fulfill these criteria.
|
|
10
|
+
|
|
11
|
+
To help your coding agent write idiomatic `dataframely` code, we provide a `dataframely`
|
|
12
|
+
[skill](https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/SKILL.md) following the
|
|
13
|
+
[`agentskills.io` spec](https://agentskills.io/specification). You can install it by placing it where your agent can
|
|
14
|
+
find it. For example, if you are using Claude Code:
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
mkdir -p .claude/skills/dataframely/
|
|
18
|
+
curl -o .claude/skills/dataframely/SKILL.md https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/SKILL.md
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
or if you are using [skills.sh](https://skills.sh/) to manage your skills:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
npx skills add Quantco/dataframely
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
Refer to the documentation of your coding agent for instructions on how to add custom skills.
|
|
28
|
+
|
|
29
|
+
## Tell the agent about your data with `dataframely` schemas
|
|
30
|
+
|
|
31
|
+
`dataframely` schemas provide a clear format for documenting dataframe structure and contents, which helps coding
|
|
32
|
+
agents understand your code base. We recommend structuring your data processing code using clear interfaces that are
|
|
33
|
+
documented using `dataframely` type hints. This streamlines your coding agent's ability to find the right schema at the
|
|
34
|
+
right time.
|
|
35
|
+
|
|
36
|
+
For example:
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
def preprocess(raw: dy.LazyFrame[MyRawSchema]) -> dy.DataFrame[MyPreprocessedSchema]:
|
|
40
|
+
...
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
gives a coding agent much more information than the schema-less alternative:
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
def load_data(raw: pl.LazyFrame) -> pl.DataFrame:
|
|
47
|
+
...
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
This convention also makes your code more readable and maintainable for human developers.
|
|
51
|
+
|
|
52
|
+
If there is additional domain information that is not natively expressed through the structure of the schema, we
|
|
53
|
+
recommend documenting this as docstrings on the definition of the schema columns. One common example would be the
|
|
54
|
+
semantic meanings of enum values referring to conventions in the data:
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
class HospitalStaySchema(dy.Schema):
|
|
58
|
+
# Reason for admission to the hospital
|
|
59
|
+
# N = Emergency
|
|
60
|
+
# V = Transfer from another hospital
|
|
61
|
+
# ...
|
|
62
|
+
admission_reason = dy.Enum(["N", "V", ...])
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Verifying results
|
|
66
|
+
|
|
67
|
+
`dataframely` supports you and your coding agent in writing unit tests for individual pieces of logic. One significant
|
|
68
|
+
bottleneck is the generation of appropriate test data. Check out
|
|
69
|
+
[our documentation on synthetic data generation](./features/data-generation.md) to see how `dataframely` can help you
|
|
70
|
+
generate realistic test data that meets the constraints of your schema. We recommend requiring your coding agent to
|
|
71
|
+
write tests using this functionality to verify its work.
|
|
72
|
+
|
|
73
|
+
<!-- prettier-ignore -->
|
|
74
|
+
> [!NOTE]
|
|
75
|
+
> The official skill already tells your coding agent how to best write unit tests with dataframely.
|
|
@@ -27,7 +27,7 @@ description = "A declarative, polars-native data frame validation library"
|
|
|
27
27
|
name = "dataframely"
|
|
28
28
|
readme = "README.md"
|
|
29
29
|
requires-python = ">=3.10"
|
|
30
|
-
version = "2.8.0"
|
|
30
|
+
version = "2.8.2"
|
|
31
31
|
|
|
32
32
|
[project.optional-dependencies]
|
|
33
33
|
deltalake = ["deltalake"]
|
|
@@ -141,3 +141,22 @@ def test_user_error_polars_datatype_type() -> None:
|
|
|
141
141
|
class MySchemaWithPolarsDataTypeType(dy.Schema):
|
|
142
142
|
a = dy.Int32(nullable=False)
|
|
143
143
|
b = pl.String # User error: Used pl.String instead of dy.String()
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def test_override() -> None:
|
|
147
|
+
class FirstSchema(dy.Schema):
|
|
148
|
+
x = dy.Int64()
|
|
149
|
+
|
|
150
|
+
class SecondSchema(FirstSchema):
|
|
151
|
+
x = dy.Int64(nullable=True)
|
|
152
|
+
|
|
153
|
+
first_columns = FirstSchema.columns()
|
|
154
|
+
second_columns = SecondSchema.columns()
|
|
155
|
+
|
|
156
|
+
assert set(first_columns) == {"x"}
|
|
157
|
+
assert set(second_columns) == {"x"}
|
|
158
|
+
|
|
159
|
+
assert first_columns["x"].nullable is False
|
|
160
|
+
assert second_columns["x"].nullable is True
|
|
161
|
+
|
|
162
|
+
assert type(second_columns["x"]) is type(first_columns["x"])
|