matchbox-db 0.6.2__tar.gz → 0.6.3.dev53__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/.github/workflows/prerelease.yml +38 -8
- {matchbox_db-0.6.2/src/matchbox_db.egg-info → matchbox_db-0.6.3.dev53}/PKG-INFO +1 -1
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/_handler.py +5 -3
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/eval/utils.py +2 -6
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/models/models.py +2 -2
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/results.py +32 -47
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/common/eval.py +6 -6
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/common/factories/entities.py +2 -2
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/common/factories/models.py +11 -34
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/common/factories/scenarios.py +20 -9
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/common/factories/sources.py +1 -1
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/common/transform.py +35 -34
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/api/main.py +71 -3
- matchbox_db-0.6.3.dev53/src/matchbox/server/api/static/favicon.png +0 -0
- matchbox_db-0.6.3.dev53/src/matchbox/server/api/static/swagger-ui-bundle.js +2 -0
- matchbox_db-0.6.3.dev53/src/matchbox/server/api/static/swagger-ui.css +3 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/adapter.py +4 -1
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/utils/evaluation.py +9 -7
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53/src/matchbox_db.egg-info}/PKG-INFO +1 -1
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox_db.egg-info/SOURCES.txt +3 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/client/models/methodologies/test_linkers_deterministic.py +1 -1
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/client/models/methodologies/test_linkers_probabilistic.py +1 -2
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/client/test_results.py +7 -10
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/common/factories/test_entity_factory.py +6 -6
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/common/factories/test_model_factory.py +9 -6
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/common/factories/test_probability_generation.py +14 -20
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/common/test_eval.py +10 -11
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/e2e/test_e2e_evaluation.py +3 -2
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/server/api/routes/test_routes_main.py +96 -2
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/server/api/routes/test_routes_resolution.py +4 -4
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/server/test_adapter.py +2 -1
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/uv.lock +1499 -1499
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/.github/pull_request_template.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/.github/workflows/ci.yml +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/.github/workflows/release.yml +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/.gitignore +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/.pre-commit-config.yaml +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/.vscode/launch.json +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/.vscode/settings.json +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/LICENSE +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/README.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docker-compose.yml +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/client/dags.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/client/eval.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/client/index.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/client/models.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/client/queries.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/client/results.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/client/sources.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/common/arrow.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/common/db.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/common/dtos.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/common/eval.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/common/exceptions.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/common/factories/entities.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/common/factories/index.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/common/factories/models.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/common/factories/scenarios.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/common/factories/sources.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/common/graph.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/common/hash.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/common/index.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/common/logging.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/common/transform.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/server/api.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/server/backends/postgresql.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/server/index.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/api/server/uploads.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/assets/matchbox-icon-dark.png +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/assets/matchbox-icon.svg +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/assets/matchbox-logo-dark.svg +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/assets/matchbox-logo-light.svg +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/client/evaluation.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/client/explore-dags.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/client/install.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/client/link-data.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/client/look-up.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/contributing.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/index.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/server/concepts.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/server/install.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/server/risks.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/stylesheets/extra.css +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/docs/use-cases.md +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/environments/containers.env +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/environments/development.env +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/environments/sample_client.env +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/environments/sample_server.env +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/justfile +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/mkdocs.yml +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/pyproject.toml +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/setup.cfg +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/_settings.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/authorisation.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/dags.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/eval/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/eval/justfile +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/eval/mock_ui.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/eval/ui.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/models/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/models/comparison.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/models/dedupers/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/models/dedupers/base.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/models/dedupers/naive.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/models/linkers/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/models/linkers/base.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/models/linkers/deterministic.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/models/linkers/splinklinker.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/models/linkers/weighteddeterministic.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/queries.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/client/sources.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/common/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/common/arrow.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/common/db.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/common/dtos.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/common/exceptions.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/common/factories/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/common/factories/dags.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/common/graph.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/common/hash.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/common/logging.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/Dockerfile +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/api/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/api/dependencies.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/api/routers/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/api/routers/eval.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/api/routers/resolution.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/base.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/.gitkeep +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/alembic/env.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/alembic/script.py.mako +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/alembic/versions/05cc4181a0ad_removed_source_key_reference_and_added_.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/alembic/versions/1907c34cfa1f_create_tables_given_schema.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/alembic/versions/3754ae042254_move_orm_to_root_leaf_contains_structure.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/alembic/versions/40a8e5ed48f2_create_schema_without_tables.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/alembic/versions/4a7c35f86405_move_sourceconfigs_from_sourceaddress_.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/alembic/versions/7a2d1b10ac0f_switch_from_location_uri_to_name.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/alembic/versions/83b134a86713_simplify_resolution_naming_and_hashing.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/alembic/versions/95c0b5c23446_renaming_sources_to_source_config.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/alembic/versions/ae63f79f6b39_renamed_sourcecolumns_to_sourcefields.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/alembic/versions/b38d61ab11cc_add_index_to_the_clustersourcekey_table.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/alembic/versions/b694eb292dea_add_an_index_to_the_probabilities_.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/alembic/versions/beba75a24962_add_pkspace_table.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/alembic/versions/c4cb937d00f4_add_modelconfigs.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/alembic/versions/dd0c3a9ecdf9_add_migrations_for_first_eval_tables.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/alembic/versions/e4122bdf9b0d_renamed_primary_keys_to_just_keys.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/alembic/versions/f3c9279437f4_add_content_hash_to_resolutions.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/alembic.ini +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/db.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/justfile +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/mixin.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/orm.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/utils/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/utils/db.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/utils/insert.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/utils/query.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/postgresql/utils/results.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox/server/uploads.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox_db.egg-info/dependency_links.txt +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox_db.egg-info/requires.txt +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/src/matchbox_db.egg-info/top_level.txt +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/client/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/client/models/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/client/models/methodologies/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/client/models/methodologies/test_dedupers_deterministic.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/client/models/test_comparison.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/client/test_dags.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/client/test_eval.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/client/test_handler.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/client/test_models.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/client/test_queries.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/client/test_sources.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/common/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/common/factories/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/common/factories/test_linked_factory.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/common/factories/test_scenarios.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/common/factories/test_source_factory.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/common/factories/test_testkit_dag.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/common/test_dto.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/common/test_graph.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/common/test_hash.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/common/test_results.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/common/test_transform.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/conftest.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/data/all_companies.csv +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/e2e/test_e2e_dag.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/fixtures/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/fixtures/client.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/fixtures/db.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/fixtures/graph.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/justfile +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/server/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/server/api/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/server/api/routes/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/server/api/routes/test_routes_eval.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/server/postgresql/__init__.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/server/postgresql/test_pg_core.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/server/postgresql/test_pg_migrations.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/server/postgresql/test_pg_sql.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/server/test_uploads.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/test/utils.py +0 -0
- {matchbox_db-0.6.2 → matchbox_db-0.6.3.dev53}/trufflehog-exclude.txt +0 -0
|
@@ -7,8 +7,10 @@ env:
|
|
|
7
7
|
IMAGE_NAME: ${{ github.repository }}
|
|
8
8
|
|
|
9
9
|
jobs:
|
|
10
|
-
build
|
|
10
|
+
build:
|
|
11
11
|
runs-on: ubuntu-latest
|
|
12
|
+
outputs:
|
|
13
|
+
mb_version: ${{ steps.get_version.outputs.mb_version }}
|
|
12
14
|
permissions:
|
|
13
15
|
contents: write
|
|
14
16
|
packages: write
|
|
@@ -34,11 +36,21 @@ jobs:
|
|
|
34
36
|
python-version: "3.11"
|
|
35
37
|
|
|
36
38
|
- name: Extract development version
|
|
39
|
+
id: get_version
|
|
37
40
|
run: |
|
|
38
|
-
echo "
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
41
|
+
echo "mb_version=$(uv run --frozen python -m setuptools_scm | sed 's/+.*//')" \
|
|
42
|
+
>> "$GITHUB_OUTPUT"
|
|
43
|
+
|
|
44
|
+
- name: Build package
|
|
45
|
+
env:
|
|
46
|
+
SETUPTOOLS_SCM_PRETEND_VERSION: ${{ steps.get_version.outputs.mb_version }}
|
|
47
|
+
run: uv build
|
|
48
|
+
|
|
49
|
+
- name: Upload package artifacts
|
|
50
|
+
uses: actions/upload-artifact@v4
|
|
51
|
+
with:
|
|
52
|
+
name: package-dist
|
|
53
|
+
path: ./dist
|
|
42
54
|
|
|
43
55
|
- name: Extract tag metadata for Docker
|
|
44
56
|
id: meta
|
|
@@ -46,7 +58,7 @@ jobs:
|
|
|
46
58
|
with:
|
|
47
59
|
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
|
|
48
60
|
tags: |
|
|
49
|
-
type=raw,value=${{
|
|
61
|
+
type=raw,value=${{ steps.get_version.outputs.mb_version }}
|
|
50
62
|
type=raw,value=development
|
|
51
63
|
|
|
52
64
|
- name: Build and push Docker image
|
|
@@ -56,7 +68,7 @@ jobs:
|
|
|
56
68
|
file: src/matchbox/server/Dockerfile
|
|
57
69
|
push: true
|
|
58
70
|
build-args: |
|
|
59
|
-
MB_VERSION=${{
|
|
71
|
+
MB_VERSION=${{ steps.get_version.outputs.mb_version }}
|
|
60
72
|
tags: ${{ steps.meta.outputs.tags }}
|
|
61
73
|
|
|
62
74
|
- name: Delete existing development release if it exists
|
|
@@ -76,5 +88,23 @@ jobs:
|
|
|
76
88
|
|
|
77
89
|
May be unstable.
|
|
78
90
|
|
|
79
|
-
**Version:** ${{
|
|
91
|
+
**Version:** ${{ steps.get_version.outputs.mb_version }}
|
|
80
92
|
**Commit:** ${{ github.sha }}
|
|
93
|
+
|
|
94
|
+
deploy-package:
|
|
95
|
+
needs: build
|
|
96
|
+
runs-on: ubuntu-latest
|
|
97
|
+
permissions:
|
|
98
|
+
id-token: write
|
|
99
|
+
|
|
100
|
+
environment: pypi
|
|
101
|
+
|
|
102
|
+
steps:
|
|
103
|
+
- name: Download package artifacts
|
|
104
|
+
uses: actions/download-artifact@v4
|
|
105
|
+
with:
|
|
106
|
+
name: package-dist
|
|
107
|
+
path: ./dist
|
|
108
|
+
|
|
109
|
+
- name: Publish package to PyPI
|
|
110
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: matchbox-db
|
|
3
|
-
Version: 0.6.
|
|
3
|
+
Version: 0.6.3.dev53
|
|
4
4
|
Summary: A framework for orchestrating and comparing data linking and deduplication methodologies.
|
|
5
5
|
Author: Department for Business and Trade
|
|
6
6
|
Project-URL: Documentation, https://uktrade.github.io/matchbox/
|
|
@@ -8,6 +8,7 @@ from importlib.metadata import version
|
|
|
8
8
|
from io import BytesIO
|
|
9
9
|
|
|
10
10
|
import httpx
|
|
11
|
+
import polars as pl
|
|
11
12
|
from pyarrow import Table
|
|
12
13
|
from pyarrow.parquet import read_table
|
|
13
14
|
from tenacity import (
|
|
@@ -301,13 +302,14 @@ def get_resolution(
|
|
|
301
302
|
|
|
302
303
|
@http_retry
|
|
303
304
|
def set_data(
|
|
304
|
-
name: ResolutionName, data:
|
|
305
|
+
name: ResolutionName, data: pl.DataFrame, validate_type: ResolutionType
|
|
305
306
|
) -> UploadStatus:
|
|
306
307
|
"""Upload source hashes or model results to server."""
|
|
307
308
|
log_prefix = f"Resolution {name}"
|
|
308
309
|
logger.debug("Uploading results", prefix=log_prefix)
|
|
309
310
|
|
|
310
|
-
|
|
311
|
+
data_arrow = data.to_arrow() if isinstance(data, pl.DataFrame) else data
|
|
312
|
+
buffer = table_to_buffer(table=data_arrow)
|
|
311
313
|
|
|
312
314
|
# Initialise upload
|
|
313
315
|
metadata_res = CLIENT.post(
|
|
@@ -432,7 +434,7 @@ def download_eval_data() -> tuple[Table, Table]:
|
|
|
432
434
|
check_schema(SCHEMA_JUDGEMENTS, judgements.schema)
|
|
433
435
|
check_schema(SCHEMA_CLUSTER_EXPANSION, expansion.schema)
|
|
434
436
|
|
|
435
|
-
return judgements, expansion
|
|
437
|
+
return pl.from_arrow(judgements), pl.from_arrow(expansion)
|
|
436
438
|
|
|
437
439
|
|
|
438
440
|
# Admin
|
|
@@ -128,11 +128,7 @@ class EvalData:
|
|
|
128
128
|
|
|
129
129
|
threshold = int(threshold * 100)
|
|
130
130
|
|
|
131
|
-
root_leaf = (
|
|
132
|
-
results.root_leaf()
|
|
133
|
-
.rename({"root_id": "root", "leaf_id": "leaf"})
|
|
134
|
-
.to_arrow()
|
|
135
|
-
)
|
|
131
|
+
root_leaf = results.root_leaf().rename({"root_id": "root", "leaf_id": "leaf"})
|
|
136
132
|
return precision_recall([root_leaf], self.judgements, self.expansion)[0]
|
|
137
133
|
|
|
138
134
|
def pr_curve(self, results: Results) -> Figure:
|
|
@@ -140,7 +136,7 @@ class EvalData:
|
|
|
140
136
|
all_p = []
|
|
141
137
|
all_r = []
|
|
142
138
|
|
|
143
|
-
probs =
|
|
139
|
+
probs = results.probabilities
|
|
144
140
|
thresholds = probs.select("probability").unique().to_series()
|
|
145
141
|
for i, t in enumerate(sorted(thresholds)):
|
|
146
142
|
float_thresh = t / 100
|
|
@@ -224,8 +224,8 @@ class Model:
|
|
|
224
224
|
if for_validation:
|
|
225
225
|
self.results = Results(
|
|
226
226
|
probabilities=results,
|
|
227
|
-
left_root_leaf=self.left_query.leaf_id
|
|
228
|
-
right_root_leaf=self.right_query.leaf_id
|
|
227
|
+
left_root_leaf=self.left_query.leaf_id,
|
|
228
|
+
right_root_leaf=self.right_query.leaf_id
|
|
229
229
|
if right_df is not None
|
|
230
230
|
else None,
|
|
231
231
|
)
|
|
@@ -4,8 +4,6 @@ from collections.abc import Hashable
|
|
|
4
4
|
from typing import ParamSpec, TypeVar
|
|
5
5
|
|
|
6
6
|
import polars as pl
|
|
7
|
-
import pyarrow as pa
|
|
8
|
-
import pyarrow.compute as pc
|
|
9
7
|
from pydantic import ConfigDict
|
|
10
8
|
|
|
11
9
|
from matchbox.common.arrow import SCHEMA_RESULTS
|
|
@@ -33,14 +31,14 @@ class Results:
|
|
|
33
31
|
|
|
34
32
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
35
33
|
|
|
36
|
-
probabilities:
|
|
37
|
-
_clusters:
|
|
34
|
+
probabilities: pl.DataFrame
|
|
35
|
+
_clusters: pl.DataFrame | None = None
|
|
38
36
|
|
|
39
37
|
def __init__(
|
|
40
38
|
self,
|
|
41
|
-
probabilities:
|
|
42
|
-
left_root_leaf:
|
|
43
|
-
right_root_leaf:
|
|
39
|
+
probabilities: pl.DataFrame,
|
|
40
|
+
left_root_leaf: pl.DataFrame | None = None,
|
|
41
|
+
right_root_leaf: pl.DataFrame | None = None,
|
|
44
42
|
) -> None:
|
|
45
43
|
"""Initialises and validates results."""
|
|
46
44
|
self.left_root_leaf = None
|
|
@@ -51,61 +49,49 @@ class Results:
|
|
|
51
49
|
if right_root_leaf is not None:
|
|
52
50
|
self.right_root_leaf = right_root_leaf
|
|
53
51
|
|
|
54
|
-
if isinstance(probabilities, pl.DataFrame):
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
if not isinstance(probabilities, pa.Table):
|
|
58
|
-
raise ValueError("Expected a polars DataFrame or pyarrow Table.")
|
|
52
|
+
if not isinstance(probabilities, pl.DataFrame):
|
|
53
|
+
raise ValueError(f"Expected a polars DataFrame, got {type(probabilities)}.")
|
|
59
54
|
|
|
60
55
|
expected_fields = set(SCHEMA_RESULTS.names)
|
|
61
|
-
if set(probabilities.
|
|
56
|
+
if set(probabilities.columns) != expected_fields:
|
|
62
57
|
raise ValueError(
|
|
63
58
|
f"Expected {expected_fields}.\nFound {set(probabilities.column_names)}."
|
|
64
59
|
)
|
|
65
60
|
|
|
66
61
|
# Handle empty tables
|
|
67
|
-
if probabilities.
|
|
68
|
-
|
|
69
|
-
probabilities = pa.Table.from_arrays(
|
|
70
|
-
empty_arrays, names=[field.name for field in SCHEMA_RESULTS]
|
|
71
|
-
)
|
|
62
|
+
if probabilities.height == 0:
|
|
63
|
+
probabilities = pl.DataFrame(schema=pl.Schema(SCHEMA_RESULTS))
|
|
72
64
|
|
|
73
65
|
# Process probability field if it contains floating-point or decimal values
|
|
74
|
-
probability_type = probabilities["probability"].
|
|
75
|
-
if
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
options=pc.CastOptions(
|
|
81
|
-
target_type=pa.uint8(),
|
|
82
|
-
allow_float_truncate=True,
|
|
83
|
-
allow_decimal_truncate=True,
|
|
84
|
-
),
|
|
66
|
+
probability_type = probabilities["probability"].dtype
|
|
67
|
+
if probability_type.is_float() or probability_type.is_decimal():
|
|
68
|
+
probability_uint8 = pl.Series(
|
|
69
|
+
probabilities.select(
|
|
70
|
+
pl.col("probability").mul(100).round(0).cast(pl.UInt8)
|
|
71
|
+
)
|
|
85
72
|
)
|
|
86
73
|
|
|
87
74
|
# Check max value only if the table is not empty
|
|
88
|
-
max_prob =
|
|
89
|
-
if max_prob is not None and max_prob
|
|
90
|
-
p_max =
|
|
91
|
-
p_min =
|
|
75
|
+
max_prob = probability_uint8.max()
|
|
76
|
+
if max_prob is not None and max_prob > 100:
|
|
77
|
+
p_max = max_prob
|
|
78
|
+
p_min = probability_uint8.min()
|
|
92
79
|
raise ValueError(f"Probability range misconfigured: [{p_min}, {p_max}]")
|
|
93
80
|
|
|
94
|
-
probabilities = probabilities.
|
|
95
|
-
|
|
96
|
-
field_="probability",
|
|
97
|
-
column=probability_uint8,
|
|
81
|
+
probabilities = probabilities.replace_column(
|
|
82
|
+
probabilities.get_column_index("probability"), probability_uint8
|
|
98
83
|
)
|
|
99
84
|
|
|
100
|
-
|
|
85
|
+
# need schema in format recognised by polars
|
|
86
|
+
self.probabilities = probabilities.cast(pl.Schema(SCHEMA_RESULTS))
|
|
101
87
|
|
|
102
88
|
@property
|
|
103
89
|
def clusters(self):
|
|
104
90
|
"""Retrieve new clusters implied by these results."""
|
|
105
|
-
if
|
|
91
|
+
if self._clusters is None:
|
|
106
92
|
im = IntMap()
|
|
107
93
|
self._clusters = to_clusters(
|
|
108
|
-
results=self.probabilities, dtype=
|
|
94
|
+
results=self.probabilities, dtype=pl.Int64, hash_func=im.index
|
|
109
95
|
)
|
|
110
96
|
return self._clusters
|
|
111
97
|
|
|
@@ -146,7 +132,7 @@ class Results:
|
|
|
146
132
|
) -> pl.DataFrame:
|
|
147
133
|
"""Enriches the probability results with the source data."""
|
|
148
134
|
return self._merge_with_source_data(
|
|
149
|
-
base_df=
|
|
135
|
+
base_df=self.probabilities,
|
|
150
136
|
base_df_cols=["left_id", "right_id", "probability"],
|
|
151
137
|
left_data=left_data,
|
|
152
138
|
left_key=left_key,
|
|
@@ -165,7 +151,7 @@ class Results:
|
|
|
165
151
|
) -> pl.DataFrame:
|
|
166
152
|
"""Enriches the cluster results with the source data."""
|
|
167
153
|
return self._merge_with_source_data(
|
|
168
|
-
base_df=
|
|
154
|
+
base_df=self.clusters,
|
|
169
155
|
base_df_cols=["parent", "child", "probability"],
|
|
170
156
|
left_data=left_data,
|
|
171
157
|
left_key=left_key,
|
|
@@ -182,20 +168,19 @@ class Results:
|
|
|
182
168
|
"This Results object wasn't instantiated for validation features."
|
|
183
169
|
)
|
|
184
170
|
|
|
185
|
-
parents_root_leaf =
|
|
171
|
+
parents_root_leaf = self.left_root_leaf.select(["id", "leaf_id"])
|
|
186
172
|
if self.right_root_leaf is not None:
|
|
187
173
|
parents_root_leaf = pl.concat(
|
|
188
174
|
[
|
|
189
175
|
parents_root_leaf,
|
|
190
|
-
|
|
176
|
+
self.right_root_leaf.select(["id", "leaf_id"]),
|
|
191
177
|
]
|
|
192
178
|
)
|
|
193
179
|
|
|
194
180
|
# Go from parent-child (where child could be the root of another model)
|
|
195
181
|
# to root-leaf, where leaf is a source cluster ID
|
|
196
182
|
root_leaf_res = (
|
|
197
|
-
|
|
198
|
-
.rename({"parent": "root_id"})
|
|
183
|
+
self.clusters.rename({"parent": "root_id"})
|
|
199
184
|
.join(parents_root_leaf, left_on="child", right_on="id")
|
|
200
185
|
.select(["root_id", "leaf_id"])
|
|
201
186
|
.unique()
|
|
@@ -205,7 +190,7 @@ class Results:
|
|
|
205
190
|
unmerged_ids_rows = (
|
|
206
191
|
parents_root_leaf.select("id", "leaf_id")
|
|
207
192
|
.join(
|
|
208
|
-
|
|
193
|
+
self.clusters.select("child"),
|
|
209
194
|
left_on="id",
|
|
210
195
|
right_on="child",
|
|
211
196
|
how="anti",
|
|
@@ -4,7 +4,6 @@ from itertools import chain, combinations
|
|
|
4
4
|
from typing import TypeAlias
|
|
5
5
|
|
|
6
6
|
import polars as pl
|
|
7
|
-
from pyarrow import Table
|
|
8
7
|
from pydantic import BaseModel, Field, field_validator
|
|
9
8
|
|
|
10
9
|
from matchbox.common.graph import ModelResolutionName
|
|
@@ -39,7 +38,9 @@ class Judgement(BaseModel):
|
|
|
39
38
|
|
|
40
39
|
|
|
41
40
|
def precision_recall(
|
|
42
|
-
models_root_leaf: list[
|
|
41
|
+
models_root_leaf: list[pl.DataFrame],
|
|
42
|
+
judgements: pl.DataFrame,
|
|
43
|
+
expansion: pl.DataFrame,
|
|
43
44
|
) -> list[PrecisionRecall]:
|
|
44
45
|
"""From models and eval data, compute scores inspired by precision-recall.
|
|
45
46
|
|
|
@@ -78,10 +79,9 @@ def precision_recall(
|
|
|
78
79
|
for root_leaf in models_root_leaf:
|
|
79
80
|
if not len(root_leaf):
|
|
80
81
|
raise ValueError("Model data cannot be empty.")
|
|
81
|
-
leaves_per_set.append(set(root_leaf["leaf"].
|
|
82
|
+
leaves_per_set.append(set(root_leaf["leaf"].to_list()))
|
|
82
83
|
clusters = (
|
|
83
|
-
|
|
84
|
-
.group_by("root")
|
|
84
|
+
root_leaf.group_by("root")
|
|
85
85
|
.agg(pl.col("leaf").alias("leaves"))
|
|
86
86
|
.select("leaves")
|
|
87
87
|
.to_series()
|
|
@@ -93,7 +93,7 @@ def precision_recall(
|
|
|
93
93
|
pairs_per_model.append(model_pairs)
|
|
94
94
|
|
|
95
95
|
validation_pairs, validation_net_count, validation_leaves = process_judgements(
|
|
96
|
-
|
|
96
|
+
judgements, expansion
|
|
97
97
|
)
|
|
98
98
|
leaves_per_set.append(validation_leaves)
|
|
99
99
|
|
|
@@ -521,7 +521,7 @@ def generate_entities(
|
|
|
521
521
|
|
|
522
522
|
|
|
523
523
|
def probabilities_to_results_entities(
|
|
524
|
-
probabilities:
|
|
524
|
+
probabilities: pl.DataFrame,
|
|
525
525
|
left_clusters: tuple[ClusterEntity, ...],
|
|
526
526
|
right_clusters: tuple[ClusterEntity, ...] | None = None,
|
|
527
527
|
threshold: float | int = 0,
|
|
@@ -547,7 +547,7 @@ def probabilities_to_results_entities(
|
|
|
547
547
|
djs.add(entity)
|
|
548
548
|
|
|
549
549
|
# Add edges to the disjoint set
|
|
550
|
-
for record in probabilities.
|
|
550
|
+
for record in probabilities.to_dicts():
|
|
551
551
|
if record["probability"] >= threshold:
|
|
552
552
|
djs.union(
|
|
553
553
|
left_lookup[record["left_id"]],
|
|
@@ -76,12 +76,12 @@ add_model_class(MockDeduper)
|
|
|
76
76
|
add_model_class(MockLinker)
|
|
77
77
|
|
|
78
78
|
|
|
79
|
-
def component_report(all_nodes: list[Any], table:
|
|
79
|
+
def component_report(all_nodes: list[Any], table: pl.DataFrame) -> dict:
|
|
80
80
|
"""Fast reporting on connected components using rustworkx.
|
|
81
81
|
|
|
82
82
|
Args:
|
|
83
83
|
all_nodes: list of identities of inputs being matched
|
|
84
|
-
table:
|
|
84
|
+
table: Polars dataframe with 'left', 'right' columns
|
|
85
85
|
|
|
86
86
|
Returns:
|
|
87
87
|
dictionary containing basic component statistics
|
|
@@ -252,7 +252,7 @@ def generate_dummy_probabilities(
|
|
|
252
252
|
num_components: int,
|
|
253
253
|
total_rows: int | None = None,
|
|
254
254
|
seed: int = 42,
|
|
255
|
-
) ->
|
|
255
|
+
) -> pl.DataFrame:
|
|
256
256
|
"""Generate dummy Arrow probabilities data with guaranteed isolated components.
|
|
257
257
|
|
|
258
258
|
While much of the factory system uses generate_entity_probabilities, this function
|
|
@@ -269,7 +269,7 @@ def generate_dummy_probabilities(
|
|
|
269
269
|
seed: Random seed for reproducibility
|
|
270
270
|
|
|
271
271
|
Returns:
|
|
272
|
-
|
|
272
|
+
Polars dataframe with 'left_id', 'right_id', and 'probability' columns
|
|
273
273
|
"""
|
|
274
274
|
# Validate inputs
|
|
275
275
|
deduplicate = False
|
|
@@ -419,14 +419,9 @@ def generate_dummy_probabilities(
|
|
|
419
419
|
# Convert to arrays
|
|
420
420
|
lefts, rights, probs = zip(*all_edges, strict=True)
|
|
421
421
|
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
prob_array = pa.array(probs, type=pa.uint8())
|
|
426
|
-
|
|
427
|
-
return pa.table(
|
|
428
|
-
[left_array, right_array, prob_array],
|
|
429
|
-
names=["left_id", "right_id", "probability"],
|
|
422
|
+
return pl.DataFrame(
|
|
423
|
+
{"left_id": lefts, "right_id": rights, "probability": probs},
|
|
424
|
+
schema={"left_id": pl.UInt64, "right_id": pl.UInt64, "probability": pl.UInt8},
|
|
430
425
|
)
|
|
431
426
|
|
|
432
427
|
|
|
@@ -436,7 +431,7 @@ def generate_entity_probabilities(
|
|
|
436
431
|
source_entities: frozenset[SourceEntity],
|
|
437
432
|
prob_range: tuple[float, float] = (0.8, 1.0),
|
|
438
433
|
seed: int = 42,
|
|
439
|
-
) ->
|
|
434
|
+
) -> pl.DataFrame:
|
|
440
435
|
"""Generate probabilities that will recover entity relationships.
|
|
441
436
|
|
|
442
437
|
Compares ClusterEntity objects against ground truth SourceEntities by checking
|
|
@@ -530,27 +525,9 @@ def generate_entity_probabilities(
|
|
|
530
525
|
|
|
531
526
|
# If no edges were generated, return empty table with correct schema
|
|
532
527
|
if not edges:
|
|
533
|
-
return
|
|
534
|
-
[
|
|
535
|
-
pa.array([], type=pa.uint64()),
|
|
536
|
-
pa.array([], type=pa.uint64()),
|
|
537
|
-
pa.array([], type=pa.uint8()),
|
|
538
|
-
],
|
|
539
|
-
schema=SCHEMA_RESULTS,
|
|
540
|
-
)
|
|
528
|
+
return pl.DataFrame(schema=pl.Schema(SCHEMA_RESULTS))
|
|
541
529
|
|
|
542
|
-
|
|
543
|
-
lefts, rights, probs = zip(*edges, strict=False)
|
|
544
|
-
|
|
545
|
-
# Create PyArrow arrays
|
|
546
|
-
left_array = pa.array(lefts, type=pa.uint64())
|
|
547
|
-
right_array = pa.array(rights, type=pa.uint64())
|
|
548
|
-
prob_array = pa.array(probs, type=pa.uint8())
|
|
549
|
-
|
|
550
|
-
return pa.table(
|
|
551
|
-
[left_array, right_array, prob_array],
|
|
552
|
-
schema=SCHEMA_RESULTS,
|
|
553
|
-
)
|
|
530
|
+
return pl.DataFrame(edges, orient="row", schema=pl.Schema(SCHEMA_RESULTS))
|
|
554
531
|
|
|
555
532
|
|
|
556
533
|
class ModelTestkit(BaseModel):
|
|
@@ -565,7 +542,7 @@ class ModelTestkit(BaseModel):
|
|
|
565
542
|
right_data: pa.Table | None
|
|
566
543
|
right_query: Query | None
|
|
567
544
|
right_clusters: dict[int, ClusterEntity] | None
|
|
568
|
-
probabilities:
|
|
545
|
+
probabilities: pl.DataFrame
|
|
569
546
|
|
|
570
547
|
_entities: tuple[ClusterEntity, ...]
|
|
571
548
|
_threshold: int
|
|
@@ -6,6 +6,7 @@ from pathlib import Path
|
|
|
6
6
|
from typing import Any, Literal
|
|
7
7
|
|
|
8
8
|
import pyarrow as pa
|
|
9
|
+
from polars.testing import assert_frame_equal
|
|
9
10
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
10
11
|
from sqlalchemy import Engine
|
|
11
12
|
|
|
@@ -171,7 +172,9 @@ def create_dedupe_scenario(
|
|
|
171
172
|
|
|
172
173
|
# Add to backend and DAG
|
|
173
174
|
backend.insert_resolution(resolution=model_testkit.model.to_resolution())
|
|
174
|
-
backend.insert_model_data(
|
|
175
|
+
backend.insert_model_data(
|
|
176
|
+
name=name, results=model_testkit.probabilities.to_arrow()
|
|
177
|
+
)
|
|
175
178
|
dag.add_model(model_testkit)
|
|
176
179
|
|
|
177
180
|
return dag
|
|
@@ -216,7 +219,9 @@ def create_probabilistic_dedupe_scenario(
|
|
|
216
219
|
|
|
217
220
|
# Add to backend and DAG
|
|
218
221
|
backend.insert_resolution(resolution=model_testkit.model.to_resolution())
|
|
219
|
-
backend.insert_model_data(
|
|
222
|
+
backend.insert_model_data(
|
|
223
|
+
name=name, results=model_testkit.probabilities.to_arrow()
|
|
224
|
+
)
|
|
220
225
|
backend.set_model_truth(name=name, truth=50)
|
|
221
226
|
dag.add_model(model_testkit)
|
|
222
227
|
|
|
@@ -271,7 +276,9 @@ def create_link_scenario(
|
|
|
271
276
|
|
|
272
277
|
# Add to backend and DAG
|
|
273
278
|
backend.insert_resolution(resolution=crn_duns_model.model.to_resolution())
|
|
274
|
-
backend.insert_model_data(
|
|
279
|
+
backend.insert_model_data(
|
|
280
|
+
name=crn_duns_name, results=crn_duns_model.probabilities.to_arrow()
|
|
281
|
+
)
|
|
275
282
|
dag.add_model(crn_duns_model)
|
|
276
283
|
|
|
277
284
|
# Create CRN-CDMS link
|
|
@@ -299,7 +306,9 @@ def create_link_scenario(
|
|
|
299
306
|
|
|
300
307
|
# Add to backend and DAG
|
|
301
308
|
backend.insert_resolution(resolution=crn_cdms_model.model.to_resolution())
|
|
302
|
-
backend.insert_model_data(
|
|
309
|
+
backend.insert_model_data(
|
|
310
|
+
name=crn_cdms_name, results=crn_cdms_model.probabilities.to_arrow()
|
|
311
|
+
)
|
|
303
312
|
backend.set_model_truth(name=crn_cdms_name, truth=75)
|
|
304
313
|
dag.add_model(crn_cdms_model)
|
|
305
314
|
|
|
@@ -344,7 +353,7 @@ def create_link_scenario(
|
|
|
344
353
|
# Add to backend and DAG
|
|
345
354
|
backend.insert_resolution(resolution=final_join_model.model.to_resolution())
|
|
346
355
|
backend.insert_model_data(
|
|
347
|
-
name=final_join_name, results=final_join_model.probabilities
|
|
356
|
+
name=final_join_name, results=final_join_model.probabilities.to_arrow()
|
|
348
357
|
)
|
|
349
358
|
dag.add_model(final_join_model)
|
|
350
359
|
|
|
@@ -422,15 +431,17 @@ def create_alt_dedupe_scenario(
|
|
|
422
431
|
seed=seed,
|
|
423
432
|
)
|
|
424
433
|
|
|
425
|
-
assert model_testkit1.probabilities
|
|
426
|
-
|
|
434
|
+
assert len(model_testkit1.probabilities) > 0
|
|
435
|
+
assert_frame_equal(model_testkit1.probabilities, model_testkit2.probabilities)
|
|
427
436
|
|
|
428
437
|
for model, threshold in ((model_testkit1, 50), (model_testkit2, 75)):
|
|
429
438
|
model.threshold = threshold
|
|
430
439
|
|
|
431
440
|
# Add both models to backend and DAG
|
|
432
441
|
backend.insert_resolution(resolution=model.model.to_resolution())
|
|
433
|
-
backend.insert_model_data(
|
|
442
|
+
backend.insert_model_data(
|
|
443
|
+
name=model.name, results=model.probabilities.to_arrow()
|
|
444
|
+
)
|
|
434
445
|
backend.set_model_truth(name=model.name, truth=threshold)
|
|
435
446
|
|
|
436
447
|
# Add to DAG
|
|
@@ -509,7 +520,7 @@ def create_convergent_scenario(
|
|
|
509
520
|
seed=seed,
|
|
510
521
|
)
|
|
511
522
|
|
|
512
|
-
assert model_testkit.probabilities
|
|
523
|
+
assert len(model_testkit.probabilities) > 0
|
|
513
524
|
|
|
514
525
|
# Add to DAG
|
|
515
526
|
dag.add_model(model_testkit)
|
|
@@ -202,7 +202,7 @@ class LinkedSourcesTestkit(BaseModel):
|
|
|
202
202
|
|
|
203
203
|
def diff_results(
|
|
204
204
|
self,
|
|
205
|
-
probabilities:
|
|
205
|
+
probabilities: pl.DataFrame,
|
|
206
206
|
sources: list[SourceResolutionName],
|
|
207
207
|
left_clusters: tuple[ClusterEntity, ...],
|
|
208
208
|
right_clusters: tuple[ClusterEntity, ...] | None = None,
|