hsds-record-matcher 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hsds_record_matcher-1.0.0/.gitignore +233 -0
- hsds_record_matcher-1.0.0/PKG-INFO +249 -0
- hsds_record_matcher-1.0.0/README.md +218 -0
- hsds_record_matcher-1.0.0/pyproject.toml +101 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/__init__.py +5 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/config/__init__.py +21 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/config/entity_resolution_run_config.py +278 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/__init__.py +5 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/apply_mitigation.py +646 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/clean_entities.py +431 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/cluster_pairs.py +383 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/dataframe_utils.py +88 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/domain_utils.py +231 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/evidence_policy.py +36 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/feature_extractor.py +1031 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/generate_candidates.py +891 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/materialize_review_queue.py +124 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/ml_inference.py +211 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/nlp/__init__.py +6 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/nlp/algorithms.py +83 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/nlp/safeguards.py +41 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/nlp/scoring.py +43 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/nlp/types.py +24 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/pair_tiering.py +41 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/pipeline.py +282 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/prepare_persistence_artifacts.py +99 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/score_candidates.py +1123 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/taxonomy_utils.py +249 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/training_feature_store.py +1062 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/training_features.py +156 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/training_schema.py +203 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/dagster/__init__.py +5 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/dagster/components/__init__.py +7 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/dagster/components/entity_resolution_component.py +212 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/definitions.py +11 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/defs/__init__.py +1 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/observability/__init__.py +6 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/observability/progress.py +133 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/observability/tracer.py +153 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/py.typed +0 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/tf_idf_models/tfidf_vectorizer_organization.joblib +0 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/tf_idf_models/tfidf_vectorizer_service.joblib +0 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/types/__init__.py +69 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/types/artifact_rows.py +65 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/types/contracts.py +97 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/types/domain.py +15 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/types/frames.py +221 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/types/rows/__init__.py +17 -0
- hsds_record_matcher-1.0.0/src/hsds_entity_resolution/types/rows/clean_entity_rows.py +85 -0
- hsds_record_matcher-1.0.0/tests/__init__.py +13 -0
- hsds_record_matcher-1.0.0/tests/conftest.py +13 -0
- hsds_record_matcher-1.0.0/tests/contract/common_experiment_schema_contract.json +98 -0
- hsds_record_matcher-1.0.0/tests/contract/test_artifact_to_model_mapping.py +388 -0
- hsds_record_matcher-1.0.0/tests/contract/test_common_experiment_schema_contract.py +176 -0
- hsds_record_matcher-1.0.0/tests/contract/test_job_registry.py +50 -0
- hsds_record_matcher-1.0.0/tests/contract/test_sql_templates.py +101 -0
- hsds_record_matcher-1.0.0/tests/contract/test_training_data_schema_contract.py +53 -0
- hsds_record_matcher-1.0.0/tests/contract/training_data_schema_contract.json +206 -0
- hsds_record_matcher-1.0.0/tests/fixtures/__init__.py +0 -0
- hsds_record_matcher-1.0.0/tests/fixtures/burrell_sph_pair.json +136 -0
- hsds_record_matcher-1.0.0/tests/integration/test_adapter_flow.py +169 -0
- hsds_record_matcher-1.0.0/tests/integration/test_consumer_defs.py +15 -0
- hsds_record_matcher-1.0.0/tests/test_apply_mitigation_and_review_queue.py +432 -0
- hsds_record_matcher-1.0.0/tests/test_clean_entities.py +425 -0
- hsds_record_matcher-1.0.0/tests/test_cluster_pairs.py +91 -0
- hsds_record_matcher-1.0.0/tests/test_consumer_embedding_freshness.py +194 -0
- hsds_record_matcher-1.0.0/tests/test_cut_training_dataset.py +108 -0
- hsds_record_matcher-1.0.0/tests/test_dagster_component_and_defs.py +74 -0
- hsds_record_matcher-1.0.0/tests/test_dataframe_utils.py +41 -0
- hsds_record_matcher-1.0.0/tests/test_deduplication_common_models.py +127 -0
- hsds_record_matcher-1.0.0/tests/test_domain_taxonomy_utils.py +670 -0
- hsds_record_matcher-1.0.0/tests/test_embedding_adapter.py +316 -0
- hsds_record_matcher-1.0.0/tests/test_entity_resolution_pipeline.py +367 -0
- hsds_record_matcher-1.0.0/tests/test_generate_candidates_prefilter.py +462 -0
- hsds_record_matcher-1.0.0/tests/test_incremental_scenarios.py +1637 -0
- hsds_record_matcher-1.0.0/tests/test_ml_inference_tfidf_path_resolution.py +51 -0
- hsds_record_matcher-1.0.0/tests/test_ml_signal_overrides.py +309 -0
- hsds_record_matcher-1.0.0/tests/test_nlp_scoring.py +341 -0
- hsds_record_matcher-1.0.0/tests/test_persistence_executor.py +88 -0
- hsds_record_matcher-1.0.0/tests/test_real_data_scenarios.py +800 -0
- hsds_record_matcher-1.0.0/tests/test_run_config_validation.py +37 -0
- hsds_record_matcher-1.0.0/tests/test_score_candidates_ml_inference.py +1140 -0
- hsds_record_matcher-1.0.0/tests/test_snowflake_embedding_store_queries.py +71 -0
- hsds_record_matcher-1.0.0/tests/test_source_loader_helpers.py +217 -0
- hsds_record_matcher-1.0.0/tests/test_staging_retention_policy.py +112 -0
- hsds_record_matcher-1.0.0/tests/test_taxonomy_embedding_loader.py +277 -0
- hsds_record_matcher-1.0.0/tests/test_training_feature_store.py +86 -0
- hsds_record_matcher-1.0.0/tests/test_types_modules.py +19 -0
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
#Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
#uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
#poetry.lock
|
|
109
|
+
#poetry.toml
|
|
110
|
+
|
|
111
|
+
# pdm
|
|
112
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
113
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
114
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
115
|
+
#pdm.lock
|
|
116
|
+
#pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# pixi
|
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
122
|
+
#pixi.lock
|
|
123
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
|
124
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
125
|
+
.pixi
|
|
126
|
+
|
|
127
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
128
|
+
__pypackages__/
|
|
129
|
+
|
|
130
|
+
# Celery stuff
|
|
131
|
+
celerybeat-schedule
|
|
132
|
+
celerybeat.pid
|
|
133
|
+
|
|
134
|
+
# SageMath parsed files
|
|
135
|
+
*.sage.py
|
|
136
|
+
|
|
137
|
+
# Environments
|
|
138
|
+
.env
|
|
139
|
+
.envrc
|
|
140
|
+
.venv
|
|
141
|
+
env/
|
|
142
|
+
venv/
|
|
143
|
+
ENV/
|
|
144
|
+
env.bak/
|
|
145
|
+
venv.bak/
|
|
146
|
+
|
|
147
|
+
# Spyder project settings
|
|
148
|
+
.spyderproject
|
|
149
|
+
.spyproject
|
|
150
|
+
|
|
151
|
+
# Rope project settings
|
|
152
|
+
.ropeproject
|
|
153
|
+
|
|
154
|
+
# mkdocs documentation
|
|
155
|
+
/site
|
|
156
|
+
|
|
157
|
+
# mypy
|
|
158
|
+
.mypy_cache/
|
|
159
|
+
.dmypy.json
|
|
160
|
+
dmypy.json
|
|
161
|
+
|
|
162
|
+
# Pyre type checker
|
|
163
|
+
.pyre/
|
|
164
|
+
|
|
165
|
+
# pytype static type analyzer
|
|
166
|
+
.pytype/
|
|
167
|
+
|
|
168
|
+
# Cython debug symbols
|
|
169
|
+
cython_debug/
|
|
170
|
+
|
|
171
|
+
# PyCharm
|
|
172
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
173
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
174
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
175
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
176
|
+
#.idea/
|
|
177
|
+
|
|
178
|
+
# Abstra
|
|
179
|
+
# Abstra is an AI-powered process automation framework.
|
|
180
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
181
|
+
# Learn more at https://abstra.io/docs
|
|
182
|
+
.abstra/
|
|
183
|
+
|
|
184
|
+
# Visual Studio Code
|
|
185
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
186
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
187
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
188
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
189
|
+
# .vscode/
|
|
190
|
+
|
|
191
|
+
# Ruff stuff:
|
|
192
|
+
.ruff_cache/
|
|
193
|
+
|
|
194
|
+
.uv-cache/
|
|
195
|
+
.cursor/
|
|
196
|
+
|
|
197
|
+
# PyPI configuration file
|
|
198
|
+
.pypirc
|
|
199
|
+
|
|
200
|
+
# Marimo
|
|
201
|
+
marimo/_static/
|
|
202
|
+
marimo/_lsp/
|
|
203
|
+
__marimo__/
|
|
204
|
+
|
|
205
|
+
# Streamlit
|
|
206
|
+
.streamlit/secrets.toml
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
# Ignore Cursor AI rules
|
|
210
|
+
.cursor/rules/codacy.mdc
|
|
211
|
+
|
|
212
|
+
# Codacy
|
|
213
|
+
.codacy/AGENTS.md
|
|
214
|
+
|
|
215
|
+
# dbt — target/ contains per-run artifacts; never commit.
|
|
216
|
+
consumer/dbt/target/
|
|
217
|
+
hsds_dbt_macros/target/
|
|
218
|
+
|
|
219
|
+
.dg/
|
|
220
|
+
|
|
221
|
+
consumer/
|
|
222
|
+
hsds_dbt_macros/
|
|
223
|
+
scripts/
|
|
224
|
+
|
|
225
|
+
HSDS_DATA_MODEL.md
|
|
226
|
+
_hsds_table_desc/
|
|
227
|
+
|
|
228
|
+
.agents/skills/snowsql-investigation/
|
|
229
|
+
.codacy/
|
|
230
|
+
docs/
|
|
231
|
+
!docs/
|
|
232
|
+
docs/*
|
|
233
|
+
!docs/deduplication-schema-audit.md
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hsds-record-matcher
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Reusable Dagster component package for HSDS entity resolution workflows.
|
|
5
|
+
Project-URL: Homepage, https://github.com/211-Connect/hsds-entity-resolution
|
|
6
|
+
Project-URL: Repository, https://github.com/211-Connect/hsds-entity-resolution
|
|
7
|
+
Project-URL: Issues, https://github.com/211-Connect/hsds-entity-resolution/issues
|
|
8
|
+
Keywords: dagster,dbt,deduplication,entity-resolution,hsds
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Typing :: Typed
|
|
17
|
+
Requires-Python: <3.15,>=3.10
|
|
18
|
+
Requires-Dist: dagster-dbt>=0.28
|
|
19
|
+
Requires-Dist: dagster-snowflake>=0.28.17
|
|
20
|
+
Requires-Dist: dagster==1.12.17
|
|
21
|
+
Requires-Dist: dbt-snowflake>=1.8
|
|
22
|
+
Requires-Dist: huggingface-hub>=1.5.0
|
|
23
|
+
Requires-Dist: joblib<2,>=1.3
|
|
24
|
+
Requires-Dist: numpy<3,>=1.26
|
|
25
|
+
Requires-Dist: polars<2,>=1.18
|
|
26
|
+
Requires-Dist: pydantic<3,>=2
|
|
27
|
+
Requires-Dist: scikit-learn<2,>=1.3
|
|
28
|
+
Requires-Dist: tenacity>=9.1.4
|
|
29
|
+
Requires-Dist: tldextract>=5.3.1
|
|
30
|
+
Description-Content-Type: text/markdown
|
|
31
|
+
|
|
32
|
+
# hsds_entity_resolution
|
|
33
|
+
|
|
34
|
+
`hsds_entity_resolution` helps community organizations deduplicate HSDS data and orchestrate
|
|
35
|
+
continual checks that support long-running community data sharing partnerships.
|
|
36
|
+
|
|
37
|
+
## Project goals
|
|
38
|
+
|
|
39
|
+
- Improve entity matching quality across partner-provided HSDS datasets
|
|
40
|
+
- Reduce duplicate records that block trusted cross-organization coordination
|
|
41
|
+
- Run repeatable validation and quality checks as data pipelines evolve
|
|
42
|
+
- Support sustainable, long-term community data sharing operations
|
|
43
|
+
|
|
44
|
+
## Tooling
|
|
45
|
+
|
|
46
|
+
- **Dagster (`dagster`, `dg`)**: pipeline orchestration, definitions, and local development UI
|
|
47
|
+
- **dbt (Snowflake adapter)**: SQL management for source denormalization and incremental persistence
|
|
48
|
+
- **dagster-dbt**: Dagster integration that invokes dbt staging and mart phases inside jobs
|
|
49
|
+
- **Pydantic v2**: typed data models and validation for HSDS entities and pipeline I/O
|
|
50
|
+
- **Ruff**: Python formatting and linting for fast local feedback
|
|
51
|
+
- **Pyright**: static type checking for `src/` and `tests/`
|
|
52
|
+
- **Codacy CLI (`.codacy/cli.sh`)**: static analysis and security scanning (Pylint, Semgrep,
|
|
53
|
+
Lizard, Trivy)
|
|
54
|
+
- **uv**: dependency and virtual environment management
|
|
55
|
+
|
|
56
|
+
## Component Package Layout
|
|
57
|
+
|
|
58
|
+
Reusable Dagster components live in:
|
|
59
|
+
|
|
60
|
+
- `src/hsds_entity_resolution/dagster/components/`
|
|
61
|
+
|
|
62
|
+
Core library code should live outside the Dagster adapter layer:
|
|
63
|
+
|
|
64
|
+
- `src/hsds_entity_resolution/core/`
|
|
65
|
+
- `src/hsds_entity_resolution/types/`
|
|
66
|
+
- `src/hsds_entity_resolution/config/`
|
|
67
|
+
|
|
68
|
+
The canonical public component entry point is:
|
|
69
|
+
|
|
70
|
+
- `hsds_entity_resolution.dagster.components.EntityResolutionComponent`
|
|
71
|
+
|
|
72
|
+
The module that contains this component is exported through the Dagster registry entry-point group:
|
|
73
|
+
|
|
74
|
+
- `dagster_dg_cli.registry_modules`
|
|
75
|
+
|
|
76
|
+
## Getting started
|
|
77
|
+
|
|
78
|
+
### Install dependencies
|
|
79
|
+
|
|
80
|
+
Ensure [`uv`](https://docs.astral.sh/uv/) is installed following the
|
|
81
|
+
[official documentation](https://docs.astral.sh/uv/getting-started/installation/), then run:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
uv sync
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Run the project
|
|
88
|
+
|
|
89
|
+
This repo has two entry points depending on what you are working on:
|
|
90
|
+
|
|
91
|
+
| Goal | Command |
|
|
92
|
+
| --- | --- |
|
|
93
|
+
| Run the IL211 pipeline (jobs, schedules, Snowflake) | `uv run dagster dev -m consumer.definitions` |
|
|
94
|
+
| Develop the `EntityResolutionComponent` library | `dg dev` |
|
|
95
|
+
|
|
96
|
+
**For pipeline development and debugging, always use:**
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
uv run dagster dev -m consumer.definitions
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
Then open [http://localhost:3000](http://localhost:3000) and go to
|
|
103
|
+
**Deployment → consumer.definitions → Jobs** to find:
|
|
104
|
+
|
|
105
|
+
- `entity_resolution__il211_regional__organization`
|
|
106
|
+
- `entity_resolution__il211_regional__service`
|
|
107
|
+
|
|
108
|
+
Use the **Launchpad** tab on either job to configure a run (e.g. restrict to one
|
|
109
|
+
`source_schema` for faster local testing) and launch it manually.
|
|
110
|
+
|
|
111
|
+
`dg dev` loads the reusable `EntityResolutionComponent` package — it intentionally
|
|
112
|
+
has no jobs or assets of its own and is only useful when working on the component
|
|
113
|
+
library itself.
|
|
114
|
+
|
|
115
|
+
## dbt project (`consumer/dbt/`)
|
|
116
|
+
|
|
117
|
+
The pipeline uses a dbt project to manage all complex SQL in one place. It runs
|
|
118
|
+
in two phases inside every Dagster job:
|
|
119
|
+
|
|
120
|
+
| Phase | dbt select | What it does |
|
|
121
|
+
| --- | --- | --- |
|
|
122
|
+
| **Staging** (before Python ER) | `--select staging` | Materializes `stg_service_denormalized` and `stg_organization_denormalized` tables in `DEDUPLICATION.ER_STAGING` from raw HSDS tables in `NORSE_STAGING` |
|
|
123
|
+
| **Marts** (after Python ER stages artifacts) | `--select marts` | Incremental merge models upsert artifact staging rows into the final output tables in `DEDUPLICATION.ER_RUNTIME` |
|
|
124
|
+
|
|
125
|
+
### Do you need to run any dbt commands before startup?
|
|
126
|
+
|
|
127
|
+
**No.** `dagster dev -m consumer.definitions` starts cleanly — the `DbtCliResource`
|
|
128
|
+
is just a resource handle at startup and triggers no dbt execution. dbt parses
|
|
129
|
+
and compiles the project automatically when each job phase runs.
|
|
130
|
+
|
|
131
|
+
No external dbt packages are used, so `dbt deps` is never required. `dbt build`
|
|
132
|
+
is not used; Dagster controls execution order via the phased job structure.
|
|
133
|
+
|
|
134
|
+
### Useful sanity check during development
|
|
135
|
+
|
|
136
|
+
After editing the dbt project, validate syntax and macro references without
|
|
137
|
+
hitting Snowflake:
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
cd consumer/dbt
|
|
141
|
+
uv run dbt parse --profiles-dir .
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
This confirms all Jinja loops compile, macro calls are valid, and `sources.yml`
|
|
145
|
+
references are consistent.
|
|
146
|
+
|
|
147
|
+
### Required environment variables for dbt
|
|
148
|
+
|
|
149
|
+
| Variable | Default | Purpose |
|
|
150
|
+
| --- | --- | --- |
|
|
151
|
+
| `SNOWFLAKE_ACCOUNT` | — | Snowflake account identifier |
|
|
152
|
+
| `SNOWFLAKE_USERNAME` | — | Snowflake username |
|
|
153
|
+
| `SNOWFLAKE_PASSWORD` | — | Snowflake password (or use `SNOWFLAKE_PRIVATE_KEY_PATH`) |
|
|
154
|
+
| `SNOWFLAKE_ROLE` | `SYSADMIN` | Snowflake role |
|
|
155
|
+
| `SNOWFLAKE_WAREHOUSE` | — | Snowflake virtual warehouse |
|
|
156
|
+
| `ER_TARGET_DATABASE` | `DEDUPLICATION` | Database for runtime and reconciliation tables |
|
|
157
|
+
| `ER_RUNTIME_SCHEMA` | `ER_RUNTIME` | Schema for mart output tables |
|
|
158
|
+
| `ER_INCREMENTAL_STATE_SCHEMA` | `ER_INCREMENTAL_STATE` | Schema for incremental state tables |
|
|
159
|
+
| `ER_STAGING_DATABASE` | `DEDUPLICATION` | Database for persistent staging tables |
|
|
160
|
+
| `ER_STAGING_SCHEMA` | `ER_STAGING` | Schema for persistent staging tables |
|
|
161
|
+
| `ER_HSDS_DATABASE` | `NORSE_STAGING` | Source HSDS database |
|
|
162
|
+
|
|
163
|
+
### dbt project structure
|
|
164
|
+
|
|
165
|
+
```
|
|
166
|
+
consumer/dbt/
|
|
167
|
+
dbt_project.yml — project config, model materialization defaults
|
|
168
|
+
profiles.yml — Snowflake connection (env-var based; DbtCliResource overrides in prod)
|
|
169
|
+
models/
|
|
170
|
+
sources.yml — er_staging source definitions with not_null/unique tests
|
|
171
|
+
schema.yml — schema tests for staging and mart models
|
|
172
|
+
staging/
|
|
173
|
+
stg_service_denormalized.sql — multi-tenant UNION over target_schemas
|
|
174
|
+
stg_organization_denormalized.sql — multi-tenant UNION over target_schemas
|
|
175
|
+
marts/
|
|
176
|
+
denormalized_service_cache.sql
|
|
177
|
+
denormalized_organization_cache.sql
|
|
178
|
+
deduplication_run.sql
|
|
179
|
+
duplicate_pairs.sql
|
|
180
|
+
duplicate_pair_scores.sql
|
|
181
|
+
duplicate_reasons.sql
|
|
182
|
+
mitigated_pairs.sql
|
|
183
|
+
duplicate_clusters.sql
|
|
184
|
+
duplicate_cluster_pairs.sql
|
|
185
|
+
macros/
|
|
186
|
+
taxonomy_rollup.sql — ARRAY_AGG of taxonomy objects for service or org
|
|
187
|
+
location_rollup_service.sql — SAL → LOCATION → ADDRESS for services
|
|
188
|
+
location_rollup_org.sql — LOCATION.ORGANIZATION_ID → ADDRESS for orgs
|
|
189
|
+
phone_rollup_service.sql — 3-path phone UNION for services
|
|
190
|
+
phone_rollup_org.sql — 4-path phone UNION for organizations
|
|
191
|
+
service_rollup.sql — org's services with nested taxonomy codes
|
|
192
|
+
service_contact_rollup.sql — service-level email/website rollup to org
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
## Contributing
|
|
196
|
+
|
|
197
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for pull request requirements, quality checks, and review
|
|
198
|
+
expectations.
|
|
199
|
+
|
|
200
|
+
## Additional Docs
|
|
201
|
+
|
|
202
|
+
- [DEDUPLICATION schema audit](docs/deduplication-schema-audit.md)
|
|
203
|
+
- [Review interface write path](docs/review-interface-write-path.md)
|
|
204
|
+
- [Training and tuning notes](docs/training-and-tuning.md)
|
|
205
|
+
|
|
206
|
+
## Using This In Another Dagster Repo
|
|
207
|
+
|
|
208
|
+
1. Publish or install this package (for example: `pip install hsds-record-matcher`).
|
|
209
|
+
2. Confirm discovery in the target environment:
|
|
210
|
+
|
|
211
|
+
```bash
|
|
212
|
+
dg list components --package hsds_entity_resolution
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
3. Use the component key in YAML:
|
|
216
|
+
|
|
217
|
+
```yaml
|
|
218
|
+
type: hsds_entity_resolution.dagster.components.EntityResolutionComponent
|
|
219
|
+
attributes: {}
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
## Publishing
|
|
223
|
+
|
|
224
|
+
This package is set up to publish to PyPI from GitHub Actions via Trusted Publishing.
|
|
225
|
+
|
|
226
|
+
### PyPI Trusted Publisher settings
|
|
227
|
+
|
|
228
|
+
Whether you are registering a pending publisher (the project does not exist on PyPI yet) or adding a publisher to an existing project, use:
|
|
229
|
+
|
|
230
|
+
- PyPI project name: `hsds-record-matcher`
|
|
231
|
+
- Owner: `211-Connect`
|
|
232
|
+
- Repository name: `hsds-entity-resolution`
|
|
233
|
+
- Workflow name: `publish.yml`
|
|
234
|
+
- Environment name: `pypi`
|
|
235
|
+
|
|
236
|
+
The repository name field should be only the repository name, not `owner/repo`.
|
|
237
|
+
|
|
238
|
+
The distribution name on PyPI is independent from the import path in Python:
|
|
239
|
+
|
|
240
|
+
- Install name: `hsds-record-matcher`
|
|
241
|
+
- Import path: `hsds_entity_resolution`
|
|
242
|
+
|
|
243
|
+
### Release flow
|
|
244
|
+
|
|
245
|
+
1. Update `version` in `pyproject.toml`.
|
|
246
|
+
2. Merge or push that change to `main`.
|
|
247
|
+
3. GitHub Actions will build the wheel and sdist, validate them with `twine check`, and publish to PyPI through the `pypi` environment if the version changed.
|
|
248
|
+
|
|
249
|
+
You can also run the publish workflow manually with `workflow_dispatch`.
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
# hsds_entity_resolution
|
|
2
|
+
|
|
3
|
+
`hsds_entity_resolution` helps community organizations deduplicate HSDS data and orchestrate
|
|
4
|
+
continual checks that support long-running community data sharing partnerships.
|
|
5
|
+
|
|
6
|
+
## Project goals
|
|
7
|
+
|
|
8
|
+
- Improve entity matching quality across partner-provided HSDS datasets
|
|
9
|
+
- Reduce duplicate records that block trusted cross-organization coordination
|
|
10
|
+
- Run repeatable validation and quality checks as data pipelines evolve
|
|
11
|
+
- Support sustainable, long-term community data sharing operations
|
|
12
|
+
|
|
13
|
+
## Tooling
|
|
14
|
+
|
|
15
|
+
- **Dagster (`dagster`, `dg`)**: pipeline orchestration, definitions, and local development UI
|
|
16
|
+
- **dbt (Snowflake adapter)**: SQL management for source denormalization and incremental persistence
|
|
17
|
+
- **dagster-dbt**: Dagster integration that invokes dbt staging and mart phases inside jobs
|
|
18
|
+
- **Pydantic v2**: typed data models and validation for HSDS entities and pipeline I/O
|
|
19
|
+
- **Ruff**: Python formatting and linting for fast local feedback
|
|
20
|
+
- **Pyright**: static type checking for `src/` and `tests/`
|
|
21
|
+
- **Codacy CLI (`.codacy/cli.sh`)**: static analysis and security scanning (Pylint, Semgrep,
|
|
22
|
+
Lizard, Trivy)
|
|
23
|
+
- **uv**: dependency and virtual environment management
|
|
24
|
+
|
|
25
|
+
## Component Package Layout
|
|
26
|
+
|
|
27
|
+
Reusable Dagster components live in:
|
|
28
|
+
|
|
29
|
+
- `src/hsds_entity_resolution/dagster/components/`
|
|
30
|
+
|
|
31
|
+
Core library code should live outside the Dagster adapter layer:
|
|
32
|
+
|
|
33
|
+
- `src/hsds_entity_resolution/core/`
|
|
34
|
+
- `src/hsds_entity_resolution/types/`
|
|
35
|
+
- `src/hsds_entity_resolution/config/`
|
|
36
|
+
|
|
37
|
+
The canonical public component entry point is:
|
|
38
|
+
|
|
39
|
+
- `hsds_entity_resolution.dagster.components.EntityResolutionComponent`
|
|
40
|
+
|
|
41
|
+
This module is exported through the Dagster registry entry-point group:
|
|
42
|
+
|
|
43
|
+
- `dagster_dg_cli.registry_modules`
|
|
44
|
+
|
|
45
|
+
## Getting started
|
|
46
|
+
|
|
47
|
+
### Install dependencies
|
|
48
|
+
|
|
49
|
+
Ensure [`uv`](https://docs.astral.sh/uv/) is installed following the
|
|
50
|
+
[official documentation](https://docs.astral.sh/uv/getting-started/installation/), then run:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
uv sync
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Run the project
|
|
57
|
+
|
|
58
|
+
This repo has two entry points depending on what you are working on:
|
|
59
|
+
|
|
60
|
+
| Goal | Command |
|
|
61
|
+
| --- | --- |
|
|
62
|
+
| Run the IL211 pipeline (jobs, schedules, Snowflake) | `uv run dagster dev -m consumer.definitions` |
|
|
63
|
+
| Develop the `EntityResolutionComponent` library | `dg dev` |
|
|
64
|
+
|
|
65
|
+
**For pipeline development and debugging, always use:**
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
uv run dagster dev -m consumer.definitions
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Then open [http://localhost:3000](http://localhost:3000) and go to
|
|
72
|
+
**Deployment → consumer.definitions → Jobs** to find:
|
|
73
|
+
|
|
74
|
+
- `entity_resolution__il211_regional__organization`
|
|
75
|
+
- `entity_resolution__il211_regional__service`
|
|
76
|
+
|
|
77
|
+
Use the **Launchpad** tab on either job to configure a run (e.g. restrict to one
|
|
78
|
+
`source_schema` for faster local testing) and launch it manually.
|
|
79
|
+
|
|
80
|
+
`dg dev` loads the reusable `EntityResolutionComponent` package — it intentionally
|
|
81
|
+
has no jobs or assets of its own and is only useful when working on the component
|
|
82
|
+
library itself.
|
|
83
|
+
|
|
84
|
+
## dbt project (`consumer/dbt/`)
|
|
85
|
+
|
|
86
|
+
The pipeline uses a dbt project to manage all complex SQL in one place. It runs
|
|
87
|
+
in two phases inside every Dagster job:
|
|
88
|
+
|
|
89
|
+
| Phase | dbt select | What it does |
|
|
90
|
+
| --- | --- | --- |
|
|
91
|
+
| **Staging** (before Python ER) | `--select staging` | Materializes `stg_service_denormalized` and `stg_organization_denormalized` tables in `DEDUPLICATION.ER_STAGING` from raw HSDS tables in `NORSE_STAGING` |
|
|
92
|
+
| **Marts** (after Python ER stages artifacts) | `--select marts` | Incremental merge models upsert artifact staging rows into the final output tables in `DEDUPLICATION.ER_RUNTIME` |
|
|
93
|
+
|
|
94
|
+
### Do you need to run any dbt commands before startup?
|
|
95
|
+
|
|
96
|
+
**No.** `dagster dev -m consumer.definitions` starts cleanly — the `DbtCliResource`
|
|
97
|
+
is just a resource handle at startup and triggers no dbt execution. dbt parses
|
|
98
|
+
and compiles the project automatically when each job phase runs.
|
|
99
|
+
|
|
100
|
+
No external dbt packages are used, so `dbt deps` is never required. `dbt build`
|
|
101
|
+
is not used; Dagster controls execution order via the phased job structure.
|
|
102
|
+
|
|
103
|
+
### Useful sanity check during development
|
|
104
|
+
|
|
105
|
+
After editing the dbt project, validate syntax and macro references without
|
|
106
|
+
hitting Snowflake:
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
cd consumer/dbt
|
|
110
|
+
uv run dbt parse --profiles-dir .
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
This confirms all Jinja loops compile, macro calls are valid, and `sources.yml`
|
|
114
|
+
references are consistent.
|
|
115
|
+
|
|
116
|
+
### Required environment variables for dbt
|
|
117
|
+
|
|
118
|
+
| Variable | Default | Purpose |
|
|
119
|
+
| --- | --- | --- |
|
|
120
|
+
| `SNOWFLAKE_ACCOUNT` | — | Snowflake account identifier |
|
|
121
|
+
| `SNOWFLAKE_USERNAME` | — | Snowflake username |
|
|
122
|
+
| `SNOWFLAKE_PASSWORD` | — | Snowflake password (or use `SNOWFLAKE_PRIVATE_KEY_PATH`) |
|
|
123
|
+
| `SNOWFLAKE_ROLE` | `SYSADMIN` | Snowflake role |
|
|
124
|
+
| `SNOWFLAKE_WAREHOUSE` | — | Snowflake virtual warehouse |
|
|
125
|
+
| `ER_TARGET_DATABASE` | `DEDUPLICATION` | Database for runtime and reconciliation tables |
|
|
126
|
+
| `ER_RUNTIME_SCHEMA` | `ER_RUNTIME` | Schema for mart output tables |
|
|
127
|
+
| `ER_INCREMENTAL_STATE_SCHEMA` | `ER_INCREMENTAL_STATE` | Schema for incremental state tables |
|
|
128
|
+
| `ER_STAGING_DATABASE` | `DEDUPLICATION` | Database for persistent staging tables |
|
|
129
|
+
| `ER_STAGING_SCHEMA` | `ER_STAGING` | Schema for persistent staging tables |
|
|
130
|
+
| `ER_HSDS_DATABASE` | `NORSE_STAGING` | Source HSDS database |
|
|
131
|
+
|
|
132
|
+
### dbt project structure
|
|
133
|
+
|
|
134
|
+
```
|
|
135
|
+
consumer/dbt/
|
|
136
|
+
dbt_project.yml — project config, model materialization defaults
|
|
137
|
+
profiles.yml — Snowflake connection (env-var based; DbtCliResource overrides in prod)
|
|
138
|
+
models/
|
|
139
|
+
sources.yml — er_staging source definitions with not_null/unique tests
|
|
140
|
+
schema.yml — schema tests for staging and mart models
|
|
141
|
+
staging/
|
|
142
|
+
stg_service_denormalized.sql — multi-tenant UNION over target_schemas
|
|
143
|
+
stg_organization_denormalized.sql — multi-tenant UNION over target_schemas
|
|
144
|
+
marts/
|
|
145
|
+
denormalized_service_cache.sql
|
|
146
|
+
denormalized_organization_cache.sql
|
|
147
|
+
deduplication_run.sql
|
|
148
|
+
duplicate_pairs.sql
|
|
149
|
+
duplicate_pair_scores.sql
|
|
150
|
+
duplicate_reasons.sql
|
|
151
|
+
mitigated_pairs.sql
|
|
152
|
+
duplicate_clusters.sql
|
|
153
|
+
duplicate_cluster_pairs.sql
|
|
154
|
+
macros/
|
|
155
|
+
taxonomy_rollup.sql — ARRAY_AGG of taxonomy objects for service or org
|
|
156
|
+
location_rollup_service.sql — SAL → LOCATION → ADDRESS for services
|
|
157
|
+
location_rollup_org.sql — LOCATION.ORGANIZATION_ID → ADDRESS for orgs
|
|
158
|
+
phone_rollup_service.sql — 3-path phone UNION for services
|
|
159
|
+
phone_rollup_org.sql — 4-path phone UNION for organizations
|
|
160
|
+
service_rollup.sql — org's services with nested taxonomy codes
|
|
161
|
+
service_contact_rollup.sql — service-level email/website rollup to org
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
## Contributing
|
|
165
|
+
|
|
166
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for pull request requirements, quality checks, and review
|
|
167
|
+
expectations.
|
|
168
|
+
|
|
169
|
+
## Additional Docs
|
|
170
|
+
|
|
171
|
+
- [DEDUPLICATION schema audit](docs/deduplication-schema-audit.md)
|
|
172
|
+
- [Review interface write path](docs/review-interface-write-path.md)
|
|
173
|
+
- [Training and tuning notes](docs/training-and-tuning.md)
|
|
174
|
+
|
|
175
|
+
## Using This In Another Dagster Repo
|
|
176
|
+
|
|
177
|
+
1. Publish or install this package (for example: `pip install hsds-record-matcher`).
|
|
178
|
+
2. Confirm discovery in the target environment:
|
|
179
|
+
|
|
180
|
+
```bash
|
|
181
|
+
dg list components --package hsds_entity_resolution
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
3. Use the component key in YAML:
|
|
185
|
+
|
|
186
|
+
```yaml
|
|
187
|
+
type: hsds_entity_resolution.dagster.components.EntityResolutionComponent
|
|
188
|
+
attributes: {}
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## Publishing
|
|
192
|
+
|
|
193
|
+
This package is set up to publish to PyPI from GitHub Actions via Trusted Publishing.
|
|
194
|
+
|
|
195
|
+
### PyPI Trusted Publisher settings
|
|
196
|
+
|
|
197
|
+
For either a pending publisher (project not yet created on PyPI) or a regular Trusted Publisher on an existing project, use:
|
|
198
|
+
|
|
199
|
+
- PyPI project name: `hsds-record-matcher`
|
|
200
|
+
- Owner: `211-Connect`
|
|
201
|
+
- Repository name: `hsds-entity-resolution`
|
|
202
|
+
- Workflow name: `publish.yml`
|
|
203
|
+
- Environment name: `pypi`
|
|
204
|
+
|
|
205
|
+
The repository name field should be only the repository name, not `owner/repo`.
|
|
206
|
+
|
|
207
|
+
The distribution name on PyPI is independent of the import path in Python:
|
|
208
|
+
|
|
209
|
+
- Install name: `hsds-record-matcher`
|
|
210
|
+
- Import path: `hsds_entity_resolution`
|
|
211
|
+
|
|
212
|
+
### Release flow
|
|
213
|
+
|
|
214
|
+
1. Update `version` in `pyproject.toml`.
|
|
215
|
+
2. Merge or push that change to `main`.
|
|
216
|
+
3. GitHub Actions will build the wheel and sdist, validate them with `twine check`, and publish to PyPI through the `pypi` environment if the version changed.
|
|
217
|
+
|
|
218
|
+
You can also run the publish workflow manually with `workflow_dispatch`.
|