hsds-record-matcher 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. hsds_record_matcher-1.0.0/.gitignore +233 -0
  2. hsds_record_matcher-1.0.0/PKG-INFO +249 -0
  3. hsds_record_matcher-1.0.0/README.md +218 -0
  4. hsds_record_matcher-1.0.0/pyproject.toml +101 -0
  5. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/__init__.py +5 -0
  6. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/config/__init__.py +21 -0
  7. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/config/entity_resolution_run_config.py +278 -0
  8. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/__init__.py +5 -0
  9. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/apply_mitigation.py +646 -0
  10. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/clean_entities.py +431 -0
  11. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/cluster_pairs.py +383 -0
  12. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/dataframe_utils.py +88 -0
  13. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/domain_utils.py +231 -0
  14. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/evidence_policy.py +36 -0
  15. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/feature_extractor.py +1031 -0
  16. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/generate_candidates.py +891 -0
  17. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/materialize_review_queue.py +124 -0
  18. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/ml_inference.py +211 -0
  19. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/nlp/__init__.py +6 -0
  20. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/nlp/algorithms.py +83 -0
  21. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/nlp/safeguards.py +41 -0
  22. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/nlp/scoring.py +43 -0
  23. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/nlp/types.py +24 -0
  24. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/pair_tiering.py +41 -0
  25. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/pipeline.py +282 -0
  26. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/prepare_persistence_artifacts.py +99 -0
  27. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/score_candidates.py +1123 -0
  28. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/taxonomy_utils.py +249 -0
  29. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/training_feature_store.py +1062 -0
  30. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/training_features.py +156 -0
  31. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/core/training_schema.py +203 -0
  32. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/dagster/__init__.py +5 -0
  33. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/dagster/components/__init__.py +7 -0
  34. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/dagster/components/entity_resolution_component.py +212 -0
  35. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/definitions.py +11 -0
  36. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/defs/__init__.py +1 -0
  37. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/observability/__init__.py +6 -0
  38. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/observability/progress.py +133 -0
  39. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/observability/tracer.py +153 -0
  40. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/py.typed +0 -0
  41. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/tf_idf_models/tfidf_vectorizer_organization.joblib +0 -0
  42. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/tf_idf_models/tfidf_vectorizer_service.joblib +0 -0
  43. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/types/__init__.py +69 -0
  44. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/types/artifact_rows.py +65 -0
  45. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/types/contracts.py +97 -0
  46. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/types/domain.py +15 -0
  47. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/types/frames.py +221 -0
  48. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/types/rows/__init__.py +17 -0
  49. hsds_record_matcher-1.0.0/src/hsds_entity_resolution/types/rows/clean_entity_rows.py +85 -0
  50. hsds_record_matcher-1.0.0/tests/__init__.py +13 -0
  51. hsds_record_matcher-1.0.0/tests/conftest.py +13 -0
  52. hsds_record_matcher-1.0.0/tests/contract/common_experiment_schema_contract.json +98 -0
  53. hsds_record_matcher-1.0.0/tests/contract/test_artifact_to_model_mapping.py +388 -0
  54. hsds_record_matcher-1.0.0/tests/contract/test_common_experiment_schema_contract.py +176 -0
  55. hsds_record_matcher-1.0.0/tests/contract/test_job_registry.py +50 -0
  56. hsds_record_matcher-1.0.0/tests/contract/test_sql_templates.py +101 -0
  57. hsds_record_matcher-1.0.0/tests/contract/test_training_data_schema_contract.py +53 -0
  58. hsds_record_matcher-1.0.0/tests/contract/training_data_schema_contract.json +206 -0
  59. hsds_record_matcher-1.0.0/tests/fixtures/__init__.py +0 -0
  60. hsds_record_matcher-1.0.0/tests/fixtures/burrell_sph_pair.json +136 -0
  61. hsds_record_matcher-1.0.0/tests/integration/test_adapter_flow.py +169 -0
  62. hsds_record_matcher-1.0.0/tests/integration/test_consumer_defs.py +15 -0
  63. hsds_record_matcher-1.0.0/tests/test_apply_mitigation_and_review_queue.py +432 -0
  64. hsds_record_matcher-1.0.0/tests/test_clean_entities.py +425 -0
  65. hsds_record_matcher-1.0.0/tests/test_cluster_pairs.py +91 -0
  66. hsds_record_matcher-1.0.0/tests/test_consumer_embedding_freshness.py +194 -0
  67. hsds_record_matcher-1.0.0/tests/test_cut_training_dataset.py +108 -0
  68. hsds_record_matcher-1.0.0/tests/test_dagster_component_and_defs.py +74 -0
  69. hsds_record_matcher-1.0.0/tests/test_dataframe_utils.py +41 -0
  70. hsds_record_matcher-1.0.0/tests/test_deduplication_common_models.py +127 -0
  71. hsds_record_matcher-1.0.0/tests/test_domain_taxonomy_utils.py +670 -0
  72. hsds_record_matcher-1.0.0/tests/test_embedding_adapter.py +316 -0
  73. hsds_record_matcher-1.0.0/tests/test_entity_resolution_pipeline.py +367 -0
  74. hsds_record_matcher-1.0.0/tests/test_generate_candidates_prefilter.py +462 -0
  75. hsds_record_matcher-1.0.0/tests/test_incremental_scenarios.py +1637 -0
  76. hsds_record_matcher-1.0.0/tests/test_ml_inference_tfidf_path_resolution.py +51 -0
  77. hsds_record_matcher-1.0.0/tests/test_ml_signal_overrides.py +309 -0
  78. hsds_record_matcher-1.0.0/tests/test_nlp_scoring.py +341 -0
  79. hsds_record_matcher-1.0.0/tests/test_persistence_executor.py +88 -0
  80. hsds_record_matcher-1.0.0/tests/test_real_data_scenarios.py +800 -0
  81. hsds_record_matcher-1.0.0/tests/test_run_config_validation.py +37 -0
  82. hsds_record_matcher-1.0.0/tests/test_score_candidates_ml_inference.py +1140 -0
  83. hsds_record_matcher-1.0.0/tests/test_snowflake_embedding_store_queries.py +71 -0
  84. hsds_record_matcher-1.0.0/tests/test_source_loader_helpers.py +217 -0
  85. hsds_record_matcher-1.0.0/tests/test_staging_retention_policy.py +112 -0
  86. hsds_record_matcher-1.0.0/tests/test_taxonomy_embedding_loader.py +277 -0
  87. hsds_record_matcher-1.0.0/tests/test_training_feature_store.py +86 -0
  88. hsds_record_matcher-1.0.0/tests/test_types_modules.py +19 -0
@@ -0,0 +1,233 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ .uv-cache/
195
+ .cursor/
196
+
197
+ # PyPI configuration file
198
+ .pypirc
199
+
200
+ # Marimo
201
+ marimo/_static/
202
+ marimo/_lsp/
203
+ __marimo__/
204
+
205
+ # Streamlit
206
+ .streamlit/secrets.toml
207
+
208
+
209
+ # Ignore Cursor AI rules
210
+ .cursor/rules/codacy.mdc
211
+
212
+ # Codacy
213
+ .codacy/AGENTS.md
214
+
215
+ # dbt — target/ contains per-run artifacts; never commit.
216
+ consumer/dbt/target/
217
+ hsds_dbt_macros/target/
218
+
219
+ .dg/
220
+
221
+ consumer/
222
+ hsds_dbt_macros/
223
+ scripts/
224
+
225
+ HSDS_DATA_MODEL.md
226
+ _hsds_table_desc/
227
+
228
+ .agents/skills/snowsql-investigation/
229
+ .codacy/
230
+ docs/
231
+ !docs/
232
+ docs/*
233
+ !docs/deduplication-schema-audit.md
@@ -0,0 +1,249 @@
1
+ Metadata-Version: 2.4
2
+ Name: hsds-record-matcher
3
+ Version: 1.0.0
4
+ Summary: Reusable Dagster component package for HSDS entity resolution workflows.
5
+ Project-URL: Homepage, https://github.com/211-Connect/hsds-entity-resolution
6
+ Project-URL: Repository, https://github.com/211-Connect/hsds-entity-resolution
7
+ Project-URL: Issues, https://github.com/211-Connect/hsds-entity-resolution/issues
8
+ Keywords: dagster,dbt,deduplication,entity-resolution,hsds
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Typing :: Typed
17
+ Requires-Python: <3.15,>=3.10
18
+ Requires-Dist: dagster-dbt>=0.28
19
+ Requires-Dist: dagster-snowflake>=0.28.17
20
+ Requires-Dist: dagster==1.12.17
21
+ Requires-Dist: dbt-snowflake>=1.8
22
+ Requires-Dist: huggingface-hub>=1.5.0
23
+ Requires-Dist: joblib<2,>=1.3
24
+ Requires-Dist: numpy<3,>=1.26
25
+ Requires-Dist: polars<2,>=1.18
26
+ Requires-Dist: pydantic<3,>=2
27
+ Requires-Dist: scikit-learn<2,>=1.3
28
+ Requires-Dist: tenacity>=9.1.4
29
+ Requires-Dist: tldextract>=5.3.1
30
+ Description-Content-Type: text/markdown
31
+
32
+ # hsds_entity_resolution
33
+
34
+ `hsds_entity_resolution` helps community organizations deduplicate HSDS data and orchestrate
35
+ continual checks that support long-running community data sharing partnerships.
36
+
37
+ ## Project goals
38
+
39
+ - Improve entity matching quality across partner-provided HSDS datasets
40
+ - Reduce duplicate records that block trusted cross-organization coordination
41
+ - Run repeatable validation and quality checks as data pipelines evolve
42
+ - Support sustainable, long-term community data sharing operations
43
+
44
+ ## Tooling
45
+
46
+ - **Dagster (`dagster`, `dg`)**: pipeline orchestration, definitions, and local development UI
47
+ - **dbt (Snowflake adapter)**: SQL management for source denormalization and incremental persistence
48
+ - **dagster-dbt**: Dagster integration that invokes dbt staging and mart phases inside jobs
49
+ - **Pydantic v2**: typed data models and validation for HSDS entities and pipeline I/O
50
+ - **Ruff**: Python formatting and linting for fast local feedback
51
+ - **Pyright**: static type checking for `src/` and `tests/`
52
+ - **Codacy CLI (`.codacy/cli.sh`)**: static analysis and security scanning (Pylint, Semgrep,
53
+ Lizard, Trivy)
54
+ - **uv**: dependency and virtual environment management
55
+
56
+ ## Component Package Layout
57
+
58
+ Reusable Dagster components live in:
59
+
60
+ - `src/hsds_entity_resolution/dagster/components/`
61
+
62
+ Core library code should live outside the Dagster adapter layer:
63
+
64
+ - `src/hsds_entity_resolution/core/`
65
+ - `src/hsds_entity_resolution/types/`
66
+ - `src/hsds_entity_resolution/config/`
67
+
68
+ The canonical public component entry point is:
69
+
70
+ - `hsds_entity_resolution.dagster.components.EntityResolutionComponent`
71
+
72
+ The module containing this component is exported through the Dagster registry entry-point group:
73
+
74
+ - `dagster_dg_cli.registry_modules`
75
+
76
+ ## Getting started
77
+
78
+ ### Install dependencies
79
+
80
+ Ensure [`uv`](https://docs.astral.sh/uv/) is installed following the
81
+ [official documentation](https://docs.astral.sh/uv/getting-started/installation/), then run:
82
+
83
+ ```bash
84
+ uv sync
85
+ ```
86
+
87
+ ### Run the project
88
+
89
+ This repo has two entry points depending on what you are working on:
90
+
91
+ | Goal | Command |
92
+ | --- | --- |
93
+ | Run the IL211 pipeline (jobs, schedules, Snowflake) | `uv run dagster dev -m consumer.definitions` |
94
+ | Develop the `EntityResolutionComponent` library | `dg dev` |
95
+
96
+ **For pipeline development and debugging, always use:**
97
+
98
+ ```bash
99
+ uv run dagster dev -m consumer.definitions
100
+ ```
101
+
102
+ Then open [http://localhost:3000](http://localhost:3000) and go to
103
+ **Deployment → consumer.definitions → Jobs** to find:
104
+
105
+ - `entity_resolution__il211_regional__organization`
106
+ - `entity_resolution__il211_regional__service`
107
+
108
+ Use the **Launchpad** tab on either job to configure a run (e.g. restrict to one
109
+ `source_schema` for faster local testing) and launch it manually.
110
+
111
+ `dg dev` loads the reusable `EntityResolutionComponent` package — it intentionally
112
+ has no jobs or assets of its own and is only useful when working on the component
113
+ library itself.
114
+
115
+ ## dbt project (`consumer/dbt/`)
116
+
117
+ The pipeline uses a dbt project to manage all complex SQL in one place. It runs
118
+ in two phases inside every Dagster job:
119
+
120
+ | Phase | dbt select | What it does |
121
+ | --- | --- | --- |
122
+ | **Staging** (before Python ER) | `--select staging` | Materializes `stg_service_denormalized` and `stg_organization_denormalized` tables in `DEDUPLICATION.ER_STAGING` from raw HSDS tables in `NORSE_STAGING` |
123
+ | **Marts** (after Python ER stages artifacts) | `--select marts` | Incremental merge models upsert artifact staging rows into the final output tables in `DEDUPLICATION.ER_RUNTIME` |
124
+
125
+ ### Do you need to run any dbt commands before startup?
126
+
127
+ **No.** `uv run dagster dev -m consumer.definitions` starts cleanly — the `DbtCliResource`
128
+ is just a resource handle at startup and triggers no dbt execution. dbt parses
129
+ and compiles the project automatically when each job phase runs.
130
+
131
+ No external dbt packages are used, so `dbt deps` is never required. `dbt build`
132
+ is not used; Dagster controls execution order via the phased job structure.
133
+
134
+ ### Useful sanity check during development
135
+
136
+ After editing the dbt project, validate syntax and macro references without
137
+ hitting Snowflake:
138
+
139
+ ```bash
140
+ cd consumer/dbt
141
+ uv run dbt parse --profiles-dir .
142
+ ```
143
+
144
+ This confirms all Jinja loops compile, macro calls are valid, and `sources.yml`
145
+ references are consistent.
146
+
147
+ ### Required environment variables for dbt
148
+
149
+ | Variable | Default | Purpose |
150
+ | --- | --- | --- |
151
+ | `SNOWFLAKE_ACCOUNT` | — | Snowflake account identifier |
152
+ | `SNOWFLAKE_USERNAME` | — | Snowflake username |
153
+ | `SNOWFLAKE_PASSWORD` | — | Snowflake password (or use `SNOWFLAKE_PRIVATE_KEY_PATH`) |
154
+ | `SNOWFLAKE_ROLE` | `SYSADMIN` | Snowflake role |
155
+ | `SNOWFLAKE_WAREHOUSE` | — | Snowflake virtual warehouse |
156
+ | `ER_TARGET_DATABASE` | `DEDUPLICATION` | Database for runtime and reconciliation tables |
157
+ | `ER_RUNTIME_SCHEMA` | `ER_RUNTIME` | Schema for mart output tables |
158
+ | `ER_INCREMENTAL_STATE_SCHEMA` | `ER_INCREMENTAL_STATE` | Schema for incremental state tables |
159
+ | `ER_STAGING_DATABASE` | `DEDUPLICATION` | Database for persistent staging tables |
160
+ | `ER_STAGING_SCHEMA` | `ER_STAGING` | Schema for persistent staging tables |
161
+ | `ER_HSDS_DATABASE` | `NORSE_STAGING` | Source HSDS database |
162
+
163
+ ### dbt project structure
164
+
165
+ ```text
166
+ consumer/dbt/
167
+ dbt_project.yml — project config, model materialization defaults
168
+ profiles.yml — Snowflake connection (env-var based; DbtCliResource overrides in prod)
169
+ models/
170
+ sources.yml — er_staging source definitions with not_null/unique tests
171
+ schema.yml — schema tests for staging and mart models
172
+ staging/
173
+ stg_service_denormalized.sql — multi-tenant UNION over target_schemas
174
+ stg_organization_denormalized.sql — multi-tenant UNION over target_schemas
175
+ marts/
176
+ denormalized_service_cache.sql
177
+ denormalized_organization_cache.sql
178
+ deduplication_run.sql
179
+ duplicate_pairs.sql
180
+ duplicate_pair_scores.sql
181
+ duplicate_reasons.sql
182
+ mitigated_pairs.sql
183
+ duplicate_clusters.sql
184
+ duplicate_cluster_pairs.sql
185
+ macros/
186
+ taxonomy_rollup.sql — ARRAY_AGG of taxonomy objects for service or org
187
+ location_rollup_service.sql — SAL → LOCATION → ADDRESS for services
188
+ location_rollup_org.sql — LOCATION.ORGANIZATION_ID → ADDRESS for orgs
189
+ phone_rollup_service.sql — 3-path phone UNION for services
190
+ phone_rollup_org.sql — 4-path phone UNION for organizations
191
+ service_rollup.sql — org's services with nested taxonomy codes
192
+ service_contact_rollup.sql — service-level email/website rollup to org
193
+ ```
194
+
195
+ ## Contributing
196
+
197
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for pull request requirements, quality checks, and review
198
+ expectations.
199
+
200
+ ## Additional Docs
201
+
202
+ - [DEDUPLICATION schema audit](docs/deduplication-schema-audit.md)
203
+ - [Review interface write path](docs/review-interface-write-path.md)
204
+ - [Training and tuning notes](docs/training-and-tuning.md)
205
+
206
+ ## Using This In Another Dagster Repo
207
+
208
+ 1. Publish or install this package (for example: `pip install hsds-record-matcher`).
209
+ 2. Confirm discovery in the target environment:
210
+
211
+ ```bash
212
+ dg list components --package hsds_entity_resolution
213
+ ```
214
+
215
+ 3. Use the component key in YAML:
216
+
217
+ ```yaml
218
+ type: hsds_entity_resolution.dagster.components.EntityResolutionComponent
219
+ attributes: {}
220
+ ```
221
+
222
+ ## Publishing
223
+
224
+ This package is set up to publish to PyPI from GitHub Actions via Trusted Publishing.
225
+
226
+ ### PyPI Trusted Publisher settings
227
+
228
+ Whether you are adding a pending publisher (the project does not exist on PyPI yet) or a regular trusted publisher for an existing project, use:
229
+
230
+ - PyPI project name: `hsds-record-matcher`
231
+ - Owner: `211-Connect`
232
+ - Repository name: `hsds-entity-resolution`
233
+ - Workflow name: `publish.yml`
234
+ - Environment name: `pypi`
235
+
236
+ The repository name field should be only the repository name, not `owner/repo`.
237
+
238
+ The distribution name on PyPI is independent from the import path in Python:
239
+
240
+ - Install name: `hsds-record-matcher`
241
+ - Import path: `hsds_entity_resolution`
242
+
243
+ ### Release flow
244
+
245
+ 1. Update `version` in `pyproject.toml`.
246
+ 2. Merge or push that change to `main`.
247
+ 3. GitHub Actions will build the wheel and sdist, validate them with `twine check`, and publish to PyPI through the `pypi` environment if the version changed.
248
+
249
+ You can also run the publish workflow manually with `workflow_dispatch`.
@@ -0,0 +1,218 @@
1
+ # hsds_entity_resolution
2
+
3
+ `hsds_entity_resolution` helps community organizations deduplicate HSDS data and orchestrate
4
+ continual checks that support long-running community data sharing partnerships.
5
+
6
+ ## Project goals
7
+
8
+ - Improve entity matching quality across partner-provided HSDS datasets
9
+ - Reduce duplicate records that block trusted cross-organization coordination
10
+ - Run repeatable validation and quality checks as data pipelines evolve
11
+ - Support sustainable, long-term community data sharing operations
12
+
13
+ ## Tooling
14
+
15
+ - **Dagster (`dagster`, `dg`)**: pipeline orchestration, definitions, and local development UI
16
+ - **dbt (Snowflake adapter)**: SQL management for source denormalization and incremental persistence
17
+ - **dagster-dbt**: Dagster integration that invokes dbt staging and mart phases inside jobs
18
+ - **Pydantic v2**: typed data models and validation for HSDS entities and pipeline I/O
19
+ - **Ruff**: Python formatting and linting for fast local feedback
20
+ - **Pyright**: static type checking for `src/` and `tests/`
21
+ - **Codacy CLI (`.codacy/cli.sh`)**: static analysis and security scanning (Pylint, Semgrep,
22
+ Lizard, Trivy)
23
+ - **uv**: dependency and virtual environment management
24
+
25
+ ## Component Package Layout
26
+
27
+ Reusable Dagster components live in:
28
+
29
+ - `src/hsds_entity_resolution/dagster/components/`
30
+
31
+ Core library code should live outside the Dagster adapter layer:
32
+
33
+ - `src/hsds_entity_resolution/core/`
34
+ - `src/hsds_entity_resolution/types/`
35
+ - `src/hsds_entity_resolution/config/`
36
+
37
+ The canonical public component entry point is:
38
+
39
+ - `hsds_entity_resolution.dagster.components.EntityResolutionComponent`
40
+
41
+ The module containing this component is exported through the Dagster registry entry-point group:
42
+
43
+ - `dagster_dg_cli.registry_modules`
44
+
45
+ ## Getting started
46
+
47
+ ### Install dependencies
48
+
49
+ Ensure [`uv`](https://docs.astral.sh/uv/) is installed following the
50
+ [official documentation](https://docs.astral.sh/uv/getting-started/installation/), then run:
51
+
52
+ ```bash
53
+ uv sync
54
+ ```
55
+
56
+ ### Run the project
57
+
58
+ This repo has two entry points depending on what you are working on:
59
+
60
+ | Goal | Command |
61
+ | --- | --- |
62
+ | Run the IL211 pipeline (jobs, schedules, Snowflake) | `uv run dagster dev -m consumer.definitions` |
63
+ | Develop the `EntityResolutionComponent` library | `dg dev` |
64
+
65
+ **For pipeline development and debugging, always use:**
66
+
67
+ ```bash
68
+ uv run dagster dev -m consumer.definitions
69
+ ```
70
+
71
+ Then open [http://localhost:3000](http://localhost:3000) and go to
72
+ **Deployment → consumer.definitions → Jobs** to find:
73
+
74
+ - `entity_resolution__il211_regional__organization`
75
+ - `entity_resolution__il211_regional__service`
76
+
77
+ Use the **Launchpad** tab on either job to configure a run (e.g. restrict to one
78
+ `source_schema` for faster local testing) and launch it manually.
79
+
80
+ `dg dev` loads the reusable `EntityResolutionComponent` package — it intentionally
81
+ has no jobs or assets of its own and is only useful when working on the component
82
+ library itself.
83
+
84
+ ## dbt project (`consumer/dbt/`)
85
+
86
+ The pipeline uses a dbt project to manage all complex SQL in one place. It runs
87
+ in two phases inside every Dagster job:
88
+
89
+ | Phase | dbt select | What it does |
90
+ | --- | --- | --- |
91
+ | **Staging** (before Python ER) | `--select staging` | Materializes `stg_service_denormalized` and `stg_organization_denormalized` tables in `DEDUPLICATION.ER_STAGING` from raw HSDS tables in `NORSE_STAGING` |
92
+ | **Marts** (after Python ER stages artifacts) | `--select marts` | Incremental merge models upsert artifact staging rows into the final output tables in `DEDUPLICATION.ER_RUNTIME` |
93
+
94
+ ### Do you need to run any dbt commands before startup?
95
+
96
+ **No.** `uv run dagster dev -m consumer.definitions` starts cleanly — the `DbtCliResource`
97
+ is just a resource handle at startup and triggers no dbt execution. dbt parses
98
+ and compiles the project automatically when each job phase runs.
99
+
100
+ No external dbt packages are used, so `dbt deps` is never required. `dbt build`
101
+ is not used; Dagster controls execution order via the phased job structure.
102
+
103
+ ### Useful sanity check during development
104
+
105
+ After editing the dbt project, validate syntax and macro references without
106
+ hitting Snowflake:
107
+
108
+ ```bash
109
+ cd consumer/dbt
110
+ uv run dbt parse --profiles-dir .
111
+ ```
112
+
113
+ This confirms all Jinja loops compile, macro calls are valid, and `sources.yml`
114
+ references are consistent.
115
+
116
+ ### Required environment variables for dbt
117
+
118
+ | Variable | Default | Purpose |
119
+ | --- | --- | --- |
120
+ | `SNOWFLAKE_ACCOUNT` | — | Snowflake account identifier |
121
+ | `SNOWFLAKE_USERNAME` | — | Snowflake username |
122
+ | `SNOWFLAKE_PASSWORD` | — | Snowflake password (or use `SNOWFLAKE_PRIVATE_KEY_PATH`) |
123
+ | `SNOWFLAKE_ROLE` | `SYSADMIN` | Snowflake role |
124
+ | `SNOWFLAKE_WAREHOUSE` | — | Snowflake virtual warehouse |
125
+ | `ER_TARGET_DATABASE` | `DEDUPLICATION` | Database for runtime and reconciliation tables |
126
+ | `ER_RUNTIME_SCHEMA` | `ER_RUNTIME` | Schema for mart output tables |
127
+ | `ER_INCREMENTAL_STATE_SCHEMA` | `ER_INCREMENTAL_STATE` | Schema for incremental state tables |
128
+ | `ER_STAGING_DATABASE` | `DEDUPLICATION` | Database for persistent staging tables |
129
+ | `ER_STAGING_SCHEMA` | `ER_STAGING` | Schema for persistent staging tables |
130
+ | `ER_HSDS_DATABASE` | `NORSE_STAGING` | Source HSDS database |
131
+
132
+ ### dbt project structure
133
+
134
+ ```text
135
+ consumer/dbt/
136
+ dbt_project.yml — project config, model materialization defaults
137
+ profiles.yml — Snowflake connection (env-var based; DbtCliResource overrides in prod)
138
+ models/
139
+ sources.yml — er_staging source definitions with not_null/unique tests
140
+ schema.yml — schema tests for staging and mart models
141
+ staging/
142
+ stg_service_denormalized.sql — multi-tenant UNION over target_schemas
143
+ stg_organization_denormalized.sql — multi-tenant UNION over target_schemas
144
+ marts/
145
+ denormalized_service_cache.sql
146
+ denormalized_organization_cache.sql
147
+ deduplication_run.sql
148
+ duplicate_pairs.sql
149
+ duplicate_pair_scores.sql
150
+ duplicate_reasons.sql
151
+ mitigated_pairs.sql
152
+ duplicate_clusters.sql
153
+ duplicate_cluster_pairs.sql
154
+ macros/
155
+ taxonomy_rollup.sql — ARRAY_AGG of taxonomy objects for service or org
156
+ location_rollup_service.sql — SAL → LOCATION → ADDRESS for services
157
+ location_rollup_org.sql — LOCATION.ORGANIZATION_ID → ADDRESS for orgs
158
+ phone_rollup_service.sql — 3-path phone UNION for services
159
+ phone_rollup_org.sql — 4-path phone UNION for organizations
160
+ service_rollup.sql — org's services with nested taxonomy codes
161
+ service_contact_rollup.sql — service-level email/website rollup to org
162
+ ```
163
+
164
+ ## Contributing
165
+
166
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for pull request requirements, quality checks, and review
167
+ expectations.
168
+
169
+ ## Additional Docs
170
+
171
+ - [DEDUPLICATION schema audit](docs/deduplication-schema-audit.md)
172
+ - [Review interface write path](docs/review-interface-write-path.md)
173
+ - [Training and tuning notes](docs/training-and-tuning.md)
174
+
175
+ ## Using This In Another Dagster Repo
176
+
177
+ 1. Publish or install this package (for example: `pip install hsds-record-matcher`).
178
+ 2. Confirm discovery in the target environment:
179
+
180
+ ```bash
181
+ dg list components --package hsds_entity_resolution
182
+ ```
183
+
184
+ 3. Use the component key in YAML:
185
+
186
+ ```yaml
187
+ type: hsds_entity_resolution.dagster.components.EntityResolutionComponent
188
+ attributes: {}
189
+ ```
190
+
191
+ ## Publishing
192
+
193
+ This package is set up to publish to PyPI from GitHub Actions via Trusted Publishing.
194
+
195
+ ### PyPI Trusted Publisher settings
196
+
197
+ Whether you are adding a pending publisher (the project does not exist on PyPI yet) or a regular trusted publisher for an existing project, use:
198
+
199
+ - PyPI project name: `hsds-record-matcher`
200
+ - Owner: `211-Connect`
201
+ - Repository name: `hsds-entity-resolution`
202
+ - Workflow name: `publish.yml`
203
+ - Environment name: `pypi`
204
+
205
+ The repository name field should be only the repository name, not `owner/repo`.
206
+
207
+ The distribution name on PyPI is independent from the import path in Python:
208
+
209
+ - Install name: `hsds-record-matcher`
210
+ - Import path: `hsds_entity_resolution`
211
+
212
+ ### Release flow
213
+
214
+ 1. Update `version` in `pyproject.toml`.
215
+ 2. Merge or push that change to `main`.
216
+ 3. GitHub Actions will build the wheel and sdist, validate them with `twine check`, and publish to PyPI through the `pypi` environment if the version changed.
217
+
218
+ You can also run the publish workflow manually with `workflow_dispatch`.