moncpipelib 0.39.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (227)
  1. moncpipelib-0.39.4/.github/workflows/ci.yml +60 -0
  2. moncpipelib-0.39.4/.github/workflows/publish-pypi.yml +59 -0
  3. moncpipelib-0.39.4/.gitignore +218 -0
  4. moncpipelib-0.39.4/.gitleaks.toml +99 -0
  5. moncpipelib-0.39.4/.pre-commit-config.yaml +51 -0
  6. moncpipelib-0.39.4/CODE_OF_CONDUCT.md +72 -0
  7. moncpipelib-0.39.4/CONTRIBUTING.md +82 -0
  8. moncpipelib-0.39.4/LICENSE +202 -0
  9. moncpipelib-0.39.4/NOTICE +5 -0
  10. moncpipelib-0.39.4/PKG-INFO +160 -0
  11. moncpipelib-0.39.4/README.md +115 -0
  12. moncpipelib-0.39.4/SECURITY.md +43 -0
  13. moncpipelib-0.39.4/docs/best-practices.md +602 -0
  14. moncpipelib-0.39.4/docs/cookbook.md +11221 -0
  15. moncpipelib-0.39.4/docs/data-contracts-spec.md +1207 -0
  16. moncpipelib-0.39.4/docs/lineage-tracking.md +604 -0
  17. moncpipelib-0.39.4/docs/openlineage-integration-spec.md +645 -0
  18. moncpipelib-0.39.4/docs/scd2-guide.md +242 -0
  19. moncpipelib-0.39.4/docs/security.md +114 -0
  20. moncpipelib-0.39.4/docs/uuid_handling.md +211 -0
  21. moncpipelib-0.39.4/docs/whats-new-summaries/v0.32.0.md +65 -0
  22. moncpipelib-0.39.4/docs/whats-new-summaries/v0.7.0-to-v0.11.3.md +53 -0
  23. moncpipelib-0.39.4/mkdocs.yml +30 -0
  24. moncpipelib-0.39.4/pyproject.toml +124 -0
  25. moncpipelib-0.39.4/schemas/openlineage/1-0-0/ColumnClassificationFacet.json +27 -0
  26. moncpipelib-0.39.4/schemas/openlineage/1-0-0/DataPartitionFacet.json +39 -0
  27. moncpipelib-0.39.4/schemas/openlineage/1-0-0/MoncpipelibLineageFacet.json +64 -0
  28. moncpipelib-0.39.4/schemas/openlineage/1-0-0/SourceFileFacet.json +36 -0
  29. moncpipelib-0.39.4/scripts/bench_reconcile_work_mem.py +468 -0
  30. moncpipelib-0.39.4/scripts/bench_upsert_staging_merge.py +649 -0
  31. moncpipelib-0.39.4/scripts/generate_cookbook.py +176 -0
  32. moncpipelib-0.39.4/src/moncpipelib/__init__.py +227 -0
  33. moncpipelib-0.39.4/src/moncpipelib/config.py +531 -0
  34. moncpipelib-0.39.4/src/moncpipelib/contracts/__init__.py +195 -0
  35. moncpipelib-0.39.4/src/moncpipelib/contracts/checks.py +1006 -0
  36. moncpipelib-0.39.4/src/moncpipelib/contracts/checks_types.py +541 -0
  37. moncpipelib-0.39.4/src/moncpipelib/contracts/exceptions.py +46 -0
  38. moncpipelib-0.39.4/src/moncpipelib/contracts/hashing.py +149 -0
  39. moncpipelib-0.39.4/src/moncpipelib/contracts/loader.py +2030 -0
  40. moncpipelib-0.39.4/src/moncpipelib/contracts/models.py +791 -0
  41. moncpipelib-0.39.4/src/moncpipelib/contracts/reconciliation.py +476 -0
  42. moncpipelib-0.39.4/src/moncpipelib/contracts/sql_checks.py +1002 -0
  43. moncpipelib-0.39.4/src/moncpipelib/contracts/validators.py +1594 -0
  44. moncpipelib-0.39.4/src/moncpipelib/diagnostics/__init__.py +21 -0
  45. moncpipelib-0.39.4/src/moncpipelib/diagnostics/cgroup.py +158 -0
  46. moncpipelib-0.39.4/src/moncpipelib/diagnostics/sampler.py +271 -0
  47. moncpipelib-0.39.4/src/moncpipelib/diagnostics/types.py +158 -0
  48. moncpipelib-0.39.4/src/moncpipelib/historical.py +269 -0
  49. moncpipelib-0.39.4/src/moncpipelib/ingest/__init__.py +81 -0
  50. moncpipelib-0.39.4/src/moncpipelib/ingest/_http.py +329 -0
  51. moncpipelib-0.39.4/src/moncpipelib/ingest/dispatcher.py +280 -0
  52. moncpipelib-0.39.4/src/moncpipelib/ingest/exceptions.py +15 -0
  53. moncpipelib-0.39.4/src/moncpipelib/ingest/filenames.py +90 -0
  54. moncpipelib-0.39.4/src/moncpipelib/ingest/manifest.py +396 -0
  55. moncpipelib-0.39.4/src/moncpipelib/ingest/partition_reader.py +192 -0
  56. moncpipelib-0.39.4/src/moncpipelib/ingest/patterns/__init__.py +118 -0
  57. moncpipelib-0.39.4/src/moncpipelib/ingest/patterns/_extract.py +270 -0
  58. moncpipelib-0.39.4/src/moncpipelib/ingest/patterns/_payload_naming.py +175 -0
  59. moncpipelib-0.39.4/src/moncpipelib/ingest/patterns/_upload.py +70 -0
  60. moncpipelib-0.39.4/src/moncpipelib/ingest/patterns/api_resolver.py +286 -0
  61. moncpipelib-0.39.4/src/moncpipelib/ingest/patterns/http_urls.py +333 -0
  62. moncpipelib-0.39.4/src/moncpipelib/ingest/prefix.py +137 -0
  63. moncpipelib-0.39.4/src/moncpipelib/ingest/resolver.py +212 -0
  64. moncpipelib-0.39.4/src/moncpipelib/ingest/resolvers/__init__.py +249 -0
  65. moncpipelib-0.39.4/src/moncpipelib/ingest/resolvers/calendar.py +252 -0
  66. moncpipelib-0.39.4/src/moncpipelib/ingest/resolvers/uts.py +400 -0
  67. moncpipelib-0.39.4/src/moncpipelib/ingest/sensors.py +267 -0
  68. moncpipelib-0.39.4/src/moncpipelib/ingest/streaming.py +132 -0
  69. moncpipelib-0.39.4/src/moncpipelib/ingest/types.py +146 -0
  70. moncpipelib-0.39.4/src/moncpipelib/io_managers/__init__.py +15 -0
  71. moncpipelib-0.39.4/src/moncpipelib/io_managers/enums.py +100 -0
  72. moncpipelib-0.39.4/src/moncpipelib/io_managers/postgres.py +949 -0
  73. moncpipelib-0.39.4/src/moncpipelib/io_managers/writers.py +1224 -0
  74. moncpipelib-0.39.4/src/moncpipelib/jobs.py +295 -0
  75. moncpipelib-0.39.4/src/moncpipelib/lineage/__init__.py +69 -0
  76. moncpipelib-0.39.4/src/moncpipelib/lineage/column_metadata.py +110 -0
  77. moncpipelib-0.39.4/src/moncpipelib/lineage/models.py +651 -0
  78. moncpipelib-0.39.4/src/moncpipelib/lineage/openlineage.py +575 -0
  79. moncpipelib-0.39.4/src/moncpipelib/lineage/tracker.py +928 -0
  80. moncpipelib-0.39.4/src/moncpipelib/py.typed +0 -0
  81. moncpipelib-0.39.4/src/moncpipelib/reference.py +195 -0
  82. moncpipelib-0.39.4/src/moncpipelib/rendering.py +104 -0
  83. moncpipelib-0.39.4/src/moncpipelib/resources/__init__.py +22 -0
  84. moncpipelib-0.39.4/src/moncpipelib/resources/_app_name.py +97 -0
  85. moncpipelib-0.39.4/src/moncpipelib/resources/_contract_helpers.py +527 -0
  86. moncpipelib-0.39.4/src/moncpipelib/resources/_registry_helpers.py +684 -0
  87. moncpipelib-0.39.4/src/moncpipelib/resources/_scd2_helpers.py +854 -0
  88. moncpipelib-0.39.4/src/moncpipelib/resources/_schema.py +295 -0
  89. moncpipelib-0.39.4/src/moncpipelib/resources/blob.py +434 -0
  90. moncpipelib-0.39.4/src/moncpipelib/resources/keyvault.py +90 -0
  91. moncpipelib-0.39.4/src/moncpipelib/resources/postgres.py +3642 -0
  92. moncpipelib-0.39.4/src/moncpipelib/resources/types.py +822 -0
  93. moncpipelib-0.39.4/src/moncpipelib/scd/__init__.py +8 -0
  94. moncpipelib-0.39.4/src/moncpipelib/scd/changes.py +161 -0
  95. moncpipelib-0.39.4/src/moncpipelib/sensors.py +461 -0
  96. moncpipelib-0.39.4/src/moncpipelib/streaming.py +138 -0
  97. moncpipelib-0.39.4/src/moncpipelib/tags.py +202 -0
  98. moncpipelib-0.39.4/src/moncpipelib/testing/__init__.py +28 -0
  99. moncpipelib-0.39.4/src/moncpipelib/testing/query_builder.py +119 -0
  100. moncpipelib-0.39.4/src/moncpipelib/testing/sql_sanitizer.py +290 -0
  101. moncpipelib-0.39.4/src/moncpipelib/testing/table_utils.py +76 -0
  102. moncpipelib-0.39.4/src/moncpipelib/transforms/__init__.py +25 -0
  103. moncpipelib-0.39.4/src/moncpipelib/transforms/hashing.py +75 -0
  104. moncpipelib-0.39.4/src/moncpipelib/transforms/normalization.py +289 -0
  105. moncpipelib-0.39.4/src/moncpipelib/transforms/sanitization.py +324 -0
  106. moncpipelib-0.39.4/src/moncpipelib/versioning.py +45 -0
  107. moncpipelib-0.39.4/tests/__init__.py +1 -0
  108. moncpipelib-0.39.4/tests/conftest.py +8 -0
  109. moncpipelib-0.39.4/tests/cookbook/__init__.py +0 -0
  110. moncpipelib-0.39.4/tests/cookbook/conftest.py +245 -0
  111. moncpipelib-0.39.4/tests/cookbook/test_api_resolver_cookbook.py +318 -0
  112. moncpipelib-0.39.4/tests/cookbook/test_asset_naming_cookbook.py +300 -0
  113. moncpipelib-0.39.4/tests/cookbook/test_blob_ingest_cookbook.py +198 -0
  114. moncpipelib-0.39.4/tests/cookbook/test_calendar_resolver_cookbook.py +205 -0
  115. moncpipelib-0.39.4/tests/cookbook/test_contract_checks_cookbook.py +367 -0
  116. moncpipelib-0.39.4/tests/cookbook/test_contract_column_tests_cookbook.py +566 -0
  117. moncpipelib-0.39.4/tests/cookbook/test_contract_parameters_cookbook.py +117 -0
  118. moncpipelib-0.39.4/tests/cookbook/test_diagnostics_cookbook.py +178 -0
  119. moncpipelib-0.39.4/tests/cookbook/test_from_ingest_period_registry_cookbook.py +172 -0
  120. moncpipelib-0.39.4/tests/cookbook/test_lineage_cookbook.py +305 -0
  121. moncpipelib-0.39.4/tests/cookbook/test_non_archive_payload_cookbook.py +170 -0
  122. moncpipelib-0.39.4/tests/cookbook/test_openlineage_cookbook.py +190 -0
  123. moncpipelib-0.39.4/tests/cookbook/test_partition_aware_cookbook.py +421 -0
  124. moncpipelib-0.39.4/tests/cookbook/test_partition_reader_cookbook.py +170 -0
  125. moncpipelib-0.39.4/tests/cookbook/test_pii_cookbook.py +139 -0
  126. moncpipelib-0.39.4/tests/cookbook/test_pii_metadata_cookbook.py +96 -0
  127. moncpipelib-0.39.4/tests/cookbook/test_query_data_lineage_metadata.py +94 -0
  128. moncpipelib-0.39.4/tests/cookbook/test_reference_silver_cookbook.py +216 -0
  129. moncpipelib-0.39.4/tests/cookbook/test_resource_write_cookbook.py +569 -0
  130. moncpipelib-0.39.4/tests/cookbook/test_scd2_changes_cookbook.py +139 -0
  131. moncpipelib-0.39.4/tests/cookbook/test_scd2_lifecycle_cookbook.py +207 -0
  132. moncpipelib-0.39.4/tests/cookbook/test_scd2_reconcile_multi_contract_cookbook.py +111 -0
  133. moncpipelib-0.39.4/tests/cookbook/test_scd2_sequence_cookbook.py +105 -0
  134. moncpipelib-0.39.4/tests/cookbook/test_tags_cookbook.py +132 -0
  135. moncpipelib-0.39.4/tests/cookbook/test_testing_utils_cookbook.py +156 -0
  136. moncpipelib-0.39.4/tests/cookbook/test_text_normalizer_cookbook.py +123 -0
  137. moncpipelib-0.39.4/tests/cookbook/test_transforms_cookbook.py +312 -0
  138. moncpipelib-0.39.4/tests/cookbook/test_universal_io_manager_cookbook.py +875 -0
  139. moncpipelib-0.39.4/tests/cookbook/test_versioning_cookbook.py +46 -0
  140. moncpipelib-0.39.4/tests/cookbook/test_write_modes_cookbook.py +1172 -0
  141. moncpipelib-0.39.4/tests/integration/__init__.py +0 -0
  142. moncpipelib-0.39.4/tests/integration/conftest.py +722 -0
  143. moncpipelib-0.39.4/tests/integration/test_batched_output.py +392 -0
  144. moncpipelib-0.39.4/tests/integration/test_blob_resource.py +172 -0
  145. moncpipelib-0.39.4/tests/integration/test_contract_enforcement.py +990 -0
  146. moncpipelib-0.39.4/tests/integration/test_diagnostics_sampler.py +186 -0
  147. moncpipelib-0.39.4/tests/integration/test_driver_seam_smoke.py +57 -0
  148. moncpipelib-0.39.4/tests/integration/test_empty_read_schema.py +177 -0
  149. moncpipelib-0.39.4/tests/integration/test_json_polars.py +366 -0
  150. moncpipelib-0.39.4/tests/integration/test_lineage_atomicity.py +219 -0
  151. moncpipelib-0.39.4/tests/integration/test_lineage_parents.py +194 -0
  152. moncpipelib-0.39.4/tests/integration/test_lineage_replaces.py +295 -0
  153. moncpipelib-0.39.4/tests/integration/test_load_input.py +177 -0
  154. moncpipelib-0.39.4/tests/integration/test_partition_aware.py +510 -0
  155. moncpipelib-0.39.4/tests/integration/test_period_registry.py +151 -0
  156. moncpipelib-0.39.4/tests/integration/test_reference.py +308 -0
  157. moncpipelib-0.39.4/tests/integration/test_scd2_handle_output.py +424 -0
  158. moncpipelib-0.39.4/tests/integration/test_scd2_integration.py +3396 -0
  159. moncpipelib-0.39.4/tests/integration/test_upsert_staging_merge.py +150 -0
  160. moncpipelib-0.39.4/tests/integration/test_uuid_polars.py +519 -0
  161. moncpipelib-0.39.4/tests/integration/test_write_append.py +365 -0
  162. moncpipelib-0.39.4/tests/integration/test_write_full_refresh.py +421 -0
  163. moncpipelib-0.39.4/tests/integration/test_write_upsert.py +524 -0
  164. moncpipelib-0.39.4/tests/integration/test_write_upsert_characterization.py +268 -0
  165. moncpipelib-0.39.4/tests/test_app_name.py +242 -0
  166. moncpipelib-0.39.4/tests/test_blob_resource.py +246 -0
  167. moncpipelib-0.39.4/tests/test_calendar_release_resolver.py +332 -0
  168. moncpipelib-0.39.4/tests/test_config.py +219 -0
  169. moncpipelib-0.39.4/tests/test_contract_corpus.py +278 -0
  170. moncpipelib-0.39.4/tests/test_contracts/__init__.py +1 -0
  171. moncpipelib-0.39.4/tests/test_contracts/fixtures/invalid_contracts/empty_columns.yaml +8 -0
  172. moncpipelib-0.39.4/tests/test_contracts/fixtures/invalid_contracts/invalid_column_type.yaml +11 -0
  173. moncpipelib-0.39.4/tests/test_contracts/fixtures/invalid_contracts/missing_schema.yaml +5 -0
  174. moncpipelib-0.39.4/tests/test_contracts/fixtures/invalid_contracts/missing_version.yaml +9 -0
  175. moncpipelib-0.39.4/tests/test_contracts/fixtures/minimal_contract.yaml +14 -0
  176. moncpipelib-0.39.4/tests/test_contracts/fixtures/valid_contract.yaml +111 -0
  177. moncpipelib-0.39.4/tests/test_contracts/fixtures/valid_contract_with_tags.yaml +21 -0
  178. moncpipelib-0.39.4/tests/test_contracts/test_checks.py +1459 -0
  179. moncpipelib-0.39.4/tests/test_contracts/test_checks_types.py +391 -0
  180. moncpipelib-0.39.4/tests/test_contracts/test_exceptions.py +146 -0
  181. moncpipelib-0.39.4/tests/test_contracts/test_loader.py +1961 -0
  182. moncpipelib-0.39.4/tests/test_contracts/test_models.py +762 -0
  183. moncpipelib-0.39.4/tests/test_contracts/test_validators.py +1108 -0
  184. moncpipelib-0.39.4/tests/test_contracts_hashing.py +341 -0
  185. moncpipelib-0.39.4/tests/test_diagnostics.py +676 -0
  186. moncpipelib-0.39.4/tests/test_historical.py +682 -0
  187. moncpipelib-0.39.4/tests/test_ingest_api_resolver.py +1304 -0
  188. moncpipelib-0.39.4/tests/test_ingest_contract_loader.py +616 -0
  189. moncpipelib-0.39.4/tests/test_ingest_dispatcher.py +745 -0
  190. moncpipelib-0.39.4/tests/test_ingest_drain.py +108 -0
  191. moncpipelib-0.39.4/tests/test_ingest_extract.py +316 -0
  192. moncpipelib-0.39.4/tests/test_ingest_filenames.py +116 -0
  193. moncpipelib-0.39.4/tests/test_ingest_http.py +181 -0
  194. moncpipelib-0.39.4/tests/test_ingest_http_urls.py +732 -0
  195. moncpipelib-0.39.4/tests/test_ingest_manifest.py +260 -0
  196. moncpipelib-0.39.4/tests/test_ingest_partition_reader.py +515 -0
  197. moncpipelib-0.39.4/tests/test_ingest_prefix.py +79 -0
  198. moncpipelib-0.39.4/tests/test_ingest_resolver.py +392 -0
  199. moncpipelib-0.39.4/tests/test_ingest_resolvers_registry.py +113 -0
  200. moncpipelib-0.39.4/tests/test_ingest_sensors.py +418 -0
  201. moncpipelib-0.39.4/tests/test_ingest_streaming_memory.py +178 -0
  202. moncpipelib-0.39.4/tests/test_jobs.py +201 -0
  203. moncpipelib-0.39.4/tests/test_keyvault_resource.py +107 -0
  204. moncpipelib-0.39.4/tests/test_lineage_models.py +547 -0
  205. moncpipelib-0.39.4/tests/test_lineage_tracker.py +1145 -0
  206. moncpipelib-0.39.4/tests/test_manifest_streaming.py +255 -0
  207. moncpipelib-0.39.4/tests/test_openlineage.py +352 -0
  208. moncpipelib-0.39.4/tests/test_pii_metadata.py +255 -0
  209. moncpipelib-0.39.4/tests/test_postgres_io_manager.py +2351 -0
  210. moncpipelib-0.39.4/tests/test_postgres_resource.py +6911 -0
  211. moncpipelib-0.39.4/tests/test_reference.py +377 -0
  212. moncpipelib-0.39.4/tests/test_rendering.py +197 -0
  213. moncpipelib-0.39.4/tests/test_resource_blob_streaming.py +440 -0
  214. moncpipelib-0.39.4/tests/test_resources_types.py +1079 -0
  215. moncpipelib-0.39.4/tests/test_scd2.py +209 -0
  216. moncpipelib-0.39.4/tests/test_scd2_writer_helpers.py +143 -0
  217. moncpipelib-0.39.4/tests/test_sensors.py +192 -0
  218. moncpipelib-0.39.4/tests/test_tags.py +265 -0
  219. moncpipelib-0.39.4/tests/test_testing/__init__.py +1 -0
  220. moncpipelib-0.39.4/tests/test_testing/test_query_builder.py +236 -0
  221. moncpipelib-0.39.4/tests/test_testing/test_sql_sanitizer.py +375 -0
  222. moncpipelib-0.39.4/tests/test_testing/test_table_utils.py +153 -0
  223. moncpipelib-0.39.4/tests/test_text_normalizer.py +300 -0
  224. moncpipelib-0.39.4/tests/test_transforms.py +423 -0
  225. moncpipelib-0.39.4/tests/test_uts_release_resolver.py +543 -0
  226. moncpipelib-0.39.4/tests/test_versioning.py +115 -0
  227. moncpipelib-0.39.4/uv.lock +2147 -0
@@ -0,0 +1,60 @@
1
+ # Public CI for moncpipelib. Standalone -- no internal reusable workflows.
2
+ # Generated into the public tree by scripts/oss-export/export.py; do not hand-edit
3
+ # in the public repo (re-generate from the private export tooling instead).
4
+ name: CI
5
+
6
+ on:
7
+ push:
8
+ branches: [main]
9
+ pull_request:
10
+ branches: [main]
11
+
12
+ permissions:
13
+ contents: read
14
+
15
+ jobs:
16
+ leak-scan:
17
+ name: Leak scan (gitleaks)
18
+ runs-on: ubuntu-latest
19
+ env:
20
+ # gitleaks-action requires a paid license for org-owned repos; the CLI
21
+ # does not. Run the pinned binary directly instead.
22
+ GITLEAKS_VERSION: "8.30.1"
23
+ steps:
24
+ - uses: actions/checkout@v4
25
+ - name: Install gitleaks
26
+ run: |
27
+ base="https://github.com/gitleaks/gitleaks/releases/download/v${GITLEAKS_VERSION}"
28
+ file="gitleaks_${GITLEAKS_VERSION}_linux_x64.tar.gz"
29
+ curl -fsSL "$base/$file" -o "$file"
30
+ curl -fsSL "$base/gitleaks_${GITLEAKS_VERSION}_checksums.txt" -o checksums.txt
31
+ grep "$file" checksums.txt | sha256sum -c -
32
+ tar -xzf "$file" gitleaks
33
+ - name: Scan
34
+ run: ./gitleaks dir . --config .gitleaks.toml --no-banner --redact
35
+
36
+ quality:
37
+ name: Lint, type-check, test
38
+ runs-on: ubuntu-latest
39
+ steps:
40
+ - uses: actions/checkout@v4
41
+
42
+ - name: Install uv
43
+ uses: astral-sh/setup-uv@v5
44
+ with:
45
+ enable-cache: true
46
+
47
+ - name: Set up Python
48
+ run: uv python install 3.11
49
+
50
+ - name: Sync dependencies
51
+ run: uv sync --all-extras --dev
52
+
53
+ - name: Ruff lint
54
+ run: uv run ruff check src tests scripts
55
+
56
+ - name: Mypy
57
+ run: uv run mypy src
58
+
59
+ - name: Pytest
60
+ run: uv run pytest -q
@@ -0,0 +1,59 @@
1
+ # Publishes moncpipelib to PyPI via Trusted Publishing (OIDC) -- no API token.
2
+ #
3
+ # Runs in the PUBLIC repo (a non-EMU org). Generated into the public tree by
4
+ # the private export tooling; do not hand-edit in the public repo.
5
+ #
6
+ # One-time PyPI setup (https://pypi.org/manage/project/moncpipelib/settings/publishing/):
7
+ # Add a Trusted Publisher:
8
+ # Owner: model-oncology-public
9
+ # Repository: moncpipelib
10
+ # Workflow: publish-pypi.yml
11
+ # Environment: pypi
12
+ #
13
+ # Release flow: bump the version in the internal repo -> publish the mirror ->
14
+ # create a GitHub Release in the public repo (tag vX.Y.Z) -> this workflow runs and
15
+ # uploads. PyPI rejects re-uploading an existing version, so each publish needs
16
+ # a new version.
17
+ name: Publish to PyPI
18
+
19
+ on:
20
+ release:
21
+ types: [published]
22
+ workflow_dispatch:
23
+
24
+ permissions:
25
+ contents: read
26
+
27
+ jobs:
28
+ build:
29
+ name: Build distributions
30
+ runs-on: ubuntu-latest
31
+ steps:
32
+ - uses: actions/checkout@v4
33
+ - name: Install uv
34
+ uses: astral-sh/setup-uv@v5
35
+ with:
36
+ version: "latest"
37
+ - name: Build sdist + wheel
38
+ run: uv build
39
+ - uses: actions/upload-artifact@v4
40
+ with:
41
+ name: dist
42
+ path: dist/
43
+
44
+ publish:
45
+ name: Publish to PyPI
46
+ needs: build
47
+ runs-on: ubuntu-latest
48
+ environment: pypi
49
+ permissions:
50
+ id-token: write # OIDC token for Trusted Publishing
51
+ steps:
52
+ - uses: actions/download-artifact@v4
53
+ with:
54
+ name: dist
55
+ path: dist/
56
+ - name: Publish
57
+ # NOTE: pin to the COMMIT sha, not the annotated-tag-object sha, or
58
+ # Actions mis-resolves it to a non-existent ghcr.io image tag.
59
+ uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # v1.14.0
@@ -0,0 +1,218 @@
1
+ # Claude Worktrees
2
+ .claude/
3
+
4
+ # Byte-compiled / optimized / DLL files
5
+ __pycache__/
6
+ *.py[codz]
7
+ *$py.class
8
+
9
+ # C extensions
10
+ *.so
11
+
12
+ # Distribution / packaging
13
+ .Python
14
+ build/
15
+ develop-eggs/
16
+ dist/
17
+ downloads/
18
+ eggs/
19
+ .eggs/
20
+ lib/
21
+ lib64/
22
+ parts/
23
+ sdist/
24
+ var/
25
+ wheels/
26
+ share/python-wheels/
27
+ *.egg-info/
28
+ .installed.cfg
29
+ *.egg
30
+ MANIFEST
31
+
32
+ # PyInstaller
33
+ # Usually these files are written by a Python script from a template
34
+ # before PyInstaller builds the exe, so as to inject date/other info into it.
35
+ *.manifest
36
+ *.spec
37
+
38
+ # Installer logs
39
+ pip-log.txt
40
+ pip-delete-this-directory.txt
41
+
42
+ # Unit test / coverage reports
43
+ htmlcov/
44
+ .tox/
45
+ .nox/
46
+ .coverage
47
+ .coverage.*
48
+ .cache
49
+ nosetests.xml
50
+ coverage.xml
51
+ *.cover
52
+ *.py.cover
53
+ .hypothesis/
54
+ .pytest_cache/
55
+ cover/
56
+
57
+ # Translations
58
+ *.mo
59
+ *.pot
60
+
61
+ # Django stuff:
62
+ *.log
63
+ local_settings.py
64
+ db.sqlite3
65
+ db.sqlite3-journal
66
+
67
+ # Flask stuff:
68
+ instance/
69
+ .webassets-cache
70
+
71
+ # Scrapy stuff:
72
+ .scrapy
73
+
74
+ # Sphinx documentation
75
+ docs/_build/
76
+
77
+ # PyBuilder
78
+ .pybuilder/
79
+ target/
80
+
81
+ # Jupyter Notebook
82
+ .ipynb_checkpoints
83
+
84
+ # IPython
85
+ profile_default/
86
+ ipython_config.py
87
+
88
+ # pyenv
89
+ # For a library or package, you might want to ignore these files since the code is
90
+ # intended to run in multiple environments; otherwise, check them in:
91
+ # .python-version
92
+
93
+ # pipenv
94
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
96
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
97
+ # install all needed dependencies.
98
+ #Pipfile.lock
99
+
100
+ # UV
101
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
102
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
103
+ # commonly ignored for libraries.
104
+ #uv.lock
105
+
106
+ # poetry
107
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
108
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
109
+ # commonly ignored for libraries.
110
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
111
+ #poetry.lock
112
+ #poetry.toml
113
+
114
+ # pdm
115
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
116
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
117
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
118
+ #pdm.lock
119
+ #pdm.toml
120
+ .pdm-python
121
+ .pdm-build/
122
+
123
+ # pixi
124
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
125
+ #pixi.lock
126
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
127
+ # in the .venv directory. It is recommended not to include this directory in version control.
128
+ .pixi
129
+
130
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
131
+ __pypackages__/
132
+
133
+ # Celery stuff
134
+ celerybeat-schedule
135
+ celerybeat.pid
136
+
137
+ # SageMath parsed files
138
+ *.sage.py
139
+
140
+ # Environments
141
+ .env
142
+ .envrc
143
+ .venv
144
+ env/
145
+ venv/
146
+ ENV/
147
+ env.bak/
148
+ venv.bak/
149
+
150
+ # Spyder project settings
151
+ .spyderproject
152
+ .spyproject
153
+
154
+ # Rope project settings
155
+ .ropeproject
156
+
157
+ # mkdocs documentation
158
+ /site
159
+
160
+ # mypy
161
+ .mypy_cache/
162
+ .dmypy.json
163
+ dmypy.json
164
+
165
+ # Pyre type checker
166
+ .pyre/
167
+
168
+ # pytype static type analyzer
169
+ .pytype/
170
+
171
+ # Cython debug symbols
172
+ cython_debug/
173
+
174
+ # PyCharm
175
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
176
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
177
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
178
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
179
+ #.idea/
180
+
181
+ # Abstra
182
+ # Abstra is an AI-powered process automation framework.
183
+ # Ignore directories containing user credentials, local state, and settings.
184
+ # Learn more at https://abstra.io/docs
185
+ .abstra/
186
+
187
+ # Visual Studio Code
188
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
189
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
190
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
191
+ # you could uncomment the following to ignore the entire vscode folder
192
+ # .vscode/
193
+
194
+ # Ruff stuff:
195
+ .ruff_cache/
196
+
197
+ # PyPI configuration file
198
+ .pypirc
199
+
200
+ # Cursor
201
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
202
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data;
203
+ # refer to https://docs.cursor.com/context/ignore-files
204
+ .cursorignore
205
+ .cursorindexingignore
206
+
207
+ # Marimo
208
+ marimo/_static/
209
+ marimo/_lsp/
210
+ __marimo__/
211
+
212
+ # Cookbook build artifacts
213
+ .cookbook_artifacts.json
214
+
215
+ # scripts/bench_*.py per-run output (work_mem #294, upsert staging-merge #375).
216
+ # Trailing wildcard matters: result files carry suffixes after "results"
217
+ # (e.g. bench_work_mem_results_58m_256mb.json), which bench_*_results.json missed.
218
+ bench_*results*.json
@@ -0,0 +1,99 @@
1
+ # gitleaks configuration for the PUBLIC moncpipelib repository.
2
+ #
3
+ # This ships into the public tree as `.gitleaks.toml` and is run by the public
4
+ # CI on every push/PR. It is the ongoing "suspenders" -- the export-time
5
+ # scan.py is the "belt". Keep the custom rules here in sync with
6
+ # scripts/oss-export/scan.py in the private repo.
7
+
8
+ title = "moncpipelib leak-scan config"
9
+
10
+ [extend]
11
+ useDefault = true
12
+
13
+ [[rules]]
14
+ id = "mo-azure-cus-resource"
15
+ description = "Internal Azure resource name (-cus-npe/prd/prod)"
16
+ regex = '''\b[a-z][a-z0-9]*-cus-(npe|prd|prod)\b'''
17
+ tags = ["internal", "model-oncology"]
18
+
19
+ [[rules]]
20
+ id = "mo-pg-flex-server"
21
+ description = "Internal Postgres flexible server name"
22
+ regex = '''\bsrv-psgrsdb[\w-]*\b'''
23
+ tags = ["internal", "model-oncology"]
24
+
25
+ [[rules]]
26
+ id = "mo-storage-account"
27
+ description = "Internal Azure storage account name"
28
+ regex = '''\bstadatalake[\w-]*\b'''
29
+ tags = ["internal", "model-oncology"]
30
+
31
+ [[rules]]
32
+ id = "mo-internal-db-name"
33
+ description = "Internal database name"
34
+ regex = '''\boncalytics\b'''
35
+ tags = ["internal", "model-oncology"]
36
+
37
+ [[rules]]
38
+ id = "mo-employee-handle"
39
+ description = "Employee handle"
40
+ regex = '''\bahaight[\w-]*\b'''
41
+ tags = ["internal", "model-oncology"]
42
+
43
+ [[rules]]
44
+ id = "mo-internal-email-domain"
45
+ description = "Internal email domain"
46
+ regex = '''\boncologyhealthpartners\.com\b'''
47
+ tags = ["internal", "model-oncology"]
48
+
49
+ [[rules]]
50
+ id = "mo-azure-pg-fqdn"
51
+ description = "Azure Postgres FQDN"
52
+ regex = '''\b[a-z0-9-]+\.postgres\.database\.azure\.com\b'''
53
+ tags = ["internal", "model-oncology"]
54
+
55
+ [[rules]]
56
+ id = "mo-azure-sql-fqdn"
57
+ description = "Azure SQL FQDN"
58
+ regex = '''\b[a-z0-9-]+\.database\.windows\.net\b'''
59
+ tags = ["internal", "model-oncology"]
60
+
61
+ [[rules]]
62
+ id = "mo-azure-storage-fqdn"
63
+ description = "Azure storage FQDN (blob/dfs/file/queue/table)"
64
+ regex = '''\b[a-z0-9]+\.(?:blob|dfs|file|queue|table)\.core\.windows\.net\b'''
65
+ tags = ["internal", "model-oncology"]
66
+
67
+ [[rules]]
68
+ id = "mo-azure-artifacts-feed"
69
+ description = "Internal Azure Artifacts package feed"
70
+ regex = '''\bpkgs\.dev\.azure\.com\b'''
71
+ tags = ["internal", "model-oncology"]
72
+
73
+ # Detect a subscription GUID by CONTEXT rather than embedding the real values
74
+ # (this file is public). The precise known-ID check lives in the private
75
+ # export-time scanner (scripts/oss-export/scan.py), which is never published.
76
+ [[rules]]
77
+ id = "mo-azure-subscription"
78
+ description = "Azure subscription id appearing in a subscription context"
79
+ regex = '''(?i)subscription["'\s:=/_-]+[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}'''
80
+ tags = ["internal", "model-oncology"]
81
+
82
+ [[rules]]
83
+ id = "mo-azure-id-context"
84
+ description = "Azure tenant/client/object/app id appearing in context"
85
+ regex = '''(?i)\b(?:tenant|client[_-]?id|object[_-]?id|app[_-]?id)\b["'\s:=/_-]+[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}'''
86
+ tags = ["internal", "model-oncology"]
87
+
88
+ [allowlist]
89
+ description = "Known-safe placeholders and synthetic example values"
90
+ # Match allowlist regexes against the full match (not the capture/secret), so
91
+ # placeholder hosts like examplestorageacct.dfs.core.windows.net are suppressed.
92
+ regexTarget = "match"
93
+ regexes = [
94
+ '''example\.com''',
95
+ '''examplestorageacct''',
96
+ '''\bpg-(nonprod|prod)\b''',
97
+ '''\blaw-(nonprod|prod)\b''',
98
+ '''\bbench_user\b''',
99
+ ]
@@ -0,0 +1,51 @@
1
+ ---
2
+ exclude: |
3
+ (?x)^(
4
+ .*cache.*/.*|
5
+ .*venv.*/.*
6
+ )$
7
+ fail_fast: true
8
+ default_language_version:
9
+ python: python3.13
10
+ repos:
11
+ # ----------------------------- Security ----------------------------------- #
12
+ - repo: https://github.com/gitleaks/gitleaks
13
+ rev: v8.30.1
14
+ hooks:
15
+ - id: gitleaks
16
+ name: "Detect hardcoded secrets"
17
+
18
+ # ----------------------------- Python ------------------------------------- #
19
+ - repo: https://github.com/astral-sh/ruff-pre-commit
20
+ rev: v0.15.14
21
+ hooks:
22
+ - id: ruff-format
23
+ name: "Format with Ruff"
24
+ - id: ruff
25
+ name: "Lint with Ruff"
26
+ args: [--fix, --exit-non-zero-on-fix]
27
+
28
+ # ----------------------------- YAML --------------------------------------- #
29
+ - repo: https://github.com/adrienverge/yamllint
30
+ rev: v1.38.0
31
+ hooks:
32
+ - id: yamllint
33
+ name: "Lint YAML files"
34
+ args: [-d, relaxed]
35
+
36
+ # ----------------------------- Filesystem / Git --------------------------- #
37
+ - repo: https://github.com/pre-commit/pre-commit-hooks
38
+ rev: v6.0.0
39
+ hooks:
40
+ - id: check-merge-conflict
41
+ name: "Detect conflict markers"
42
+ - id: end-of-file-fixer
43
+ name: "Fix end of file"
44
+ - id: trailing-whitespace
45
+ name: "Trim trailing whitespace"
46
+ - id: check-added-large-files
47
+ name: "Block large file commits"
48
+ args: ["--maxkb=5000"]
49
+ - id: no-commit-to-branch
50
+ name: "Protect main branch"
51
+ args: ["--branch", "main"]
@@ -0,0 +1,72 @@
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ We as members, contributors, and leaders pledge to make participation in our
6
+ community a harassment-free experience for everyone, regardless of age, body
7
+ size, visible or invisible disability, ethnicity, sex characteristics, gender
8
+ identity and expression, level of experience, education, socio-economic status,
9
+ nationality, personal appearance, race, religion, or sexual identity and
10
+ orientation.
11
+
12
+ We pledge to act and interact in ways that contribute to an open, welcoming,
13
+ diverse, inclusive, and healthy community.
14
+
15
+ ## Our Standards
16
+
17
+ Examples of behavior that contributes to a positive environment for our
18
+ community include:
19
+
20
+ - Demonstrating empathy and kindness toward other people
21
+ - Being respectful of differing opinions, viewpoints, and experiences
22
+ - Giving and gracefully accepting constructive feedback
23
+ - Accepting responsibility and apologizing to those affected by our mistakes,
24
+ and learning from the experience
25
+ - Focusing on what is best not just for us as individuals, but for the overall
26
+ community
27
+
28
+ Examples of unacceptable behavior include:
29
+
30
+ - The use of sexualized language or imagery, and sexual attention or advances of
31
+ any kind
32
+ - Trolling, insulting or derogatory comments, and personal or political attacks
33
+ - Public or private harassment
34
+ - Publishing others' private information, such as a physical or email address,
35
+ without their explicit permission
36
+ - Other conduct which could reasonably be considered inappropriate in a
37
+ professional setting
38
+
39
+ ## Enforcement Responsibilities
40
+
41
+ Community leaders are responsible for clarifying and enforcing our standards of
42
+ acceptable behavior and will take appropriate and fair corrective action in
43
+ response to any behavior that they deem inappropriate, threatening, offensive,
44
+ or harmful.
45
+
46
+ Community leaders have the right and responsibility to remove, edit, or reject
47
+ comments, commits, code, wiki edits, issues, and other contributions that are
48
+ not aligned to this Code of Conduct, and will communicate reasons for moderation
49
+ decisions when appropriate.
50
+
51
+ ## Scope
52
+
53
+ This Code of Conduct applies within all community spaces, and also applies when
54
+ an individual is officially representing the community in public spaces.
55
+
56
+ ## Enforcement
57
+
58
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
59
+ reported to the community leaders responsible for enforcement at
60
+ engineering@modeloncology.com. All complaints will be reviewed and investigated
61
+ promptly and fairly.
62
+
63
+ All community leaders are obligated to respect the privacy and security of the
64
+ reporter of any incident.
65
+
66
+ ## Attribution
67
+
68
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
69
+ version 2.1, available at
70
+ https://www.contributor-covenant.org/version/2/1/code_of_conduct.html.
71
+
72
+ [homepage]: https://www.contributor-covenant.org
@@ -0,0 +1,82 @@
1
+ # Contributing to moncpipelib
2
+
3
+ Thanks for your interest in contributing.
4
+
5
+ `moncpipelib` is maintained by Model Oncology and developed primarily against
6
+ our internal data-pipeline needs. We welcome issues and pull requests from the
7
+ community.
8
+
9
+ ## How this repository is maintained
10
+
11
+ This public repository is a **one-way mirror** generated from an internal
12
+ repository. The `main` branch is republished (force-pushed) on each release, so
13
+ **commits pushed directly to `main` here do not persist**. That does not mean
14
+ your contribution is unwelcome -- it only changes the mechanics:
15
+
16
+ - **Issues and discussions** are the best way to report bugs and propose
17
+ changes; we triage them directly.
18
+ - **Pull requests** are reviewed here. When we accept one, we incorporate the
19
+ change into the internal source and it lands in the public repo on the next
20
+ sync (with attribution preserved). Your PR branch is the unit of review; the
21
+ public `main` is not a durable merge target.
22
+
23
+ If this workflow ever becomes a friction point, open an issue -- we would rather
24
+ adjust the process than lose a good contribution.
25
+
26
+ ## Development setup
27
+
28
+ This project uses [uv](https://docs.astral.sh/uv/) for environment and
29
+ dependency management.
30
+
31
+ ```bash
32
+ uv sync --all-extras --dev
33
+ ```
34
+
35
+ ## Before you open a pull request
36
+
37
+ Run the full local check suite -- CI runs the same steps:
38
+
39
+ ```bash
40
+ uv run ruff check src tests scripts
41
+ uv run ruff format --check src tests scripts
42
+ uv run mypy src
43
+ uv run pytest
44
+ ```
45
+
46
+ If you change anything under `tests/cookbook/`, regenerate the cookbook docs
47
+ (CI fails if they are stale):
48
+
49
+ ```bash
50
+ uv run pytest tests/cookbook/ --cookbook-collect
51
+ uv run python scripts/generate_cookbook.py
52
+ ```
53
+
54
+ ## Guidelines
55
+
56
+ - Target Python 3.11+. The codebase is fully typed and checked under mypy
57
+ strict mode -- new code must type-check cleanly.
58
+ - I/O at boundaries (blob storage, database, archive, network, filesystem)
59
+ must stream by default. Methods that return whole payloads as `bytes` are
60
+ reserved for content that is contractually bounded to a few MB, and the
61
+ docstring must say so.
62
+ - Keep changes focused and include tests.
63
+ - **Test fixtures must be synthetic.** Never derive a fixture from a real data
64
+ extract. All sample data (identifiers, names, dates, claims, etc.) must be
65
+ fabricated. PRs that add fixtures resembling real records will be rejected.
66
+ Do not add binary fixtures (e.g. `.parquet`) without prior discussion in an
67
+ issue.
68
+
69
+ ## Code of Conduct
70
+
71
+ This project adheres to a [Code of Conduct](CODE_OF_CONDUCT.md). By
72
+ participating, you are expected to uphold it.
73
+
74
+ ## Security
75
+
76
+ Please do not file public issues for security vulnerabilities. See
77
+ [SECURITY.md](SECURITY.md) for the disclosure process.
78
+
79
+ ## License
80
+
81
+ By contributing, you agree that your contributions will be licensed under the
82
+ Apache License 2.0, consistent with the rest of the project.