pdbminebuilder 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180) hide show
  1. pdbminebuilder-0.2.0/.env.example +9 -0
  2. pdbminebuilder-0.2.0/.gitattributes +2 -0
  3. pdbminebuilder-0.2.0/.github/workflows/ci.yml +45 -0
  4. pdbminebuilder-0.2.0/.github/workflows/deploy-docs.yml +52 -0
  5. pdbminebuilder-0.2.0/.github/workflows/release.yml +68 -0
  6. pdbminebuilder-0.2.0/.gitignore +37 -0
  7. pdbminebuilder-0.2.0/CHANGELOG.md +35 -0
  8. pdbminebuilder-0.2.0/CLAUDE.md +309 -0
  9. pdbminebuilder-0.2.0/LICENSE +21 -0
  10. pdbminebuilder-0.2.0/PKG-INFO +136 -0
  11. pdbminebuilder-0.2.0/README.md +102 -0
  12. pdbminebuilder-0.2.0/alembic/README +1 -0
  13. pdbminebuilder-0.2.0/alembic/env.py +116 -0
  14. pdbminebuilder-0.2.0/alembic/script.py.mako +28 -0
  15. pdbminebuilder-0.2.0/alembic/versions/bf39ba596b4a_prd_family_replace_docid_with_name_in_.py +35 -0
  16. pdbminebuilder-0.2.0/alembic.ini +137 -0
  17. pdbminebuilder-0.2.0/config.example.yml +47 -0
  18. pdbminebuilder-0.2.0/config.test.yml +27 -0
  19. pdbminebuilder-0.2.0/docker/docker-compose.test.yml +17 -0
  20. pdbminebuilder-0.2.0/docker/init/01-extensions.sql +2 -0
  21. pdbminebuilder-0.2.0/docs/architecture.md +157 -0
  22. pdbminebuilder-0.2.0/docs/pipelines.md +332 -0
  23. pdbminebuilder-0.2.0/docs/rdb_docs/cc.yml +521 -0
  24. pdbminebuilder-0.2.0/docs/rdb_docs/ccmodel.yml +192 -0
  25. pdbminebuilder-0.2.0/docs/rdb_docs/contacts.yml +54 -0
  26. pdbminebuilder-0.2.0/docs/rdb_docs/emdb.yml +2872 -0
  27. pdbminebuilder-0.2.0/docs/rdb_docs/ihm.yml +3721 -0
  28. pdbminebuilder-0.2.0/docs/rdb_docs/pdbj.yml +12632 -0
  29. pdbminebuilder-0.2.0/docs/rdb_docs/prd.yml +687 -0
  30. pdbminebuilder-0.2.0/docs/rdb_docs/prd_family.yml +279 -0
  31. pdbminebuilder-0.2.0/docs/rdb_docs/vrpt.yml +2763 -0
  32. pdbminebuilder-0.2.0/docs/schema.md +201 -0
  33. pdbminebuilder-0.2.0/pixi.lock +3073 -0
  34. pdbminebuilder-0.2.0/pixi.toml +71 -0
  35. pdbminebuilder-0.2.0/pyproject.toml +54 -0
  36. pdbminebuilder-0.2.0/schemas/pkout.json +1 -0
  37. pdbminebuilder-0.2.0/scripts/cif2json.py +142 -0
  38. pdbminebuilder-0.2.0/scripts/convert_yaml_to_sa.py +572 -0
  39. pdbminebuilder-0.2.0/scripts/generate_rdb_docs.py +203 -0
  40. pdbminebuilder-0.2.0/scripts/generate_schema_docs.py +234 -0
  41. pdbminebuilder-0.2.0/scripts/init_rdkit.sql +54 -0
  42. pdbminebuilder-0.2.0/scripts/inject_column_comments.py +160 -0
  43. pdbminebuilder-0.2.0/scripts/postgresql_bulkload.conf +35 -0
  44. pdbminebuilder-0.2.0/scripts/postgresql_safe.conf +20 -0
  45. pdbminebuilder-0.2.0/scripts/postgresql_tuned.conf +69 -0
  46. pdbminebuilder-0.2.0/scripts/rdkit_functions.sql +221 -0
  47. pdbminebuilder-0.2.0/scripts/reload_failed.py +190 -0
  48. pdbminebuilder-0.2.0/scripts/verify_schema_equivalence.py +411 -0
  49. pdbminebuilder-0.2.0/src/pdbminebuilder/__init__.py +3 -0
  50. pdbminebuilder-0.2.0/src/pdbminebuilder/__main__.py +6 -0
  51. pdbminebuilder-0.2.0/src/pdbminebuilder/cli.py +369 -0
  52. pdbminebuilder-0.2.0/src/pdbminebuilder/commands/__init__.py +1 -0
  53. pdbminebuilder-0.2.0/src/pdbminebuilder/commands/load.py +164 -0
  54. pdbminebuilder-0.2.0/src/pdbminebuilder/commands/reset.py +101 -0
  55. pdbminebuilder-0.2.0/src/pdbminebuilder/commands/stats.py +147 -0
  56. pdbminebuilder-0.2.0/src/pdbminebuilder/commands/sync.py +183 -0
  57. pdbminebuilder-0.2.0/src/pdbminebuilder/commands/test.py +185 -0
  58. pdbminebuilder-0.2.0/src/pdbminebuilder/commands/update.py +212 -0
  59. pdbminebuilder-0.2.0/src/pdbminebuilder/commands/utils.py +47 -0
  60. pdbminebuilder-0.2.0/src/pdbminebuilder/config.py +140 -0
  61. pdbminebuilder-0.2.0/src/pdbminebuilder/db/__init__.py +5 -0
  62. pdbminebuilder-0.2.0/src/pdbminebuilder/db/_type_utils.py +62 -0
  63. pdbminebuilder-0.2.0/src/pdbminebuilder/db/connection.py +128 -0
  64. pdbminebuilder-0.2.0/src/pdbminebuilder/db/delta.py +764 -0
  65. pdbminebuilder-0.2.0/src/pdbminebuilder/db/loader.py +680 -0
  66. pdbminebuilder-0.2.0/src/pdbminebuilder/db/metadata.py +188 -0
  67. pdbminebuilder-0.2.0/src/pdbminebuilder/models/__init__.py +53 -0
  68. pdbminebuilder-0.2.0/src/pdbminebuilder/models/cc.py +794 -0
  69. pdbminebuilder-0.2.0/src/pdbminebuilder/models/ccmodel.py +283 -0
  70. pdbminebuilder-0.2.0/src/pdbminebuilder/models/contacts.py +87 -0
  71. pdbminebuilder-0.2.0/src/pdbminebuilder/models/emdb.py +3987 -0
  72. pdbminebuilder-0.2.0/src/pdbminebuilder/models/ihm.py +2079 -0
  73. pdbminebuilder-0.2.0/src/pdbminebuilder/models/pdbj.py +16913 -0
  74. pdbminebuilder-0.2.0/src/pdbminebuilder/models/prd.py +1044 -0
  75. pdbminebuilder-0.2.0/src/pdbminebuilder/models/prd_family.py +414 -0
  76. pdbminebuilder-0.2.0/src/pdbminebuilder/models/vrpt.py +4077 -0
  77. pdbminebuilder-0.2.0/src/pdbminebuilder/parsers/__init__.py +20 -0
  78. pdbminebuilder-0.2.0/src/pdbminebuilder/parsers/cif.py +205 -0
  79. pdbminebuilder-0.2.0/src/pdbminebuilder/parsers/mmjson.py +246 -0
  80. pdbminebuilder-0.2.0/src/pdbminebuilder/pipelines/__init__.py +1 -0
  81. pdbminebuilder-0.2.0/src/pdbminebuilder/pipelines/base.py +861 -0
  82. pdbminebuilder-0.2.0/src/pdbminebuilder/pipelines/cc.py +822 -0
  83. pdbminebuilder-0.2.0/src/pdbminebuilder/pipelines/ccmodel.py +508 -0
  84. pdbminebuilder-0.2.0/src/pdbminebuilder/pipelines/contacts.py +340 -0
  85. pdbminebuilder-0.2.0/src/pdbminebuilder/pipelines/emdb.py +413 -0
  86. pdbminebuilder-0.2.0/src/pdbminebuilder/pipelines/ihm.py +508 -0
  87. pdbminebuilder-0.2.0/src/pdbminebuilder/pipelines/pdbj.py +820 -0
  88. pdbminebuilder-0.2.0/src/pdbminebuilder/pipelines/prd.py +619 -0
  89. pdbminebuilder-0.2.0/src/pdbminebuilder/pipelines/prd_family.py +255 -0
  90. pdbminebuilder-0.2.0/src/pdbminebuilder/pipelines/vrpt.py +341 -0
  91. pdbminebuilder-0.2.0/src/pdbminebuilder/py.typed +0 -0
  92. pdbminebuilder-0.2.0/src/pdbminebuilder/utils/__init__.py +1 -0
  93. pdbminebuilder-0.2.0/src/pdbminebuilder/utils/assembly.py +232 -0
  94. pdbminebuilder-0.2.0/src/pdbminebuilder/utils/brief_summary.py +365 -0
  95. pdbminebuilder-0.2.0/src/pdbminebuilder/utils/patches.py +61 -0
  96. pdbminebuilder-0.2.0/tests/__init__.py +1 -0
  97. pdbminebuilder-0.2.0/tests/conftest.py +260 -0
  98. pdbminebuilder-0.2.0/tests/fixtures/cc/ATP.json.gz +0 -0
  99. pdbminebuilder-0.2.0/tests/fixtures/cc/EOH.json.gz +0 -0
  100. pdbminebuilder-0.2.0/tests/fixtures/cc/HOH.json.gz +0 -0
  101. pdbminebuilder-0.2.0/tests/fixtures/ccmodel/M_6EL_00001.cif.gz +0 -0
  102. pdbminebuilder-0.2.0/tests/fixtures/ccmodel/M_94M_00001.cif.gz +0 -0
  103. pdbminebuilder-0.2.0/tests/fixtures/ccmodel/M_DAL_00001.cif.gz +0 -0
  104. pdbminebuilder-0.2.0/tests/fixtures/ccmodel/M_EOH_00001.cif.gz +0 -0
  105. pdbminebuilder-0.2.0/tests/fixtures/pdbj/1crn.cif.gz +0 -0
  106. pdbminebuilder-0.2.0/tests/fixtures/pdbj/1ubq.cif.gz +0 -0
  107. pdbminebuilder-0.2.0/tests/fixtures/pdbj/4hhb.cif.gz +0 -0
  108. pdbminebuilder-0.2.0/tests/fixtures/pdbj/5pti.cif.gz +0 -0
  109. pdbminebuilder-0.2.0/tests/fixtures/prd/PRDCC_000001.cif.gz +0 -0
  110. pdbminebuilder-0.2.0/tests/fixtures/prd/PRDCC_000006.cif.gz +0 -0
  111. pdbminebuilder-0.2.0/tests/fixtures/prd/PRDCC_000007.cif.gz +0 -0
  112. pdbminebuilder-0.2.0/tests/fixtures/prd/PRD_000001.cif.gz +0 -0
  113. pdbminebuilder-0.2.0/tests/fixtures/prd/PRD_000006.cif.gz +0 -0
  114. pdbminebuilder-0.2.0/tests/fixtures/prd/PRD_000007.cif.gz +0 -0
  115. pdbminebuilder-0.2.0/tests/fixtures/prd_family/family-all.cif.gz +0 -0
  116. pdbminebuilder-0.2.0/tests/integration/__init__.py +5 -0
  117. pdbminebuilder-0.2.0/tests/integration/test_cc_integration.py +229 -0
  118. pdbminebuilder-0.2.0/tests/integration/test_ccmodel_integration.py +215 -0
  119. pdbminebuilder-0.2.0/tests/integration/test_pdbj_integration.py +239 -0
  120. pdbminebuilder-0.2.0/tests/integration/test_prd_integration.py +221 -0
  121. pdbminebuilder-0.2.0/tests/test_assembly.py +288 -0
  122. pdbminebuilder-0.2.0/tests/test_base.py +314 -0
  123. pdbminebuilder-0.2.0/tests/test_brief_summary.py +308 -0
  124. pdbminebuilder-0.2.0/tests/test_cc_cif.py +700 -0
  125. pdbminebuilder-0.2.0/tests/test_ccmodel_cif.py +327 -0
  126. pdbminebuilder-0.2.0/tests/test_commands.py +279 -0
  127. pdbminebuilder-0.2.0/tests/test_delta.py +456 -0
  128. pdbminebuilder-0.2.0/tests/test_emdb.py +393 -0
  129. pdbminebuilder-0.2.0/tests/test_environment.py +69 -0
  130. pdbminebuilder-0.2.0/tests/test_format_parity.py +359 -0
  131. pdbminebuilder-0.2.0/tests/test_ihm.py +554 -0
  132. pdbminebuilder-0.2.0/tests/test_loader_migration.py +139 -0
  133. pdbminebuilder-0.2.0/tests/test_metadata.py +265 -0
  134. pdbminebuilder-0.2.0/tests/test_mmjson.py +244 -0
  135. pdbminebuilder-0.2.0/tests/test_model_registry.py +140 -0
  136. pdbminebuilder-0.2.0/tests/test_mtime_filtering.py +170 -0
  137. pdbminebuilder-0.2.0/tests/test_parsers.py +227 -0
  138. pdbminebuilder-0.2.0/tests/test_patches.py +75 -0
  139. pdbminebuilder-0.2.0/tests/test_pdbj.py +527 -0
  140. pdbminebuilder-0.2.0/tests/test_pdbj_cif.py +1180 -0
  141. pdbminebuilder-0.2.0/tests/test_prd_cif.py +443 -0
  142. pdbminebuilder-0.2.0/tests/test_prd_family_cif.py +296 -0
  143. pdbminebuilder-0.2.0/tests/test_type_utils.py +69 -0
  144. pdbminebuilder-0.2.0/website/.gitignore +20 -0
  145. pdbminebuilder-0.2.0/website/README.md +41 -0
  146. pdbminebuilder-0.2.0/website/docs/database/cc.mdx +80 -0
  147. pdbminebuilder-0.2.0/website/docs/database/ccmodel.mdx +14 -0
  148. pdbminebuilder-0.2.0/website/docs/database/contacts.mdx +14 -0
  149. pdbminebuilder-0.2.0/website/docs/database/emdb.mdx +14 -0
  150. pdbminebuilder-0.2.0/website/docs/database/ihm.mdx +14 -0
  151. pdbminebuilder-0.2.0/website/docs/database/overview.md +84 -0
  152. pdbminebuilder-0.2.0/website/docs/database/pdbj.mdx +14 -0
  153. pdbminebuilder-0.2.0/website/docs/database/prd.mdx +14 -0
  154. pdbminebuilder-0.2.0/website/docs/database/prd_family.mdx +14 -0
  155. pdbminebuilder-0.2.0/website/docs/database/vrpt.mdx +14 -0
  156. pdbminebuilder-0.2.0/website/docs/getting-started/configuration.md +145 -0
  157. pdbminebuilder-0.2.0/website/docs/getting-started/installation.md +155 -0
  158. pdbminebuilder-0.2.0/website/docs/getting-started/migration.md +207 -0
  159. pdbminebuilder-0.2.0/website/docs/getting-started/sync.md +99 -0
  160. pdbminebuilder-0.2.0/website/docs/getting-started/update.md +188 -0
  161. pdbminebuilder-0.2.0/website/docusaurus.config.ts +88 -0
  162. pdbminebuilder-0.2.0/website/package-lock.json +18448 -0
  163. pdbminebuilder-0.2.0/website/package.json +47 -0
  164. pdbminebuilder-0.2.0/website/sidebars.ts +39 -0
  165. pdbminebuilder-0.2.0/website/src/components/SchemaFilter.tsx +89 -0
  166. pdbminebuilder-0.2.0/website/src/css/custom.css +133 -0
  167. pdbminebuilder-0.2.0/website/src/pages/index.module.css +23 -0
  168. pdbminebuilder-0.2.0/website/src/pages/index.tsx +86 -0
  169. pdbminebuilder-0.2.0/website/src/pages/schema-search.tsx +250 -0
  170. pdbminebuilder-0.2.0/website/src/types/schema.ts +10 -0
  171. pdbminebuilder-0.2.0/website/static/.nojekyll +0 -0
  172. pdbminebuilder-0.2.0/website/static/data/allSchemas.json +1 -0
  173. pdbminebuilder-0.2.0/website/static/img/docusaurus-social-card.jpg +0 -0
  174. pdbminebuilder-0.2.0/website/static/img/docusaurus.png +0 -0
  175. pdbminebuilder-0.2.0/website/static/img/favicon.ico +0 -0
  176. pdbminebuilder-0.2.0/website/static/img/logo.svg +1 -0
  177. pdbminebuilder-0.2.0/website/static/img/undraw_docusaurus_mountain.svg +171 -0
  178. pdbminebuilder-0.2.0/website/static/img/undraw_docusaurus_react.svg +170 -0
  179. pdbminebuilder-0.2.0/website/static/img/undraw_docusaurus_tree.svg +40 -0
  180. pdbminebuilder-0.2.0/website/tsconfig.json +8 -0
@@ -0,0 +1,9 @@
1
+ # PostgreSQL connection
2
+ PGPORT=5433
3
+ PGHOST=localhost
4
+ PGDATA=postgres_data_5433
5
+ PGUSER=pdbj
6
+ PGDATABASE=pmb
7
+
8
+ # Data directory (PDBj data root)
9
+ DATA_DIR=/path/to/pdb/data
@@ -0,0 +1,2 @@
1
+ # SCM syntax highlighting & preventing 3-way merges
2
+ pixi.lock merge=binary linguist-language=YAML linguist-generated=true -diff
@@ -0,0 +1,45 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ paths-ignore:
7
+ - "*.md"
8
+ - "docs/**"
9
+ - "website/**"
10
+ - "LICENSE"
11
+ pull_request:
12
+ branches: [main]
13
+ paths-ignore:
14
+ - "*.md"
15
+ - "docs/**"
16
+ - "website/**"
17
+ - "LICENSE"
18
+
19
+ jobs:
20
+ lint:
21
+ runs-on: ubuntu-latest
22
+ steps:
23
+ - uses: actions/checkout@v4
24
+
25
+ - uses: prefix-dev/setup-pixi@v0.9.2
26
+ with:
27
+ cache: true
28
+
29
+ - name: Ruff check
30
+ run: pixi run lint
31
+
32
+ - name: Ruff format check
33
+ run: pixi run format-check
34
+
35
+ test:
36
+ runs-on: ubuntu-latest
37
+ steps:
38
+ - uses: actions/checkout@v4
39
+
40
+ - uses: prefix-dev/setup-pixi@v0.9.2
41
+ with:
42
+ cache: true
43
+
44
+ - name: Run unit tests
45
+ run: pixi run test-unit
@@ -0,0 +1,52 @@
1
+ name: Deploy Docs
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ paths: ["website/**"]
7
+
8
+ permissions:
9
+ contents: read
10
+ pages: write
11
+ id-token: write
12
+
13
+ concurrency:
14
+ group: pages
15
+ cancel-in-progress: false
16
+
17
+ jobs:
18
+ build:
19
+ runs-on: ubuntu-latest
20
+ defaults:
21
+ run:
22
+ working-directory: website
23
+ steps:
24
+ - uses: actions/checkout@v4
25
+ with:
26
+ fetch-depth: 0
27
+
28
+ - uses: actions/setup-node@v4
29
+ with:
30
+ node-version: 22
31
+ cache: npm
32
+ cache-dependency-path: website/package-lock.json
33
+
34
+ - run: npm ci
35
+ - run: npm run build
36
+
37
+ - uses: actions/upload-pages-artifact@v3
38
+ with:
39
+ path: website/build
40
+
41
+ deploy:
42
+ needs: build
43
+ permissions:
44
+ pages: write
45
+ id-token: write
46
+ environment:
47
+ name: github-pages
48
+ url: ${{ steps.deployment.outputs.page_url }}
49
+ runs-on: ubuntu-latest
50
+ steps:
51
+ - uses: actions/deploy-pages@v4
52
+ id: deployment
@@ -0,0 +1,68 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ permissions:
9
+ contents: write
10
+ id-token: write # Required for trusted publishing to PyPI
11
+
12
+ jobs:
13
+ release:
14
+ runs-on: ubuntu-latest
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - name: Extract version from tag
19
+ id: version
20
+ run: echo "version=${GITHUB_REF_NAME#v}" >> "$GITHUB_OUTPUT"
21
+
22
+ - name: Extract release notes from CHANGELOG.md
23
+ id: changelog
24
+ run: |
25
+ # Extract the section for the tagged version
26
+ version="${{ steps.version.outputs.version }}"
27
+ notes=$(awk -v ver="$version" '
28
+ /^## \[/ {
29
+ if (found) exit
30
+ if ($0 ~ "\\[" ver "\\]") found=1; next
31
+ }
32
+ found { print }
33
+ ' CHANGELOG.md)
34
+
35
+ if [ -z "$notes" ]; then
36
+ echo "::warning::No CHANGELOG entry found for version $version"
37
+ notes="Release $version"
38
+ fi
39
+
40
+ # Write to file for gh release
41
+ echo "$notes" > release_notes.md
42
+
43
+ - name: Create GitHub Release
44
+ run: |
45
+ gh release create "$GITHUB_REF_NAME" \
46
+ --title "$GITHUB_REF_NAME" \
47
+ --notes-file release_notes.md
48
+ env:
49
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
50
+
51
+ publish:
52
+ runs-on: ubuntu-latest
53
+ needs: release
54
+ environment: pypi
55
+ steps:
56
+ - uses: actions/checkout@v4
57
+
58
+ - uses: actions/setup-python@v5
59
+ with:
60
+ python-version: "3.12"
61
+
62
+ - name: Build package
63
+ run: |
64
+ pip install build
65
+ python -m build
66
+
67
+ - name: Publish to PyPI
68
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,37 @@
1
+ # pixi environments
2
+ .pixi/*
3
+ !.pixi/config.toml
4
+
5
+ # Python
6
+ __pycache__/
7
+ *.pyc
8
+ *.pyo
9
+
10
+ # Environment
11
+ .env
12
+
13
+ node_modules/
14
+
15
+ # Docusaurus
16
+ website/build/
17
+ website/.docusaurus/
18
+ website/node_modules/
19
+ data
20
+ !website/static/data/
21
+ logs/
22
+ postgres_data_5433
23
+ package-lock.json
24
+ !website/package-lock.json
25
+ setup-db.sh
26
+ config.yml
27
+ dist
28
+ .cursor
29
+
30
+ # Claude Code
31
+ plans/
32
+
33
+ # Build artifacts
34
+ *.egg-info/
35
+
36
+ # Lock files (pixi manages dependencies)
37
+ uv.lock
@@ -0,0 +1,35 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [0.2.0] - 2026-03-07
11
+
12
+ Initial release as an independent Python project. Rewritten from
13
+ [mine2updater](https://gitlab.com/pdbjapan/mine2updater) (Node.js) by PDBj.
14
+
15
+ ### Added
16
+
17
+ - 7 data pipelines: pdbj, cc, ccmodel, prd, prd_family, vrpt, contacts
18
+ - 2 schema-only definitions: emdb, ihm
19
+ - Dual format support (CIF / mmJSON) for pdbj, cc, ccmodel, prd pipelines
20
+ - Unified parsing via gemmi for both CIF and mmJSON
21
+ - Multi-process parallel loading with ProcessPoolExecutor
22
+ - Bulk load mode (COPY protocol) for initial data loading
23
+ - Mtime-based skip optimization for incremental updates
24
+ - RDKit PostgreSQL cartridge integration for chemical searches
25
+ - SMILES generation from molecular structure via ccd2rdmol
26
+ - SQLAlchemy Core schema definitions with Alembic migrations
27
+ - CLI with 9 commands: sync, update, load, all, setup-rdkit, test, reset, stats, version
28
+ - Pydantic-based configuration with YAML and environment variable support
29
+ - Documentation website with auto-generated schema docs
30
+ - Docker-based test environment (PostgreSQL + RDKit)
31
+ - PyPI publishing support with trusted publishing
32
+ - Environment version tests for Python and PostgreSQL
33
+ - Alternative installation methods (pip, conda+pip)
34
+ - `config.example.yml` with documented options
35
+ - MIT license
@@ -0,0 +1,309 @@
1
+ # pdb-mine-builder
2
+
3
+ A CLI tool that loads PDBj (Protein Data Bank Japan) data into PostgreSQL.
4
+
5
+ ## Tech Stack
6
+
7
+ - **Language**: Python 3.12+
8
+ - **Package Manager**: Pixi (Conda/PyPI hybrid)
9
+ - **Database**: PostgreSQL 17+ (version managed by rdkit-postgresql; accessed via psycopg3)
10
+ - **Schema**: SQLAlchemy Core (DDL only) + Alembic (migrations)
11
+ - **CLI**: Typer + Rich
12
+ - **Config**: Pydantic
13
+ - **Parser**: gemmi (CIF and mmJSON unified)
14
+ - **Chemistry**: RDKit PostgreSQL cartridge + ccd2rdmol
15
+
16
+ ## Quick Start
17
+
18
+ ```bash
19
+ pixi install
20
+ pixi run pmb --help
21
+ pixi run pmb update pdbj --limit 10
22
+ ```
23
+
24
+ ## Project Structure
25
+
26
+ ```
27
+ src/pdbminebuilder/
28
+ ├── __init__.py
29
+ ├── __main__.py # Entry point
30
+ ├── cli.py # Typer CLI commands (9 commands)
31
+ ├── config.py # Pydantic settings
32
+ ├── models/
33
+ │ ├── __init__.py # MetaData registry (ALL_METADATA, get_metadata())
34
+ │ ├── cc.py # cc schema (10 tables)
35
+ │ ├── ccmodel.py # ccmodel schema
36
+ │ ├── contacts.py # contacts schema
37
+ │ ├── emdb.py # emdb schema (schema only, no pipeline)
38
+ │ ├── ihm.py # ihm schema (schema only, no pipeline)
39
+ │ ├── pdbj.py # pdbj schema (~400 tables)
40
+ │ ├── prd.py # prd schema
41
+ │ ├── prd_family.py # prd_family schema
42
+ │ └── vrpt.py # vrpt schema
43
+ ├── db/
44
+ │ ├── connection.py # psycopg3 connection pool
45
+ │ ├── delta.py # Delta computing
46
+ │ ├── loader.py # Parallel loader (ProcessPoolExecutor)
47
+ │ ├── metadata.py # Entry metadata tracking (mtime-based skip)
48
+ │ └── _type_utils.py # SA type to PostgreSQL type converter
49
+ ├── parsers/
50
+ │ ├── cif.py # Unified parser (CIF + mmJSON via gemmi)
51
+ │ └── mmjson.py # Utilities: normalize_column_name(), merge_data()
52
+ └── pipelines/
53
+ ├── base.py # BasePipeline + transform_category()
54
+ ├── pdbj.py # Main PDB data (CIF/mmJSON)
55
+ ├── cc.py # Chemical components
56
+ ├── ccmodel.py # Chemical component models
57
+ ├── prd.py # BIRD data (dual data blocks)
58
+ ├── prd_family.py # BIRD family data (CIF only)
59
+ ├── vrpt.py # Validation reports (CIF)
60
+ └── contacts.py # Contact data (custom JSON)
61
+ alembic/ # Alembic migration config
62
+ ├── env.py # Multi-schema support
63
+ └── versions/ # Migration scripts
64
+ docker/
65
+ ├── docker-compose.test.yml # Test DB (PostgreSQL + RDKit)
66
+ └── init/
67
+ └── 01-extensions.sql # RDKit extension setup
68
+ scripts/ # One-shot utility scripts
69
+ tests/ # Unit + integration tests (pytest)
70
+ docs/ # Architecture docs
71
+ ```
72
+
73
+ ## Pipelines
74
+
75
+ ### Pipeline List
76
+
77
+ | Pipeline | Default Format | Notes |
78
+ |----------|---------------|-------|
79
+ | pdbj | CIF | File-based (~248k files), atom_site skipped |
80
+ | cc | CIF | Single file (components.cif.gz), ~40k blocks |
81
+ | ccmodel | CIF | Single file (chem_comp_model.cif.gz) |
82
+ | prd | CIF | Dual file (prd-all.cif.gz + prdcc-all.cif.gz) |
83
+ | prd_family | CIF | Single file (family-all.cif.gz), CIF only |
84
+ | vrpt | CIF | Uses gemmi.CifWalk for nested directory structure |
85
+ | contacts | JSON | Array format, not mmJSON |
86
+ | emdb | - | Schema only, no pipeline implementation |
87
+ | ihm | - | Schema only, no pipeline implementation |
88
+
89
+ ### Format Selection (Dual-Format Pipelines)
90
+
91
+ Pipelines pdbj, cc, ccmodel, prd support both CIF and mmJSON.
92
+ Format is selected via `format` field in `config.yml`:
93
+
94
+ ```yaml
95
+ pipelines:
96
+ pdbj:
97
+ format: cif # "cif" (default) or "mmjson"
98
+ data: /path/to/data/
99
+ ```
100
+
101
+ ### Backward Compatibility
102
+
103
+ Legacy pipeline names (`pdbj-cif`, `cc-cif`, `pdbj-json`, `cc-json`, etc.)
104
+ are still accepted but deprecated. They emit a warning and resolve to the
105
+ base pipeline name. Note: `-json` aliases resolve to the base name but do NOT
106
+ change the `format` config — users must set `format: mmjson` in config.yml.
107
+
108
+ ### Mtime-Based Skip Optimization
109
+
110
+ The `entry_metadata` table tracks file modification times. During incremental
111
+ updates, unchanged entries are automatically skipped. Use `--force` flag to
112
+ bypass mtime checks and reprocess all entries.
113
+
114
+ ## CLI Commands
115
+
116
+ ```bash
117
+ pixi run pmb sync <target> # Sync data via rsync
118
+ pixi run pmb update <pipeline> # Incremental update (--limit, --workers, --force)
119
+ pixi run pmb load <pipeline> # Bulk load via COPY protocol (initial load)
120
+ pixi run pmb all # Full sync + update cycle
121
+ pixi run pmb setup-rdkit # Setup RDKit extension
122
+ pixi run pmb test [pipeline...] # Run test pipelines against test DB
123
+ pixi run pmb reset <schema|all> # Drop and reset schemas
124
+ pixi run pmb stats # Show database statistics
125
+ pixi run pmb --version # Show version
126
+ ```
127
+
128
+ ## Key Patterns
129
+
130
+ ### Unified Parsing (gemmi)
131
+ Both CIF and mmJSON are parsed via gemmi, returning row-oriented dicts:
132
+ ```python
133
+ from pdbminebuilder.parsers.cif import parse_cif_file, parse_mmjson_file
134
+
135
+ # CIF files (supports .cif.gz)
136
+ data = parse_cif_file(filepath)
137
+
138
+ # mmJSON files (supports .json.gz)
139
+ data = parse_mmjson_file(filepath)
140
+
141
+ # Both return: {"category": [{"col": "val", ...}, ...], "_block_name": "..."}
142
+ ```
143
+
144
+ ### Column Name Normalization
145
+ mmJSON uses `column[1][2]` → schema uses `column12`
146
+ ```python
147
+ from pdbminebuilder.parsers.mmjson import normalize_column_name
148
+ ```
149
+
150
+ ### Schema Access (SQLAlchemy Core)
151
+ Schema definitions use SQLAlchemy Core Table objects with metadata:
152
+ ```python
153
+ from pdbminebuilder.models import get_metadata, ALL_METADATA
154
+ from pdbminebuilder.db.loader import get_table, get_all_tables, get_entry_pk
155
+
156
+ meta = get_metadata("cc") # MetaData with schema="cc"
157
+ table = get_table(meta, "brief_summary") # SA Table object
158
+ pk = get_entry_pk(meta) # "comp_id"
159
+ tables = get_all_tables(meta) # All tables in schema
160
+
161
+ # Schema/table config stored in .info dicts
162
+ meta.info["entry_pk"] # Schema-level primary key
163
+ table.info["keywords"] # Table-level keywords list
164
+ ```
165
+
166
+ ### Category Transformation
167
+ ```python
168
+ from pdbminebuilder.pipelines.base import transform_category
169
+
170
+ # mmJSON: needs normalization
171
+ rows = transform_category(rows, table, pk_value, pk_col, normalize_column_name)
172
+
173
+ # CIF: no normalization needed (pass None)
174
+ rows = transform_category(rows, table, pk_value, pk_col, None)
175
+ ```
176
+
177
+ ### Chemical SMILES Generation (cc pipeline)
178
+ Both CIF and mmJSON cc pipelines generate canonical SMILES using ccd2rdmol + RDKit.
179
+ This ensures consistent SMILES quality regardless of input format:
180
+ ```python
181
+ from ccd2rdmol import read_ccd_block
182
+
183
+ block = gemmi.cif.read(cif_path)[0] # CIF
184
+ block = gemmi.cif.read_mmjson(json_path)[0] # mmJSON
185
+
186
+ result = read_ccd_block(block, sanitize_mol=True, add_conformers=False)
187
+ smiles = Chem.MolToSmiles(result.mol, canonical=True)
188
+ ```
189
+
190
+ Note: The SMILES in `pdbx_chem_comp_descriptor` is NOT used. SMILES is always
191
+ generated from the molecular structure for consistency and quality.
192
+
193
+ ### RDKit PostgreSQL Cartridge
194
+ Chemical searches use RDKit extension (auto-configured on `cc` pipeline run):
195
+ ```sql
196
+ -- Substructure search
197
+ SELECT * FROM cc.brief_summary WHERE mol @> 'c1ccccc1'::mol;
198
+
199
+ -- Similarity search (Tanimoto)
200
+ SELECT *, tanimoto_sml(morganbv_fp(mol), morganbv_fp('CCO'::mol))
201
+ FROM cc.brief_summary WHERE morganbv_fp(mol) % morganbv_fp('CCO'::mol);
202
+ ```
203
+
204
+ ### Parallel Processing
205
+ - Workers create their own DB connections (they do not use the global connection pool)
206
+ - `ProcessPoolExecutor` with configurable worker count
207
+
208
+ ## Development
209
+
210
+ ```bash
211
+ pixi run lint # ruff check
212
+ pixi run format # ruff format
213
+ pixi run test # pytest (unit tests)
214
+ pixi run check # all checks (lint, format)
215
+ ```
216
+
217
+ ### Testing
218
+
219
+ **IMPORTANT**: Always start the Docker test DB before running the tests.
220
+
221
+ ```bash
222
+ # 1. Start the test DB (PostgreSQL + RDKit)
223
+ pixi run test-db-up
224
+
225
+ # 2. Check the test DB status
226
+ pixi run test-db-status
227
+
228
+ # 3. Run the tests
229
+ pixi run test # All tests
230
+ pixi run test-unit # Unit tests only
231
+ pixi run test-integration # Integration tests (requires test DB)
232
+
233
+ # 4. Stop the test DB
234
+ pixi run test-db-down
235
+ ```
236
+
237
+ Test DB details:
238
+ - Image: `mcs07/postgres-rdkit:latest` (PostgreSQL 17 + RDKit)
239
+ - Container: `pmb-postgres-test`
240
+ - Port: `15433`
241
+ - Database: `pmb_test`
242
+ - Config: `config.test.yml`
243
+
244
+ ### Database Migrations (Alembic)
245
+
246
+ ```bash
247
+ pixi run db-migrate "description" # Generate migration
248
+ pixi run db-upgrade # Apply all pending migrations
249
+ pixi run db-downgrade # Rollback last migration
250
+ pixi run db-history # Show migration history
251
+ ```
252
+
253
+ Alembic is configured for multi-schema support (all 9 schemas).
254
+ Schema DDL is defined in `src/pdbminebuilder/models/` as SQLAlchemy Core Table objects.
255
+ Data operations still use psycopg3 direct connections (no SQLAlchemy Engine for data).
256
+
257
+ ## Database
258
+
259
+ ```bash
260
+ pixi run db-start # Start PostgreSQL
261
+ pixi run db-stop # Stop PostgreSQL
262
+ pixi run db-status # Show PostgreSQL status
263
+ ```
264
+
265
+ Connection: `rdb.constring` in `config.yml`
266
+
267
+ ### Bulk Load Mode
268
+
269
+ For initial data loading, use bulk load mode to significantly improve performance:
270
+
271
+ ```bash
272
+ # 1. Start PostgreSQL
273
+ pixi run db-start
274
+
275
+ # 2. Enable bulk load mode (disables fsync, autovacuum)
276
+ pixi run db-bulkload-mode
277
+
278
+ # 3. Run data loading
279
+ pixi run pmb load cc
280
+ pixi run pmb load pdbj
281
+ # ... other pipelines
282
+
283
+ # 4. Restore safe settings
284
+ pixi run db-safe-mode
285
+
286
+ # 5. Run VACUUM ANALYZE
287
+ psql -c "VACUUM ANALYZE;"
288
+ ```
289
+
290
+ **WARNING**: Bulk load mode disables crash safety. If PostgreSQL crashes during bulk load, discard the data directory, re-initialize it, and re-run the data loading from scratch:
291
+ ```bash
292
+ pixi run db-stop
293
+ rm -rf $PGDATA
294
+ pixi run db-init
295
+ # Re-run data loading from scratch
296
+ ```
297
+
298
+ ## Configuration
299
+
300
+ - `config.yml` - Production config (customize locally)
301
+ - `config.test.yml` - Test config (uses fixture data with `${CWD}` expansion)
302
+ - `.env` - Environment variables (gitignored)
303
+ - `.env.example` - Template
304
+
305
+ ## Known Issues
306
+
307
+ - Global connection pool is for main process only; workers use direct connections
308
+ - Workers receive `schema_name: str` and import models inside worker function (avoids pickling SA objects)
309
+ - `emdb` and `ihm` have schema definitions but no pipeline implementations yet
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 N283T
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.