chem_mat_database 1.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. chem_mat_database-1.6.0/.claude/commands/changelog.md +20 -0
  2. chem_mat_database-1.6.0/.claude/commands/debug.md +28 -0
  3. chem_mat_database-1.6.0/.claude/commands/documentation.md +20 -0
  4. chem_mat_database-1.6.0/.claude/commands/feature.md +22 -0
  5. chem_mat_database-1.6.0/.claude/settings.local.json +20 -0
  6. chem_mat_database-1.6.0/.github/workflows/docs.yml +31 -0
  7. chem_mat_database-1.6.0/.gitignore +48 -0
  8. chem_mat_database-1.6.0/.mcp.json +18 -0
  9. chem_mat_database-1.6.0/.python-version +1 -0
  10. chem_mat_database-1.6.0/.zenodo.json +39 -0
  11. chem_mat_database-1.6.0/AGENTS.md +118 -0
  12. chem_mat_database-1.6.0/CHANGELOG.md +189 -0
  13. chem_mat_database-1.6.0/CLAUDE.md +134 -0
  14. chem_mat_database-1.6.0/DEVELOP.rst +204 -0
  15. chem_mat_database-1.6.0/INTERNAL.rst +411 -0
  16. chem_mat_database-1.6.0/LICENSE +21 -0
  17. chem_mat_database-1.6.0/MANAGE.rst +83 -0
  18. chem_mat_database-1.6.0/PKG-INFO +272 -0
  19. chem_mat_database-1.6.0/README.rst +213 -0
  20. chem_mat_database-1.6.0/_test_/README_QM9.md +51 -0
  21. chem_mat_database-1.6.0/_test_/opencode.json +24 -0
  22. chem_mat_database-1.6.0/_test_/parse_qm9_to_csv.py +104 -0
  23. chem_mat_database-1.6.0/_test_/prompt.md +6 -0
  24. chem_mat_database-1.6.0/chem_mat_data/.env +2 -0
  25. chem_mat_database-1.6.0/chem_mat_data/ChemMatData_logo_final.png +0 -0
  26. chem_mat_database-1.6.0/chem_mat_data/VERSION +1 -0
  27. chem_mat_database-1.6.0/chem_mat_data/__init__.py +18 -0
  28. chem_mat_database-1.6.0/chem_mat_data/_typing.py +15 -0
  29. chem_mat_database-1.6.0/chem_mat_data/agent/README.md +3 -0
  30. chem_mat_database-1.6.0/chem_mat_data/cache.py +290 -0
  31. chem_mat_database-1.6.0/chem_mat_data/cli.py +1725 -0
  32. chem_mat_database-1.6.0/chem_mat_data/config.py +154 -0
  33. chem_mat_database-1.6.0/chem_mat_data/connectors.py +315 -0
  34. chem_mat_database-1.6.0/chem_mat_data/data.py +1006 -0
  35. chem_mat_database-1.6.0/chem_mat_data/dataset.py +1324 -0
  36. chem_mat_database-1.6.0/chem_mat_data/examples/00_quickstart.ipynb +344 -0
  37. chem_mat_database-1.6.0/chem_mat_data/examples/01_gnn_demo.ipynb +3632 -0
  38. chem_mat_database-1.6.0/chem_mat_data/graph.py +151 -0
  39. chem_mat_database-1.6.0/chem_mat_data/main.py +544 -0
  40. chem_mat_database-1.6.0/chem_mat_data/manage.py +835 -0
  41. chem_mat_database-1.6.0/chem_mat_data/metadata.yml +1547 -0
  42. chem_mat_database-1.6.0/chem_mat_data/processing.py +1174 -0
  43. chem_mat_database-1.6.0/chem_mat_data/scripts/README.rst +0 -0
  44. chem_mat_database-1.6.0/chem_mat_data/scripts/assets/_test2.xyz_bundle/0.xyz +18 -0
  45. chem_mat_database-1.6.0/chem_mat_data/scripts/assets/_test2.xyz_bundle/1.xyz +18 -0
  46. chem_mat_database-1.6.0/chem_mat_data/scripts/assets/_test2.xyz_bundle/2.xyz +18 -0
  47. chem_mat_database-1.6.0/chem_mat_data/scripts/assets/test.xyz +18 -0
  48. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets.py +483 -0
  49. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__ames.py +59 -0
  50. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__aqsoldb.py +60 -0
  51. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__bace_cls.py +125 -0
  52. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__bace_reg.py +127 -0
  53. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__bbbp.py +208 -0
  54. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__beet.py +68 -0
  55. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__bl_chembl_cls.py +413 -0
  56. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__bl_chembl_reg.py +414 -0
  57. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__clintox.py +102 -0
  58. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__compas_1x.py +130 -0
  59. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__compas_2x.py +143 -0
  60. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__compas_3x.py +138 -0
  61. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__dpp4.py +70 -0
  62. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__dud_e.py +625 -0
  63. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__elanos_bp.py +113 -0
  64. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__elanos_vp.py +113 -0
  65. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__esol.py +62 -0
  66. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__freesolv.py +86 -0
  67. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__from_tu.py +198 -0
  68. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__from_tu__MCF_7.py +100 -0
  69. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__from_tu__MOLT_4.py +118 -0
  70. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__from_tu__OVCAR_8.py +119 -0
  71. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__from_tu__PC_3.py +99 -0
  72. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__from_xyz.py +142 -0
  73. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__from_xyz__qm9.py +43 -0
  74. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__half_life.py +99 -0
  75. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__hiv.py +61 -0
  76. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__hopv15_exp.py +136 -0
  77. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__lipophilicity.py +89 -0
  78. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__muv.py +535 -0
  79. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__muv_.py +62 -0
  80. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__open_melting_point.py +102 -0
  81. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__photo_oliogomers.py +88 -0
  82. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__price_small.py +73 -0
  83. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__qm9_smiles.py +127 -0
  84. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__riniker_1.py +745 -0
  85. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__riniker_1_filtered.py +736 -0
  86. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__riniker_2.py +541 -0
  87. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__sider.py +87 -0
  88. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__skin_irritation.py +143 -0
  89. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__skin_sensitizers.py +150 -0
  90. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__synth_binary_global.py +124 -0
  91. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__synth_binary_local.py +139 -0
  92. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__tadf.py +131 -0
  93. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__tox21.py +75 -0
  94. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__toxcast.py +678 -0
  95. chem_mat_database-1.6.0/chem_mat_data/scripts/create_graph_datasets__zinc250k.py +136 -0
  96. chem_mat_database-1.6.0/chem_mat_data/scripts/test_xyz.ipynb +159 -0
  97. chem_mat_database-1.6.0/chem_mat_data/templates/config.toml.j2 +12 -0
  98. chem_mat_database-1.6.0/chem_mat_data/templates/docs_dataset.md.j2 +11 -0
  99. chem_mat_database-1.6.0/chem_mat_data/templates/logo_image.txt +6 -0
  100. chem_mat_database-1.6.0/chem_mat_data/templates/logo_text.txt +6 -0
  101. chem_mat_database-1.6.0/chem_mat_data/testing.py +237 -0
  102. chem_mat_database-1.6.0/chem_mat_data/utils.py +266 -0
  103. chem_mat_database-1.6.0/chem_mat_data/visualization.py +239 -0
  104. chem_mat_database-1.6.0/chem_mat_data/web.py +538 -0
  105. chem_mat_database-1.6.0/docs/api_datasets.md +76 -0
  106. chem_mat_database-1.6.0/docs/api_streaming_datasets.md +309 -0
  107. chem_mat_database-1.6.0/docs/architecture_decisions/001_providing_processed_datasets.md +54 -0
  108. chem_mat_database-1.6.0/docs/architecture_decisions/002_custom_graph_represention.md +56 -0
  109. chem_mat_database-1.6.0/docs/architecture_decisions/003_local_dataset_cache.md +45 -0
  110. chem_mat_database-1.6.0/docs/architecture_decisions/004_streaming_datasets.md +112 -0
  111. chem_mat_database-1.6.0/docs/architecture_decisions/005_automated_processing_agent.md +31 -0
  112. chem_mat_database-1.6.0/docs/assets/banner.png +0 -0
  113. chem_mat_database-1.6.0/docs/assets/cli_info.png +0 -0
  114. chem_mat_database-1.6.0/docs/assets/cli_list.png +0 -0
  115. chem_mat_database-1.6.0/docs/assets/cli_list_reduced.png +0 -0
  116. chem_mat_database-1.6.0/docs/assets/logo.png +0 -0
  117. chem_mat_database-1.6.0/docs/assets/logo.svg +108 -0
  118. chem_mat_database-1.6.0/docs/cli_cache.md +61 -0
  119. chem_mat_database-1.6.0/docs/cli_config.md +22 -0
  120. chem_mat_database-1.6.0/docs/cli_download.md +33 -0
  121. chem_mat_database-1.6.0/docs/cli_info.md +14 -0
  122. chem_mat_database-1.6.0/docs/cli_list.md +13 -0
  123. chem_mat_database-1.6.0/docs/cli_quickstart.md +0 -0
  124. chem_mat_database-1.6.0/docs/custom_pre_processing.md +129 -0
  125. chem_mat_database-1.6.0/docs/datasets.md +42 -0
  126. chem_mat_database-1.6.0/docs/first_steps.md +81 -0
  127. chem_mat_database-1.6.0/docs/graph_representation.md +115 -0
  128. chem_mat_database-1.6.0/docs/image.png +0 -0
  129. chem_mat_database-1.6.0/docs/index.md +24 -0
  130. chem_mat_database-1.6.0/docs/installation.md +30 -0
  131. chem_mat_database-1.6.0/docs/javascripts/mathjax.js +19 -0
  132. chem_mat_database-1.6.0/docs/processing_graphs.md +25 -0
  133. chem_mat_database-1.6.0/graph_format.rst +34 -0
  134. chem_mat_database-1.6.0/mkdocs.yml +42 -0
  135. chem_mat_database-1.6.0/noxfile.py +97 -0
  136. chem_mat_database-1.6.0/pyproject.toml +147 -0
  137. chem_mat_database-1.6.0/ruff.toml +62 -0
  138. chem_mat_database-1.6.0/scripts/README.rst +5 -0
  139. chem_mat_database-1.6.0/tests/__init__.py +0 -0
  140. chem_mat_database-1.6.0/tests/_temp.py +20 -0
  141. chem_mat_database-1.6.0/tests/assets/README.rst +4 -0
  142. chem_mat_database-1.6.0/tests/assets/TU_MUTAG/MUTAG_A.txt +7442 -0
  143. chem_mat_database-1.6.0/tests/assets/TU_MUTAG/MUTAG_edge_labels.txt +7442 -0
  144. chem_mat_database-1.6.0/tests/assets/TU_MUTAG/MUTAG_graph_indicator.txt +3371 -0
  145. chem_mat_database-1.6.0/tests/assets/TU_MUTAG/MUTAG_graph_labels.txt +188 -0
  146. chem_mat_database-1.6.0/tests/assets/TU_MUTAG/MUTAG_node_labels.txt +3371 -0
  147. chem_mat_database-1.6.0/tests/assets/TU_MUTAG/README.txt +85 -0
  148. chem_mat_database-1.6.0/tests/assets/_test.xyz +19 -0
  149. chem_mat_database-1.6.0/tests/assets/qm9.xyz +22 -0
  150. chem_mat_database-1.6.0/tests/assets/test.txt +1 -0
  151. chem_mat_database-1.6.0/tests/test_cache.py +107 -0
  152. chem_mat_database-1.6.0/tests/test_cli.py +59 -0
  153. chem_mat_database-1.6.0/tests/test_config.py +80 -0
  154. chem_mat_database-1.6.0/tests/test_connectors.py +304 -0
  155. chem_mat_database-1.6.0/tests/test_data.py +618 -0
  156. chem_mat_database-1.6.0/tests/test_dataset.py +1242 -0
  157. chem_mat_database-1.6.0/tests/test_dataset_benchmark.py +629 -0
  158. chem_mat_database-1.6.0/tests/test_docs.py +161 -0
  159. chem_mat_database-1.6.0/tests/test_hopv15_figshare.py +229 -0
  160. chem_mat_database-1.6.0/tests/test_init.py +12 -0
  161. chem_mat_database-1.6.0/tests/test_main.py +67 -0
  162. chem_mat_database-1.6.0/tests/test_main_bridges.py +262 -0
  163. chem_mat_database-1.6.0/tests/test_processing.py +301 -0
  164. chem_mat_database-1.6.0/tests/test_testing.py +92 -0
  165. chem_mat_database-1.6.0/tests/test_util.py +16 -0
  166. chem_mat_database-1.6.0/tests/test_web.py +110 -0
  167. chem_mat_database-1.6.0/tests/test_xyz_dataset.py +443 -0
  168. chem_mat_database-1.6.0/tests/utils.py +11 -0
  169. chem_mat_database-1.6.0/tox.toml +65 -0
@@ -0,0 +1,20 @@
1
+ ---
2
+ description: Write a changelog entry
3
+ ---
4
+
5
+ You are a documentation writing expert specializing in clear and concise technical writing.
6
+
7
+ ## Context
8
+
9
+ - The code implementation of the feature(s) to be documented
10
+ - The current version of the Changelog located in the `CHANGELOG.md` file
11
+
12
+ ## Your task
13
+
14
+ Your task is to create a new entry in the `CHANGELOG.md` file documenting the latest changes made to the project as requested by the user below.
15
+
16
+ You will add the changes to the end of the file, appending it to the last version that is listed there - You will NOT create a new version entry.
17
+
18
+ Key practices:
19
+
20
+ - Entries are to be clear, concise and informative. They should provide enough context for users to understand the changes without being overly verbose.
@@ -0,0 +1,28 @@
1
+ ---
2
+ description: Debug a feature
3
+ ---
4
+
5
+ You are an expert at debugging specializing in root cause analysis.
6
+
7
+ ## Your task
8
+
9
+ Execute the requested piece of code and debug any issues that arise by following the steps below:
10
+
11
+ 1. Execute the code in question.
12
+ 2. Carefully read any error messages and/or logs that are produced.
13
+ 3. Isolate the root cause of the issue by examining the code and error messages.
14
+ 4. Implement a minimal fix to resolve the issue.
15
+ 5. Verify the solution works by re-executing the code or writing specific test cases if they do not yet exist.
16
+
17
+ Debugging process:
18
+ - Form and Test hypotheses
19
+ - Add strategic debug logging if necessary
20
+ - Inspect variable states
21
+ - Review relevant documentation
22
+ - Focus on fixing the underlying issues, not just the symptoms
23
+
24
+ At the end of the process, provide the user with:
25
+ - Root cause explanation
26
+ - Evidence supporting your findings
27
+ - Specific code fix
28
+ - Prevention recommendations
@@ -0,0 +1,20 @@
1
+ ---
2
+ description: Document a feature
3
+ ---
4
+
5
+ You are a documentation writing expert specializing in clear and concise technical writing.
6
+
7
+ ## Context
8
+
9
+ - The code implementation of the feature(s) to be documented
10
+ - The existing documentation style and structure in the `docs/` folder
11
+
12
+ ## Your task
13
+
14
+ Create one or more new entries in the mkdocs documentation located in the `docs/` folder and update the `mkdocs.yml` file to include the new documentation.
15
+
16
+ Key practices:
17
+ - Carefully read multiple existing documentation files to match the style and tone of the existing project documentation. This specifically relates to the consistent usage of terminology, formatting, emojis and the level of detail in which features are explained.
18
+ - Ensure the new documentation is clear, concise and easy to understand for users of varying expertise levels.
19
+ - Use markdown formatting for headings, code blocks, lists, links and other elements as appropriate.
20
+ - Include code examples and usage instructions where relevant.
@@ -0,0 +1,22 @@
1
+ ---
2
+ description: Implement a feature
3
+ ---
4
+
5
+ You are an experienced senior software engineer tasked to implement a new feature.
6
+
7
+ ## Your task
8
+
9
+ Implement the requested feature according to the explanation provided by the user below.
10
+
11
+ While implementing follow these key practices:
12
+ - Carefully read some of the existing modules to match the style of the existing project code.
13
+ - Ensure the new code is clear, concise and easy to understand. This particularly implies the usage of code comments and docstrings where appropriate.
14
+ - After implementing the feature, add unittests to the `tests/` folder to verify the new functionality works as intended and execute those tests to ensure they pass, as well as any tests related to the feature to verify nothing else is broken. Please look up where the tests would best fit in the existing test structure.
15
+ - Follow the existing project structure and conventions for file organization, naming, and coding style.
16
+
17
+ After completing the implementation, provide the user with:
18
+ - A brief summary of the changes made
19
+ - The updated or new code files
20
+ - The new or updated tests
21
+ - Potential pitfalls or edge cases to be aware of
22
+ - Suggestions for future improvements or extensions to the feature
@@ -0,0 +1,20 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(find:*)",
5
+ "Bash(source:*)",
6
+ "Bash(cmmanage --help:*)",
7
+ "Bash(cmmanage metadata:*)",
8
+ "Bash(cmdata:*)",
9
+ "Bash(pytest:*)",
10
+ "Bash(cat:*)",
11
+ "Bash(python:*)",
12
+ "Bash(curl:*)",
13
+ "WebSearch",
14
+ "Bash(echo:*)",
15
+ "Bash(uv build:*)"
16
+ ],
17
+ "deny": [],
18
+ "ask": []
19
+ }
20
+ }
@@ -0,0 +1,31 @@
1
+ name: docs
2
+ on:
3
+ push:
4
+ branches:
5
+ - master
6
+ - main
7
+
8
+ permissions:
9
+ contents: write
10
+
11
+ jobs:
12
+ deploy:
13
+ runs-on: ubuntu-latest
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+ - name: Configure Git Credentials
17
+ run: |
18
+ git config user.name github-actions[bot]
19
+ git config user.email 41898282+github-actions[bot]@users.noreply.github.com
20
+ - uses: actions/setup-python@v5
21
+ with:
22
+ python-version: "3.10"
23
+ - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
24
+ - uses: actions/cache@v4
25
+ with:
26
+ key: mkdocs-material-${{ env.cache_id }}
27
+ path: .cache
28
+ restore-keys: |
29
+ mkdocs-material-
30
+ - run: pip install mkdocs-material
31
+ - run: mkdocs gh-deploy --force
@@ -0,0 +1,48 @@
1
+ venv/*
2
+ venv/
3
+ .vscode/
4
+ tests/artifacts/
5
+
6
+ # Byte-compiled / optimized / DLL files
7
+ */__pycache__/*
8
+ *.py[cod]
9
+ *$py.class
10
+
11
+ # C extensions
12
+ *.so
13
+
14
+ # Distribution / packaging
15
+ .Python
16
+ .tox
17
+ build/
18
+ develop-eggs/
19
+ dist/
20
+ downloads/
21
+ eggs/
22
+ .eggs/
23
+ lib/
24
+ lib64/
25
+ parts/
26
+ sdist/
27
+ var/
28
+ wheels/
29
+ share/python-wheels/
30
+ *.egg-info/
31
+ .installed.cfg
32
+ *.egg
33
+ MANIFEST
34
+
35
+ # Custom
36
+ chem_mat_data/scripts/results/*
37
+ site/*
38
+ adrs/*
39
+
40
+ *.gz
41
+ *.csv
42
+ *.mpack
43
+ *.zip
44
+ *.data
45
+ *.backup
46
+
47
+ **/results/*
48
+ **/lightning_logs/*
@@ -0,0 +1,18 @@
1
+ {
2
+ "mcpServers": {
3
+ "web-search": {
4
+ "command": "node",
5
+ "args": [
6
+ "/home/jonas/Private/web-search-mcp/dist/index.js"
7
+ ],
8
+ "env": {}
9
+ },
10
+ "file-download": {
11
+ "command": "node",
12
+ "args": [
13
+ "/home/jonas/Private/mcp-file-downloader/download-server.js"
14
+ ],
15
+ "env": {}
16
+ }
17
+ }
18
+ }
@@ -0,0 +1 @@
1
+ 3.10
@@ -0,0 +1,39 @@
1
+ {
2
+ "title": "ChemMatData: Unified Chemistry and Material Science Datasets for Graph Neural Networks",
3
+ "upload_type": "software",
4
+ "access_right": "open",
5
+ "license": "MIT",
6
+ "language": "eng",
7
+ "creators": [
8
+ {
9
+ "name": "Teufel, Jonas",
10
+ "orcid": "0000-0002-9228-9395",
11
+ "affiliation": "Karlsruhe Institute of Technology"
12
+ },
13
+ {
14
+ "name": "Zeller, Jana",
15
+ "affiliation": "ETH Zurich / Max Planck Institute for Intelligent Systems"
16
+ },
17
+ {
18
+ "name": "Singh, Mohit",
19
+ "affiliation": "Karlsruhe Institute of Technology"
20
+ }
21
+ ],
22
+ "keywords": [
23
+ "chemistry",
24
+ "materials science",
25
+ "dataset",
26
+ "machine learning",
27
+ "graph neural network",
28
+ "molecular property prediction",
29
+ "Python"
30
+ ],
31
+ "description": "A Python package providing unified access to chemistry and material science datasets for machine learning applications, specifically designed for training graph neural networks (GNNs). Provides simple CLI and API interfaces to download datasets in raw (CSV/pandas) or processed (graph) format.",
32
+ "related_identifiers": [
33
+ {
34
+ "identifier": "https://github.com/the16thpythonist/chem_mat_data",
35
+ "relation": "isSupplementTo",
36
+ "scheme": "url"
37
+ }
38
+ ]
39
+ }
@@ -0,0 +1,118 @@
1
+ # Guide for AI Agents
2
+
3
+ ## Overview
4
+
5
+ This project implements the *Chemistry and Materials Database* (ChemMatData) which is supposed to be a database
6
+ containing various datasets related to chemistry and materials science in a format which is accessible for
7
+ the training of neural network models and specifically graph neural networks (GNNs). The project is designed to
8
+ have an intuitive python interface for easy downloading and processing of the datasets as well as a
9
+ command line interface (CLI) to interact with the remote database.
10
+
11
+ ## Project Structure
12
+
13
+ - `/chem_mat_data`: Python source files
14
+ - `/chem_mat_data/examples`: Example scripts which demonstrate how to use the project
15
+ - `/chem_mat_data/scripts`: Pycomex experiment files which implement the conversion of the various datasets in their
16
+ native data formats into the common graph format used by the project.
17
+ - `/chem_mat_data/templates`: Jinja2 templates
18
+ - `/tests`: Pytest unit tests which are named "test_" plus the name of the source python file
19
+ - `/tests/assets`: Additional files etc. which are needed by some of the unittests
20
+ - `/tests/artifacts`: Temp folder in which the tests save their results
21
+
22
+ ## Documentation
23
+
24
+ Type hints should be used wherever possible.
25
+ Every function/method should be properly documented by a Docstring using the **ReStructuredText** documentation style.
26
+ The doc strings should start with a brief summary of the function, followed by the parameters as illustrated by this example:
27
+
28
+ ```python
29
+ def multiply(a: float, b: float) -> float:
30
+ """
31
+ Returns the product of ``a`` and ``b``.
32
+
33
+ :param a: first float input value
34
+ :param b: second float input value
35
+
36
+ :returns: The float result
37
+ """
38
+ return a * b
39
+ ```
40
+
41
+ ## Computational Experiments
42
+
43
+ This project uses the [pycomex](https://github.com/the16thpythonist/pycomex) micro-framework for implementing and executing computational experiments.
44
+ A computational experiment can be defined according to the following example:
45
+
46
+ ```python
47
+ from pycomex.functional.experiment import Experiment
48
+ from pycomex.utils import folder_path, file_namespace
49
+
50
+ # :param PARAM1:
51
+ # Description for the parameter...
52
+ PARAM1: int = 100
53
+
54
+ __DEBUG__ = True # enables/disables debug mode
55
+
56
+ experiment = Experiment(
57
+ base_path=folder_path(__file__),
58
+ namespace=file_namespace(__file__),
59
+ glob=globals()
60
+ )
61
+
62
+ @experiment.hook('util_function')
63
+ def util_function(e: Experiment, param: int):
64
+ return param ** 2
65
+
66
+ @experiment
67
+ def experiment(e: Experiment):
68
+
69
+ # automatically pushes to log file as well as to the stdout
70
+ # parameters are accessed as attributes of the Experiment object
71
+ e.log(f'this is a parameter value: {e.PARAM1}')
72
+
73
+ # Store values into the experiment automatically
74
+ e['value'] = value
75
+
76
+ # The e.path contains the path to the artifacts folder of the
77
+ # current experiment run.
78
+ e.log(f'artifacts path: {e.path}')
79
+
80
+ # Easily store figures to the artifacts folder
81
+ fig: plt.Figure = ...
82
+ e.commit_fig(fig, 'figure1.png')
83
+
84
+ # using hooks instead of plain functions
85
+ result = e.apply_hook(
86
+ 'util_function',
87
+ param=20,
88
+ )
89
+
90
+
91
+ experiment.run_if_main()
92
+ ```
93
+
94
+ ## Code Convention
95
+
96
+ 1. functions with many parameters should be split like this:
97
+
98
+ ```python
99
+ def function(arg1: List[int],
100
+ arg2: List[float],
101
+ **kwargs
102
+ ) -> float:
103
+ # ...
104
+
105
+ ```
106
+
107
+ ## Testing
108
+
109
+ Unittests use `pytest` in the `/tests` folder with this command
110
+
111
+ ```bash
112
+ pytest -q -m "not localonly"
113
+ ```
114
+
115
+ ## Pull Requests / Contributing
116
+
117
+ Pull Requests should always start with a small summary of the changes and a list of the changed files.
118
+ Additionally, a PR should contain a small summary of the test results.
@@ -0,0 +1,189 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/).
7
+
8
+ ## [1.6.0] - 2026-03-26
9
+
10
+ ### Added
11
+
12
+ - `.zenodo.json` for Zenodo-GitHub integration to enable DOI generation for releases.
13
+ - `CHANGELOG.md` replacing the previous `HISTORY.rst` file, now following the
14
+ [Keep a Changelog](https://keepachangelog.com/) convention.
15
+ - Dataset processing script for the DUD-E (Directory of Useful Decoys - Enhanced) multi-target
16
+ virtual screening benchmark with 102 protein targets and tri-state labels.
17
+ - Additional MUV helper script (`create_graph_datasets__muv_.py`).
18
+
19
+ ### Changed
20
+
21
+ - Dropped Python 3.8 support; minimum supported version is now Python 3.9.
22
+ - Substantially reworked the MUV dataset processing script to support multi-target classification
23
+ with tri-state labels (active/decoy/no data) across 17 biological targets.
24
+ - Rewrote the BBBP dataset processing script with proper documentation, metadata, and updated
25
+ data source handling.
26
+ - Updated DUD-E compound count in `metadata.yml` (1,111,394 to 400,040).
27
+ - Added missing source reference for the BBBP dataset in `metadata.yml`.
28
+
29
+ ### Removed
30
+
31
+ - `HISTORY.rst` (replaced by `CHANGELOG.md`).
32
+
33
+ ## [1.5.0] - 2025-11-27
34
+
35
+ ### Added
36
+
37
+ - New `cmdata stats` command which downloads a given dataset to compute common
38
+ statistics such as the distribution of elements, graph sizes, and the most common motifs.
39
+
40
+ ## [1.4.1] - 2025-11-10
41
+
42
+ ### Changed
43
+
44
+ - Changed the way in which the `--help` string is being printed to be more informative.
45
+
46
+ ### Fixed
47
+
48
+ - Changed the way in which the Nextcloud remote accesses the public endpoint which has changed for
49
+ the transition to NextcloudHub 10.0. This had been a breaking change which prevented accessing the
50
+ remote file share location at all.
51
+
52
+ ## [1.4.0] - 2025-10-06
53
+
54
+ ### Added
55
+
56
+ - Implemented **StreamingDataset** architecture for memory-efficient access to large molecular datasets:
57
+ - `SmilesDataset`: Lazy-loading of SMILES strings from CSV files with minimal memory footprint.
58
+ - `XyzDataset`: Lazy-loading of 3D molecular structures from XYZ file bundles, supporting multiple
59
+ format parsers (default, qm9, hopv15).
60
+ - `GraphDataset`: On-the-fly conversion from raw molecular representations (SMILES or XYZ) to graph dicts.
61
+ - Automatic detection of dataset format (SMILES vs XYZ) for transparent handling.
62
+ - Sequential mode (`num_workers=0`) for optimal performance with typical molecules (~2000 mol/s).
63
+ - Parallel mode (`num_workers>0`) with multi-process architecture for complex molecules or custom processing.
64
+ - Deadlock-free producer-collector-worker design that maintains dataset order while enabling true CPU parallelism.
65
+ - `ShuffleDataset`: Approximate shuffling using fixed-size buffer for training with shuffled data
66
+ while maintaining low memory usage.
67
+ - Comprehensive streaming datasets documentation (`docs/api_streaming_datasets.md`) covering:
68
+ - Motivation and use cases for streaming vs pre-processed datasets.
69
+ - Detailed usage examples for all streaming dataset classes.
70
+ - Performance considerations and when to use sequential vs parallel processing.
71
+ - Integration with deep learning frameworks and training workflows.
72
+ - Guidance on choosing between SMILES and XYZ formats.
73
+ - Architecture Decision Record (`docs/architecture_decisions/004_streaming_datasets.md`) documenting:
74
+ - Design rationale for streaming architecture.
75
+ - Detailed explanation of parallel processing implementation and deadlock prevention.
76
+ - Trade-offs between streaming and pre-processed datasets.
77
+ - Auto-detection mechanism for SMILES vs XYZ datasets.
78
+ - Comprehensive unit tests for streaming datasets:
79
+ - `tests/test_dataset.py`: Core functionality tests for SmilesDataset, XyzDataset, GraphDataset, and ShuffleDataset.
80
+ - `tests/test_xyz_dataset.py`: XYZ-specific functionality and format parser tests.
81
+ - `tests/test_xyz_bundle.py`: XYZ bundle file handling tests.
82
+ - `tests/test_dataset_benchmark.py`: Performance benchmarks for sequential vs parallel processing modes.
83
+
84
+ ### Changed
85
+
86
+ - Updated existing tests (`test_docs.py`, `test_main.py`, `test_web.py`) to accommodate streaming dataset functionality.
87
+
88
+ ### Removed
89
+
90
+ - Deprecated `tests/test_datasets.py` in favor of new dataset-specific test files.
91
+
92
+ ## [1.3.0] - 2025-09-12
93
+
94
+ ### Added
95
+
96
+ - `HOPV15_exp` dataset which contains experimental values for organic photovoltaic materials.
97
+ - Missing target descriptions for the QM9 dataset.
98
+ - `melting_point` dataset which contains melting points for small organic molecules.
99
+
100
+ ### Changed
101
+
102
+ - Minimum required version of pycomex to `0.23.0` to support the most recent features
103
+ such as the caching system which has also been implemented in the dataset processing scripts.
104
+ - CLI logo displayed at the beginning of the help message to "CMDATA" in another ASCII font
105
+ and added a logo image in ANSI art.
106
+
107
+ ## [1.2.1] - 2025-09-22
108
+
109
+ ### Changed
110
+
111
+ - Modified the syntax of type annotations so that the package is now compatible with
112
+ Python 3.9 through Python 3.12.
113
+ - Using `nox` now for the testing sessions instead of `tox` due to the much faster uv backend
114
+ to create the virtual environments.
115
+
116
+ ## [1.2.0] - 2025-09-04
117
+
118
+ ### Added
119
+
120
+ - `CLAUDE.md` file which contains information that can be used by AI agents such as
121
+ Claude to understand and work with the package.
122
+ - `remote show` command which displays useful information for the currently registered
123
+ file share location such as the URL and additional parameters.
124
+ - `remote diff` command which allows comparing the local and remote file share versions
125
+ of the metadata.yml file and prints the difference to the console.
126
+ - `metadata diff` command in the manage CLI which allows comparing the local and remote
127
+ versions of the metadata.yml file.
128
+ - `tadf` dataset associated with OLED design.
129
+
130
+ ### Changed
131
+
132
+ - Default SSL verification in the `web.py` module set to `True` to avoid security issues
133
+ when downloading files from the internet.
134
+
135
+ ## [1.1.2] - 2025-09-01
136
+
137
+ ### Changed
138
+
139
+ - Default template for the `config.toml` file to include commented out example values for the
140
+ Nextcloud remote file share configuration and to fix the default download location.
141
+
142
+ ## [1.1.1] - 2025-07-07
143
+
144
+ ### Added
145
+
146
+ - `prettytable` as a dependency to create markdown tables in the documentation.
147
+ - `docs` command group in the manage CLI to manage the documentation:
148
+ - `collect-datasets` which collects all datasets listed in the metadata.yml file and creates
149
+ a new markdown docs file with a table containing all those datasets.
150
+
151
+ ### Changed
152
+
153
+ - `list` command now also prints the verbose name / short description of the datasets.
154
+
155
+ ## [1.1.0] - 2025-07-07
156
+
157
+ ### Added
158
+
159
+ - `AGENTS.md` file which contains information that can be used by AI agents such as
160
+ ChatGPT Codex to understand and work with the package.
161
+ - `manage.py` script which exposes an additional command line interface for the management
162
+ and maintenance of the database:
163
+ - `metadata` command group to interact with the local and remote version of the metadata.yml file.
164
+ - `dataset` command group used to trigger the creation and upload of the local datasets.
165
+ - `remote` command group in the `cmdata` CLI:
166
+ - `upload` command to upload arbitrary files to the file share server.
167
+ - New datasets:
168
+ - `skin_irritation`: binary classification dataset on skin irritation.
169
+ - `skin_sensitizers`: binary classification dataset on skin sensitization.
170
+ - `elanos_bp`: regression of boiling point.
171
+ - `elanos_vp`: regression of vapor pressure.
172
+
173
+ ## [1.0.0] - 2025-05-01
174
+
175
+ - First official release of the package.
176
+
177
+ ## [0.2.0] - 2024-12-12
178
+
179
+ ### Added
180
+
181
+ - `HISTORY.rst` to start a changelog of the changes for each version.
182
+ - `DEVELOP.rst` which contains information about the development environment of the project.
183
+ - `ruff.toml` file to configure the Ruff linter and code formatter.
184
+
185
+ ### Changed
186
+
187
+ - Replaced the `tox.ini` with a `tox.toml` file.
188
+ - Ported the `pyproject.toml` file from using Poetry to using `uv` and `hatchling` as
189
+ the build backend.