ontology-loader 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ontology_loader-0.1.1/PKG-INFO +103 -0
- ontology_loader-0.1.1/README.md +73 -0
- ontology_loader-0.1.1/pyproject.toml +92 -0
- ontology_loader-0.1.1/src/ontology_loader/__init__.py +8 -0
- ontology_loader-0.1.1/src/ontology_loader/cli.py +53 -0
- ontology_loader-0.1.1/src/ontology_loader/mongo_db_config.py +21 -0
- ontology_loader-0.1.1/src/ontology_loader/mongodb_loader.py +117 -0
- ontology_loader-0.1.1/src/ontology_loader/ontology_load_controller.py +72 -0
- ontology_loader-0.1.1/src/ontology_loader/ontology_processor.py +109 -0
- ontology_loader-0.1.1/src/ontology_loader/reporter.py +42 -0
- ontology_loader-0.1.1/src/ontology_loader/utils.py +17 -0
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: ontology-loader
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: ontology_loader
|
|
5
|
+
License: MIT
|
|
6
|
+
Author: Sierra Moxon
|
|
7
|
+
Author-email: smoxon@lbl.gov
|
|
8
|
+
Requires-Python: >=3.11,<4.0
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Requires-Dist: chromadb (>=0.5.15,<0.6.0)
|
|
15
|
+
Requires-Dist: click (>=8.1.7,<9.0.0)
|
|
16
|
+
Requires-Dist: curies (>=0.8.0,<0.9.0)
|
|
17
|
+
Requires-Dist: linkml-store (>=0.2.6,<0.3.0)
|
|
18
|
+
Requires-Dist: mongomock (>=4.3.0,<5.0.0)
|
|
19
|
+
Requires-Dist: motor (>=3.6.0,<4.0.0)
|
|
20
|
+
Requires-Dist: neo4j (>=5.25.0,<6.0.0)
|
|
21
|
+
Requires-Dist: nmdc-schema
|
|
22
|
+
Requires-Dist: oaklib (>=0.6.16,<0.7.0)
|
|
23
|
+
Requires-Dist: pymongo (>=4.9.1,<5.0.0)
|
|
24
|
+
Requires-Dist: pystow (>=0.5.6,<0.6.0)
|
|
25
|
+
Requires-Dist: pytest (>=8.3.4,<9.0.0)
|
|
26
|
+
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
|
27
|
+
Requires-Dist: tiktoken (>=0.8.0,<0.9.0)
|
|
28
|
+
Requires-Dist: tox (>=4.24.1,<5.0.0)
|
|
29
|
+
Description-Content-Type: text/markdown
|
|
30
|
+
|
|
31
|
+
## ontology_loader
|
|
32
|
+
|
|
33
|
+
Suite of tools to configure and load an ontology from the OBO Foundry into the data object for OntologyClass as
|
|
34
|
+
specified by NMDC schema.
|
|
35
|
+
|
|
36
|
+
## Development Environment
|
|
37
|
+
|
|
38
|
+
#### Pre-requisites
|
|
39
|
+
|
|
40
|
+
- Python >= 3.11
|
|
41
|
+
- Poetry
|
|
42
|
+
- Docker
|
|
43
|
+
- MongoDB
|
|
44
|
+
- NMDC materialized schema
|
|
45
|
+
- ENV variable for MONGO_PASSWORD (or pass it in via the cli/runner itself directly)
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
|
|
49
|
+
% docker pull mongo
|
|
50
|
+
% docker run -d --name mongodb-container -p 27017:27017 mongo
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
#### Basic mongosh commands
|
|
54
|
+
```bash
|
|
55
|
+
% docker ps
|
|
56
|
+
% docker exec -it [mongodb-container-id] bash
|
|
57
|
+
% mongosh
|
|
58
|
+
% show dbs
|
|
59
|
+
% use test
|
|
60
|
+
% db.ontology_class_set.find().pretty()
|
|
61
|
+
% db.ontology_relation_set.find().pretty()
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
#### Command line
|
|
65
|
+
```bash
|
|
66
|
+
% poetry install
|
|
67
|
+
% poetry run ontology_loader --help
|
|
68
|
+
% poetry run ontology_loader --source-ontology "envo"
|
|
69
|
+
% poetry run ontology_loader --source-ontology "go"
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
#### Running the tests
|
|
73
|
+
```bash
|
|
74
|
+
% make test
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
#### Running the linter
|
|
78
|
+
```bash
|
|
79
|
+
% make lint
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
#### python example usage
|
|
83
|
+
```bash
|
|
84
|
+
pip install ontology-loader
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
from nmdc_ontology_loader.ontology_loader import OntologyLoader
|
|
89
|
+
import tempfile
|
|
90
|
+
|
|
91
|
+
def test_load_ontology():
|
|
92
|
+
"""Test the load_ontology method."""
|
|
93
|
+
ontology_loader = OntologyLoader(
|
|
94
|
+
source_ontology="envo",
|
|
95
|
+
output_directory=tempfile.gettempdir(),
|
|
96
|
+
generate_reports=True,
|
|
97
|
+
)
|
|
98
|
+
ontology_loader.load_ontology()
|
|
99
|
+
assert ontology_loader.ontology_class_set
|
|
100
|
+
assert ontology_loader.ontology_relation_set
|
|
101
|
+
assert ontology_loader.ontology_class_set.count() > 0
|
|
102
|
+
assert ontology_loader.ontology_relation_set.count() > 0
|
|
103
|
+
```
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
## ontology_loader
|
|
2
|
+
|
|
3
|
+
Suite of tools to configure and load an ontology from the OBO Foundry into the data object for OntologyClass as
|
|
4
|
+
specified by NMDC schema.
|
|
5
|
+
|
|
6
|
+
## Development Environment
|
|
7
|
+
|
|
8
|
+
#### Pre-requisites
|
|
9
|
+
|
|
10
|
+
- Python >= 3.11
|
|
11
|
+
- Poetry
|
|
12
|
+
- Docker
|
|
13
|
+
- MongoDB
|
|
14
|
+
- NMDC materialized schema
|
|
15
|
+
- ENV variable for MONGO_PASSWORD (or pass it in via the cli/runner itself directly)
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
|
|
19
|
+
% docker pull mongo
|
|
20
|
+
% docker run -d --name mongodb-container -p 27017:27017 mongo
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
#### Basic mongosh commands
|
|
24
|
+
```bash
|
|
25
|
+
% docker ps
|
|
26
|
+
% docker exec -it [mongodb-container-id] bash
|
|
27
|
+
% mongosh
|
|
28
|
+
% show dbs
|
|
29
|
+
% use test
|
|
30
|
+
% db.ontology_class_set.find().pretty()
|
|
31
|
+
% db.ontology_relation_set.find().pretty()
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
#### Command line
|
|
35
|
+
```bash
|
|
36
|
+
% poetry install
|
|
37
|
+
% poetry run ontology_loader --help
|
|
38
|
+
% poetry run ontology_loader --source-ontology "envo"
|
|
39
|
+
% poetry run ontology_loader --source-ontology "go"
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
#### Running the tests
|
|
43
|
+
```bash
|
|
44
|
+
% make test
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
#### Running the linter
|
|
48
|
+
```bash
|
|
49
|
+
% make lint
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
#### python example usage
|
|
53
|
+
```bash
|
|
54
|
+
pip install ontology-loader
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from nmdc_ontology_loader.ontology_loader import OntologyLoader
|
|
59
|
+
import tempfile
|
|
60
|
+
|
|
61
|
+
def test_load_ontology():
|
|
62
|
+
"""Test the load_ontology method."""
|
|
63
|
+
ontology_loader = OntologyLoader(
|
|
64
|
+
source_ontology="envo",
|
|
65
|
+
output_directory=tempfile.gettempdir(),
|
|
66
|
+
generate_reports=True,
|
|
67
|
+
)
|
|
68
|
+
ontology_loader.load_ontology()
|
|
69
|
+
assert ontology_loader.ontology_class_set
|
|
70
|
+
assert ontology_loader.ontology_relation_set
|
|
71
|
+
assert ontology_loader.ontology_class_set.count() > 0
|
|
72
|
+
assert ontology_loader.ontology_relation_set.count() > 0
|
|
73
|
+
```
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
version = "0.1.1"
|
|
3
|
+
description = "ontology_loader"
|
|
4
|
+
authors = ["Sierra Moxon <smoxon@lbl.gov>"]
|
|
5
|
+
license = "MIT"
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
name = "ontology-loader"
|
|
8
|
+
packages = [{ include = "ontology_loader", from = "src" }]
|
|
9
|
+
|
|
10
|
+
[tool.poetry.dependencies]
|
|
11
|
+
python = "^3.11"
|
|
12
|
+
pymongo = "^4.9.1"
|
|
13
|
+
nmdc-schema = "*"
|
|
14
|
+
oaklib = "^0.6.16"
|
|
15
|
+
click = "^8.1.7"
|
|
16
|
+
curies = "^0.8.0"
|
|
17
|
+
linkml-store = "^0.2.6"
|
|
18
|
+
tiktoken = "^0.8.0"
|
|
19
|
+
tabulate = "^0.9.0"
|
|
20
|
+
chromadb = "^0.5.15"
|
|
21
|
+
neo4j = "^5.25.0"
|
|
22
|
+
motor = "^3.6.0"
|
|
23
|
+
pystow = "^0.5.6"
|
|
24
|
+
tox = "^4.24.1"
|
|
25
|
+
pytest = "^8.3.4"
|
|
26
|
+
mongomock = "^4.3.0"
|
|
27
|
+
|
|
28
|
+
[tool.poetry.group.dev.dependencies]
|
|
29
|
+
pytest = {version = ">=7.1.2"}
|
|
30
|
+
tox = {version = ">=3.25.1"}
|
|
31
|
+
pre-commit = {version = ">=3.3.3"}
|
|
32
|
+
|
|
33
|
+
[tool.poetry.group.docs]
|
|
34
|
+
optional = true
|
|
35
|
+
|
|
36
|
+
[tool.poetry.group.docs.dependencies]
|
|
37
|
+
sphinx = {version = ">=6.1.3"}
|
|
38
|
+
sphinx-rtd-theme = {version = ">=1.0.0"}
|
|
39
|
+
sphinx-autodoc-typehints = {version = ">=1.2.0"}
|
|
40
|
+
sphinx-click = {version = ">=4.3.0"}
|
|
41
|
+
myst-parser = {version = ">=0.18.1"}
|
|
42
|
+
|
|
43
|
+
[tool.poetry.scripts]
|
|
44
|
+
ontology_loader = "ontology_loader.cli:main"
|
|
45
|
+
|
|
46
|
+
[tool.poetry-dynamic-versioning]
|
|
47
|
+
enable = false
|
|
48
|
+
vcs = "git"
|
|
49
|
+
style = "pep440"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
[tool.black]
|
|
53
|
+
line-length = 120
|
|
54
|
+
target-version = ["py38", "py39", "py310"]
|
|
55
|
+
|
|
56
|
+
[tool.ruff]
|
|
57
|
+
extend-ignore = [
|
|
58
|
+
"D211", # `no-blank-line-before-class`
|
|
59
|
+
"D212", # `multi-line-summary-first-line`
|
|
60
|
+
]
|
|
61
|
+
line-length = 120
|
|
62
|
+
|
|
63
|
+
# Allow autofix for all enabled rules (when `--fix`) is provided.
|
|
64
|
+
fixable = ["ALL"]
|
|
65
|
+
|
|
66
|
+
# Select or ignore from https://beta.ruff.rs/docs/rules/
|
|
67
|
+
select = [
|
|
68
|
+
"B", # bugbear
|
|
69
|
+
"D", # pydocstyle
|
|
70
|
+
"E", # pycodestyle errors
|
|
71
|
+
"F", # Pyflakes
|
|
72
|
+
"I", # isort
|
|
73
|
+
"S", # flake8-bandit
|
|
74
|
+
"W", # Warning
|
|
75
|
+
]
|
|
76
|
+
|
|
77
|
+
unfixable = []
|
|
78
|
+
target-version = "py311"
|
|
79
|
+
|
|
80
|
+
[tool.ruff.mccabe]
|
|
81
|
+
# Unlike Flake8, default to a complexity level of 10.
|
|
82
|
+
max-complexity = 10
|
|
83
|
+
|
|
84
|
+
[tool.codespell]
|
|
85
|
+
skip = "*.po,*.ts,.git,pyproject.toml"
|
|
86
|
+
count = ""
|
|
87
|
+
quiet-level = 3
|
|
88
|
+
# ignore-words-list = ""
|
|
89
|
+
|
|
90
|
+
[build-system]
|
|
91
|
+
requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning"]
|
|
92
|
+
build-backend = "poetry_dynamic_versioning.backend"
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Cli methods for ontology loading from the command line."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
import click
|
|
7
|
+
|
|
8
|
+
from src.ontology_loader.ontology_load_controller import OntologyLoaderController
|
|
9
|
+
|
|
10
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@click.command()
@click.option("--db-host", default=os.getenv("MONGO_HOST", "localhost"), help="MongoDB connection URL")
@click.option("--db-port", default=int(os.getenv("MONGO_PORT", 27018)), help="MongoDB connection port")
@click.option("--db-name", default=os.getenv("MONGO_DB", "nmdc"), help="Database name")
@click.option("--db-user", default=os.getenv("MONGO_USER", "admin"), help="Database user")
@click.option("--db-password", default=os.getenv("MONGO_PASSWORD", ""), help="Database password")
@click.option("--source-ontology", default="envo", help="Lowercase ontology prefix, e.g., envo, go, uberon, etc.")
@click.option("--output-directory", default=None, help="Output directory for reporting, default is /tmp")
@click.option("--generate-reports", default=True, help="Generate reports")
def cli(db_host, db_port, db_name, db_user, db_password, source_ontology, output_directory, generate_reports):
    """
    CLI entry point for the ontology loader.

    :param db_host: MongoDB connection URL, default is localhost
    :param db_port: MongoDB connection port, default is 27018
    :param db_name: Database name, default is nmdc
    :param db_user: Database user, default is admin
    :param db_password: Database password, default is blank
    :param source_ontology: Lowercase ontology prefix, e.g., envo, go, uberon, etc.
    :param output_directory: Output directory for reporting, default is /tmp
    :param generate_reports: Generate reports or not, default is True
    """
    # Imported locally so this function carries its own dependency; the path
    # follows the import style already used at the top of this module.
    from src.ontology_loader.mongo_db_config import MongoDBConfig

    logger.info(f"Processing ontology: {source_ontology}")

    # MongoDBLoader reads its connection settings from the MongoDBConfig
    # singleton, so the command-line values are applied there. They must not be
    # forwarded as constructor keywords: passing keywords the controller does
    # not declare raises TypeError before any work is done.
    db_config = MongoDBConfig()
    db_config.db_host = db_host
    db_config.db_port = db_port
    db_config.db_name = db_name
    db_config.db_user = db_user
    db_config.db_password = db_password

    # Initialize the controller that drives processing and loading
    loader = OntologyLoaderController(
        source_ontology=source_ontology,
        output_directory=output_directory,
        generate_reports=generate_reports,
    )
    loader.run_ontology_loader()


# pyproject.toml registers the console script as "ontology_loader.cli:main";
# expose the click command under that name so the installed script resolves.
main = cli


if __name__ == "__main__":
    cli()
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Singleton class to store default parameters accessed from client environment or sensible defaults."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class MongoDBConfig:

    """Shared holder for MongoDB connection settings.

    The first instantiation reads the settings from the environment (falling
    back to built-in defaults); every later instantiation returns that same
    object, so attribute changes are visible to all users of the class.
    """

    _instance = None

    def __new__(cls):
        """Return the shared instance, populating it from the environment on first use."""
        if cls._instance is not None:
            return cls._instance

        # Register the instance before filling it in, then populate in a fixed order.
        config = cls._instance = super().__new__(cls)
        for attribute, variable, fallback in (
            ("db_name", "MONGO_DB", "nmdc"),
            ("db_user", "MONGO_USER", "admin"),
            ("db_password", "MONGO_PASSWORD", ""),
            ("db_host", "MONGO_HOST", "localhost"),
        ):
            setattr(config, attribute, os.getenv(variable, fallback))
        # The port is the only setting stored as an integer.
        config.db_port = int(os.getenv("MONGO_PORT", 27018))
        return config
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""Load and process ontology terms and relations into MongoDB."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from dataclasses import asdict, fields
|
|
5
|
+
from typing import List, Optional
|
|
6
|
+
|
|
7
|
+
from linkml_runtime import SchemaView
|
|
8
|
+
from linkml_store import Client
|
|
9
|
+
from nmdc_schema.nmdc import OntologyClass
|
|
10
|
+
|
|
11
|
+
from src.ontology_loader.mongo_db_config import MongoDBConfig
|
|
12
|
+
from src.ontology_loader.reporter import Report
|
|
13
|
+
|
|
14
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class MongoDBLoader:

    """MongoDB Loader class to upsert OntologyClass objects and insert OntologyRelation objects into MongoDB."""

    def __init__(self, schema_view: Optional[SchemaView] = None):
        """
        Initialize MongoDB using LinkML-store's client.

        Connection settings (host, port, database, user, password) are read from
        the MongoDBConfig singleton.

        :param schema_view: LinkML SchemaView for ontology
        """
        from urllib.parse import quote_plus

        db_config = MongoDBConfig()
        self.schema_view = schema_view
        self.db_name = db_config.db_name
        self.db_user = db_config.db_user
        self.db_password = db_config.db_password

        # TODO: it might be that we are providing the connection string "incorrectly" (or differently) in linkml-store
        # this exists so that the default env parameters in nmdc-runtime can be used as they are currently
        # specified.
        self.db_host, self.db_port = self._split_host_port(db_config.db_host, db_config.db_port)

        # Percent-escape the credentials so reserved URI characters such as
        # "@", ":" or "/" in a user name or password cannot corrupt the handle.
        user = quote_plus(str(self.db_user))
        password = quote_plus(str(self.db_password))
        location = f"{self.db_host}:{self.db_port}/{self.db_name}?authSource=admin"
        self.handle = f"mongodb://{user}:{password}@{location}"

        # Log the target with the password masked; the full handle is a secret.
        logger.info(f"mongodb://{user}:***@{location}")
        self.client = Client(handle=self.handle)

        # Explicitly set the correct database
        self.db = self.client.attach_database(
            handle=self.handle,  # Ensure correct database is used
        )
        logger.info(f"Connected to MongoDB: {self.db}")

    @staticmethod
    def _split_host_port(host: str, default_port: int):
        """
        Split a configured host value into a bare host name and a port.

        A plain host name is returned unchanged together with ``default_port``.
        A value such as ``mongodb://mongo:27017`` is reduced to ``("mongo", 27017)``;
        when the URL carries no port, ``default_port`` is kept instead of failing.

        :param host: Host name, or a ``mongodb://host[:port]`` URL.
        :param default_port: Port to use when ``host`` does not specify one.
        :return: A ``(host, port)`` tuple.
        """
        if not host.startswith("mongodb://"):
            return host, default_port
        # Keep only the "host[:port]" part, ignoring any trailing path.
        authority = host.removeprefix("mongodb://").split("/", 1)[0]
        name, separator, port = authority.partition(":")
        if separator and port:
            return name, int(port)
        return name, default_port

    def upsert_ontology_classes(
        self, ontology_classes: List[OntologyClass], collection_name: str = "ontology_class_set"
    ):
        """
        Upsert each OntologyClass object into the given collection and return reports.

        Each object is looked up by ``id``. Existing documents are updated only
        when at least one field differs; unknown ids are inserted.

        :param ontology_classes: A list of OntologyClass objects to upsert
        :param collection_name: The name of the MongoDB collection to upsert into.
        :return: A tuple of two Report objects: one for updates and one for insertions.
        """
        collection = self.db.create_collection(collection_name, recreate_if_exists=False)
        collection.index("id", unique=False)
        logger.info(collection_name)

        if not ontology_classes:
            logger.info("No OntologyClass objects to upsert.")
            return Report("update", [], []), Report("insert", [], [])

        updates_report = []
        insertions_report = []
        ontology_fields = [field.name for field in fields(OntologyClass)]

        for obj in ontology_classes:
            query_result = collection.find({"id": obj.id})
            existing_doc = query_result.rows[0] if query_result.num_rows > 0 else None
            # Report rows start with the id, followed by every dataclass field value.
            report_row = [obj.id] + [getattr(obj, field, "") for field in ontology_fields]

            if existing_doc:
                updated_fields = {
                    key: getattr(obj, key) for key in ontology_fields if getattr(obj, key) != existing_doc.get(key)
                }
                if updated_fields:
                    collection.upsert([asdict(obj)], filter_fields=["id"], update_fields=list(updated_fields.keys()))
                    logger.debug(f"Updated existing OntologyClass (id={obj.id}): {updated_fields}")
                    updates_report.append(report_row)
                else:
                    logger.debug(f"No changes detected for OntologyClass (id={obj.id}). Skipping update.")
            else:
                collection.upsert([asdict(obj)], filter_fields=["id"], update_fields=ontology_fields)
                logger.debug(f"Inserted new OntologyClass (id={obj.id}).")
                insertions_report.append(report_row)

        logger.info(f"Finished upserting {len(ontology_classes)} OntologyClass objects into MongoDB.")
        return Report("update", updates_report, ontology_fields), Report("insert", insertions_report, ontology_fields)

    def insert_ontology_relations(self, ontology_relations, collection_name: str = "ontology_relation_set"):
        """
        Insert each OntologyRelation object into the given collection.

        Relations are inserted as-is, one at a time; no check for existing
        documents is made, so calling this twice stores every relation twice.

        :param ontology_relations: A list of OntologyRelation objects to insert
        :param collection_name: The name of the MongoDB collection to insert into.
        """
        collection = self.db.create_collection(collection_name, recreate_if_exists=False)
        if not ontology_relations:
            logger.info("No OntologyRelation objects to insert.")
            return
        for relation in ontology_relations:
            collection.insert(relation)
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Cli methods for ontology loading from the command line."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import tempfile
|
|
5
|
+
|
|
6
|
+
from src.ontology_loader.mongodb_loader import MongoDBLoader
|
|
7
|
+
from src.ontology_loader.ontology_processor import OntologyProcessor
|
|
8
|
+
from src.ontology_loader.reporter import ReportWriter
|
|
9
|
+
from src.ontology_loader.utils import load_yaml_from_package
|
|
10
|
+
|
|
11
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class OntologyLoaderController:
|
|
16
|
+
|
|
17
|
+
"""
|
|
18
|
+
OntologyLoader runner class for MongoDBLoader.
|
|
19
|
+
|
|
20
|
+
This class is responsible for running the MongoDBLoader with the given parameters from code other than
|
|
21
|
+
the command line support offered through the cli.py click interface.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
def __init__(
|
|
25
|
+
self,
|
|
26
|
+
source_ontology: str = "envo",
|
|
27
|
+
output_directory: str = tempfile.gettempdir(),
|
|
28
|
+
generate_reports: bool = True,
|
|
29
|
+
):
|
|
30
|
+
"""Set the parameters for the OntologyLoader."""
|
|
31
|
+
self.source_ontology = source_ontology
|
|
32
|
+
self.output_directory = output_directory
|
|
33
|
+
self.generate_reports = generate_reports
|
|
34
|
+
|
|
35
|
+
def run_ontology_loader(self):
|
|
36
|
+
"""Run the OntologyLoader and insert data into MongoDB."""
|
|
37
|
+
logging.info(f"Processing ontology: {self.source_ontology}")
|
|
38
|
+
|
|
39
|
+
# Load Schema View
|
|
40
|
+
nmdc_sv = load_yaml_from_package("nmdc_schema", "nmdc_materialized_patterns.yaml")
|
|
41
|
+
# Initialize the Ontology Processor
|
|
42
|
+
processor = OntologyProcessor(self.source_ontology)
|
|
43
|
+
|
|
44
|
+
# Process ontology terms and return a list of OntologyClass dicts produced by linkml json dumper as dict
|
|
45
|
+
ontology_classes = processor.get_terms_and_metadata()
|
|
46
|
+
|
|
47
|
+
logger.info(f"Extracted {len(ontology_classes)} ontology classes.")
|
|
48
|
+
|
|
49
|
+
# Process ontology relations and create OntologyRelation objects
|
|
50
|
+
ontology_relations = processor.get_relations_closure()
|
|
51
|
+
|
|
52
|
+
logger.info(f"Extracted {len(ontology_relations)} ontology relations.")
|
|
53
|
+
|
|
54
|
+
# Connect to MongoDB
|
|
55
|
+
db_manager = MongoDBLoader(schema_view=nmdc_sv)
|
|
56
|
+
# Insert data into MongoDB
|
|
57
|
+
updates_report, insertions_report = db_manager.upsert_ontology_classes(ontology_classes)
|
|
58
|
+
db_manager.insert_ontology_relations(ontology_relations)
|
|
59
|
+
db_manager.insert_ontology_relations(ontology_relations)
|
|
60
|
+
|
|
61
|
+
# Optionally write job reports
|
|
62
|
+
if self.generate_reports:
|
|
63
|
+
ReportWriter.write_reports(
|
|
64
|
+
reports=[updates_report, insertions_report], output_format="tsv", output_directory=self.output_directory
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
logger.info("Processing complete. Data inserted into MongoDB.")
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
if __name__ == "__main__":
|
|
71
|
+
"""Run the OntologyLoader."""
|
|
72
|
+
OntologyLoaderController().run_ontology_loader()
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Ontology Processor class to process ontology terms and relations."""
|
|
2
|
+
|
|
3
|
+
import gzip
|
|
4
|
+
import logging
|
|
5
|
+
import shutil
|
|
6
|
+
|
|
7
|
+
import pystow
|
|
8
|
+
from linkml_runtime.dumpers import json_dumper
|
|
9
|
+
from nmdc_schema.nmdc import OntologyClass, OntologyRelation
|
|
10
|
+
from oaklib import get_adapter
|
|
11
|
+
|
|
12
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class OntologyProcessor:

    """Download an ontology's SQLite build and turn its terms and ancestor closure into NMDC objects."""

    def __init__(self, ontology: str):
        """
        Download the ontology database and open an adapter on it.

        :param ontology: The ontology prefix (e.g., "envo", "go", "uberon", etc.)

        """
        self.ontology = ontology
        self.ontology_db_path = self.download_and_prepare_ontology()
        self.adapter = get_adapter(f"sqlite:{self.ontology_db_path}")
        # Optimize lookups
        self.adapter.precompute_lookups()

    def download_and_prepare_ontology(self):
        """Fetch a fresh copy of the gzipped ontology database, unpack it and return the path of the .db file."""
        logger.info(f"Preparing ontology: {self.ontology}")

        # Start from a clean ontology-specific pystow directory (e.g. ~/.pystow/envo).
        cache_dir = pystow.module(self.ontology).base
        if cache_dir.exists():
            logger.info(f"Removing existing pystow directory for {self.ontology}: {cache_dir}")
            shutil.rmtree(cache_dir)

        # Download the archive into the module-specific directory.
        archive_url = "".join(("https://s3.amazonaws.com/bbop-sqlite/", self.ontology, ".db.gz"))
        archive_path = pystow.ensure(self.ontology, f"{self.ontology}.db.gz", url=archive_url)
        # Dropping the final ".gz" suffix yields the ".db" target path.
        database_path = archive_path.with_suffix("")

        # Unpack only when the target file is not there yet.
        if not database_path.exists():
            logger.info(f"Extracting {archive_path} to {database_path}...")
            with gzip.open(archive_path, "rb") as packed, open(database_path, "wb") as unpacked:
                shutil.copyfileobj(packed, unpacked)

        logger.info(f"Ontology database is ready at: {database_path}")
        return database_path

    def get_terms_and_metadata(self):
        """Build an OntologyClass for every entity whose id starts with the upper-cased ontology prefix."""
        prefix = self.ontology.upper() + ":"
        return [
            OntologyClass(
                id=curie,
                type="nmdc:OntologyClass",
                alternative_names=self.adapter.entity_aliases(curie) or [],
                definition=self.adapter.definition(curie) or "",
            )
            for curie in self.adapter.entities()
            if curie.startswith(prefix)
        ]

    def get_relations_closure(self, predicates=None):
        """
        Build one relation dict per (term, ancestor) pair within this ontology.

        Ancestors are computed reflexively, so each term is also paired with itself.
        Ancestors whose id does not carry the ontology prefix are dropped.

        :param predicates: List of predicates to consider (default: ["rdfs:subClassOf", "BFO:0000050"])

        """
        if predicates is None:
            predicates = ["rdfs:subClassOf", "BFO:0000050"]
        prefix = self.ontology.upper() + ":"
        relations = []

        for subject in self.adapter.entities():
            if not subject.startswith(prefix):
                continue

            # De-duplicated ancestors that belong to the same ontology.
            same_ontology_ancestors = {
                ancestor
                for ancestor in self.adapter.ancestors(subject, reflexive=True, predicates=predicates)
                if ancestor.startswith(prefix)
            }

            for ancestor in same_ontology_ancestors:
                relation = OntologyRelation(
                    subject=subject,
                    predicate="is_a",  # TODO: fix this to the real predicate that it came with
                    object=ancestor,
                    type="nmdc:OntologyRelation",
                )
                # Store the relation in its dictionary form
                relations.append(json_dumper.to_dict(relation))

        return relations
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Reporting class to handle TSV dumping."""
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
import logging
|
|
5
|
+
import tempfile
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import List, Optional
|
|
9
|
+
|
|
10
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class Report:

    """Dataclass to hold report data."""

    report_type: str  # "update" or "insert"; used to build the output file name
    records: List[List[str]]  # one row per reported object
    headers: List[str]  # column names for the values that follow the leading id


class ReportWriter:

    """ReportWriter class to write reports to delimited text files."""

    @staticmethod
    def write_reports(reports: List[Report], output_format: str = "tsv", output_directory: Optional[str] = None):
        """
        Write each report to ``ontology_<report_type>s.<output_format>`` in the output directory.

        The header row is ``"id"`` followed by the report's headers. Rows are
        tab-separated when ``output_format`` is ``"tsv"`` and comma-separated
        otherwise. Existing files of the same name are overwritten.

        :param reports: The reports to write.
        :param output_format: File extension, also selecting the delimiter; default is "tsv".
        :param output_directory: Target directory; the system temp directory is used when None.
            The directory is created if it does not exist yet.
        """
        if output_directory is None:
            target_dir = Path(tempfile.gettempdir())
        else:
            target_dir = Path(output_directory)
        # A missing directory would otherwise make every open() below fail.
        target_dir.mkdir(parents=True, exist_ok=True)

        delimiter = "\t" if output_format == "tsv" else ","
        log = logging.getLogger(__name__)

        for report in reports:
            file_path = target_dir / f"ontology_{report.report_type}s.{output_format}"
            with file_path.open(mode="w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f, delimiter=delimiter)
                writer.writerow(["id"] + report.headers)
                writer.writerows(report.records)
            log.info(f"Report generated: {file_path}")
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Loads a YAML file from a given package."""
|
|
2
|
+
|
|
3
|
+
import importlib.resources
|
|
4
|
+
|
|
5
|
+
from linkml_runtime.utils.schemaview import SchemaView
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def load_yaml_from_package(package: str, filename: str) -> SchemaView:
    """
    Load a YAML schema file shipped inside a package as a SchemaView.

    :param package: The package where the YAML file is located (e.g., "nmdc_schema").
    :param filename: The YAML file to load (e.g., "nmdc_materialized_patterns.yaml").
    :return: A SchemaView built from the file's YAML text.
    """
    # Hand SchemaView the YAML text itself rather than an open file handle:
    # the handle is closed as soon as this function returns, and SchemaView is
    # documented to take a path, YAML text or SchemaDefinition.
    resource = importlib.resources.files(package).joinpath(filename)
    return SchemaView(resource.read_text(encoding="utf-8"))
|