ontology-loader 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,103 @@
1
+ Metadata-Version: 2.3
2
+ Name: ontology-loader
3
+ Version: 0.1.1
4
+ Summary: ontology_loader
5
+ License: MIT
6
+ Author: Sierra Moxon
7
+ Author-email: smoxon@lbl.gov
8
+ Requires-Python: >=3.11,<4.0
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Requires-Dist: chromadb (>=0.5.15,<0.6.0)
15
+ Requires-Dist: click (>=8.1.7,<9.0.0)
16
+ Requires-Dist: curies (>=0.8.0,<0.9.0)
17
+ Requires-Dist: linkml-store (>=0.2.6,<0.3.0)
18
+ Requires-Dist: mongomock (>=4.3.0,<5.0.0)
19
+ Requires-Dist: motor (>=3.6.0,<4.0.0)
20
+ Requires-Dist: neo4j (>=5.25.0,<6.0.0)
21
+ Requires-Dist: nmdc-schema
22
+ Requires-Dist: oaklib (>=0.6.16,<0.7.0)
23
+ Requires-Dist: pymongo (>=4.9.1,<5.0.0)
24
+ Requires-Dist: pystow (>=0.5.6,<0.6.0)
25
+ Requires-Dist: pytest (>=8.3.4,<9.0.0)
26
+ Requires-Dist: tabulate (>=0.9.0,<0.10.0)
27
+ Requires-Dist: tiktoken (>=0.8.0,<0.9.0)
28
+ Requires-Dist: tox (>=4.24.1,<5.0.0)
29
+ Description-Content-Type: text/markdown
30
+
31
+ ## ontology_loader
32
+
33
+ Suite of tools to configure and load an ontology from the OBO Foundry into the data object for OntologyClass as
34
+ specified by NMDC schema.
35
+
36
+ ## Development Environment
37
+
38
+ #### Pre-requisites
39
+
40
+ - Python >=3.11
41
+ - Poetry
42
+ - Docker
43
+ - MongoDB
44
+ - NMDC materialized schema
45
+ - ENV variable for MONGO_PASSWORD (or pass it in via the cli/runner itself directly)
46
+
47
+ ```bash
48
+
49
+ % docker pull mongo
50
+ % docker run -d --name mongodb-container -p 27017:27017 mongo
51
+ ```
52
+
53
+ #### Basic mongosh commands
54
+ ```bash
55
+ % docker ps
56
+ % docker exec -it [mongodb-container-id] bash
57
+ % mongosh
58
+ % show dbs
59
+ % use test
60
+ % db.ontology_class_set.find().pretty()
61
+ % db.ontology_relation_set.find().pretty()
62
+ ```
63
+
64
+ #### Command line
65
+ ```bash
66
+ % poetry install
67
+ % poetry run ontology_loader --help
68
+ % poetry run ontology_loader --source-ontology "envo"
69
+ % poetry run ontology_loader --source-ontology "go"
70
+ ```
71
+
72
+ #### Running the tests
73
+ ```bash
74
+ % make test
75
+ ```
76
+
77
+ #### Running the linter
78
+ ```bash
79
+ % make lint
80
+ ```
81
+
82
+ #### python example usage
83
+ ```bash
84
+ pip install ontology-loader
85
+ ```
86
+
87
+ ```python
88
+ from nmdc_ontology_loader.ontology_loader import OntologyLoader
89
+ import tempfile
90
+
91
+ def test_load_ontology():
92
+ """Test the load_ontology method."""
93
+ ontology_loader = OntologyLoader(
94
+ source_ontology="envo",
95
+ output_directory=tempfile.gettempdir(),
96
+ generate_reports=True,
97
+ )
98
+ ontology_loader.load_ontology()
99
+ assert ontology_loader.ontology_class_set
100
+ assert ontology_loader.ontology_relation_set
101
+ assert ontology_loader.ontology_class_set.count() > 0
102
+ assert ontology_loader.ontology_relation_set.count() > 0
103
+ ```
@@ -0,0 +1,73 @@
1
+ ## ontology_loader
2
+
3
+ Suite of tools to configure and load an ontology from the OBO Foundry into the data object for OntologyClass as
4
+ specified by NMDC schema.
5
+
6
+ ## Development Environment
7
+
8
+ #### Pre-requisites
9
+
10
+ - Python >=3.11
11
+ - Poetry
12
+ - Docker
13
+ - MongoDB
14
+ - NMDC materialized schema
15
+ - ENV variable for MONGO_PASSWORD (or pass it in via the cli/runner itself directly)
16
+
17
+ ```bash
18
+
19
+ % docker pull mongo
20
+ % docker run -d --name mongodb-container -p 27017:27017 mongo
21
+ ```
22
+
23
+ #### Basic mongosh commands
24
+ ```bash
25
+ % docker ps
26
+ % docker exec -it [mongodb-container-id] bash
27
+ % mongosh
28
+ % show dbs
29
+ % use test
30
+ % db.ontology_class_set.find().pretty()
31
+ % db.ontology_relation_set.find().pretty()
32
+ ```
33
+
34
+ #### Command line
35
+ ```bash
36
+ % poetry install
37
+ % poetry run ontology_loader --help
38
+ % poetry run ontology_loader --source-ontology "envo"
39
+ % poetry run ontology_loader --source-ontology "go"
40
+ ```
41
+
42
+ #### Running the tests
43
+ ```bash
44
+ % make test
45
+ ```
46
+
47
+ #### Running the linter
48
+ ```bash
49
+ % make lint
50
+ ```
51
+
52
+ #### python example usage
53
+ ```bash
54
+ pip install ontology-loader
55
+ ```
56
+
57
+ ```python
58
+ from nmdc_ontology_loader.ontology_loader import OntologyLoader
59
+ import tempfile
60
+
61
+ def test_load_ontology():
62
+ """Test the load_ontology method."""
63
+ ontology_loader = OntologyLoader(
64
+ source_ontology="envo",
65
+ output_directory=tempfile.gettempdir(),
66
+ generate_reports=True,
67
+ )
68
+ ontology_loader.load_ontology()
69
+ assert ontology_loader.ontology_class_set
70
+ assert ontology_loader.ontology_relation_set
71
+ assert ontology_loader.ontology_class_set.count() > 0
72
+ assert ontology_loader.ontology_relation_set.count() > 0
73
+ ```
@@ -0,0 +1,92 @@
1
+ [tool.poetry]
2
+ version = "0.1.1"
3
+ description = "ontology_loader"
4
+ authors = ["Sierra Moxon <smoxon@lbl.gov>"]
5
+ license = "MIT"
6
+ readme = "README.md"
7
+ name = "ontology-loader"
8
+ packages = [{ include = "ontology_loader", from = "src" }]
9
+
10
+ [tool.poetry.dependencies]
11
+ python = "^3.11"
12
+ pymongo = "^4.9.1"
13
+ nmdc-schema = "*"
14
+ oaklib = "^0.6.16"
15
+ click = "^8.1.7"
16
+ curies = "^0.8.0"
17
+ linkml-store = "^0.2.6"
18
+ tiktoken = "^0.8.0"
19
+ tabulate = "^0.9.0"
20
+ chromadb = "^0.5.15"
21
+ neo4j = "^5.25.0"
22
+ motor = "^3.6.0"
23
+ pystow = "^0.5.6"
24
+ tox = "^4.24.1"
25
+ pytest = "^8.3.4"
26
+ mongomock = "^4.3.0"
27
+
28
+ [tool.poetry.group.dev.dependencies]
29
+ pytest = {version = ">=7.1.2"}
30
+ tox = {version = ">=3.25.1"}
31
+ pre-commit = {version = ">=3.3.3"}
32
+
33
+ [tool.poetry.group.docs]
34
+ optional = true
35
+
36
+ [tool.poetry.group.docs.dependencies]
37
+ sphinx = {version = ">=6.1.3"}
38
+ sphinx-rtd-theme = {version = ">=1.0.0"}
39
+ sphinx-autodoc-typehints = {version = ">=1.2.0"}
40
+ sphinx-click = {version = ">=4.3.0"}
41
+ myst-parser = {version = ">=0.18.1"}
42
+
43
+ [tool.poetry.scripts]
44
+ ontology_loader = "ontology_loader.cli:main"
45
+
46
+ [tool.poetry-dynamic-versioning]
47
+ enable = false
48
+ vcs = "git"
49
+ style = "pep440"
50
+
51
+
52
+ [tool.black]
53
+ line-length = 120
54
+ target-version = ["py38", "py39", "py310"]
55
+
56
+ [tool.ruff]
57
+ extend-ignore = [
58
+ "D211", # `no-blank-line-before-class`
59
+ "D212", # `multi-line-summary-first-line`
60
+ ]
61
+ line-length = 120
62
+
63
+ # Allow autofix for all enabled rules (when `--fix` is provided).
64
+ fixable = ["ALL"]
65
+
66
+ # Select or ignore from https://beta.ruff.rs/docs/rules/
67
+ select = [
68
+ "B", # bugbear
69
+ "D", # pydocstyle
70
+ "E", # pycodestyle errors
71
+ "F", # Pyflakes
72
+ "I", # isort
73
+ "S", # flake8-bandit
74
+ "W", # Warning
75
+ ]
76
+
77
+ unfixable = []
78
+ target-version = "py311"
79
+
80
+ [tool.ruff.mccabe]
81
+ # Unlike Flake8, default to a complexity level of 10.
82
+ max-complexity = 10
83
+
84
+ [tool.codespell]
85
+ skip = "*.po,*.ts,.git,pyproject.toml"
86
+ count = ""
87
+ quiet-level = 3
88
+ # ignore-words-list = ""
89
+
90
+ [build-system]
91
+ requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning"]
92
+ build-backend = "poetry_dynamic_versioning.backend"
@@ -0,0 +1,8 @@
1
+ """Ontology Loader package."""
2
+
3
+ from importlib.metadata import PackageNotFoundError, version
4
+
5
# Fallback value, kept when the "ontology-loader" distribution metadata cannot be found.
__version__ = "unknown"
try:
    __version__ = version("ontology-loader")
except PackageNotFoundError:
    # Distribution is not installed; keep the fallback assigned above.
    pass
@@ -0,0 +1,53 @@
1
+ """Cli methods for ontology loading from the command line."""
2
+
3
+ import logging
4
+ import os
5
+
6
+ import click
7
+
8
+ from src.ontology_loader.ontology_load_controller import OntologyLoaderController
9
+
10
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
@click.command()
@click.option("--db-host", default=os.getenv("MONGO_HOST", "localhost"), help="MongoDB connection URL")
@click.option("--db-port", default=int(os.getenv("MONGO_PORT", 27018)), help="MongoDB connection port")
@click.option("--db-name", default=os.getenv("MONGO_DB", "nmdc"), help="Database name")
@click.option("--db-user", default=os.getenv("MONGO_USER", "admin"), help="Database user")
@click.option("--db-password", default=os.getenv("MONGO_PASSWORD", ""), help="Database password")
@click.option("--source-ontology", default="envo", help="Lowercase ontology prefix, e.g., envo, go, uberon, etc.")
@click.option("--output-directory", default=None, help="Output directory for reporting, default is /tmp")
@click.option("--generate-reports", default=True, help="Generate reports")
def cli(db_host, db_port, db_name, db_user, db_password, source_ontology, output_directory, generate_reports):
    """
    CLI entry point for the ontology loader.

    ``OntologyLoaderController`` only accepts the ontology and reporting parameters; the
    ``MongoDBLoader`` it creates takes its connection settings from the ``MongoDBConfig``
    singleton. The database options given on the command line are therefore written onto
    that singleton before the controller runs.

    :param db_host: MongoDB connection URL, default is localhost
    :param db_port: MongoDB connection port, default is 27018
    :param db_name: Database name, default is nmdc
    :param db_user: Database user, default is admin
    :param db_password: Database password, default is blank
    :param source_ontology: Lowercase ontology prefix, e.g., envo, go, uberon, etc.
    :param output_directory: Output directory for reporting, default is /tmp
    :param generate_reports: Generate reports or not, default is True
    """
    # Function-scope import keeps this fix self-contained within the command.
    from src.ontology_loader.mongo_db_config import MongoDBConfig

    logger.info(f"Processing ontology: {source_ontology}")

    # Apply the command-line database options to the shared configuration singleton.
    db_config = MongoDBConfig()
    db_config.db_host = db_host
    db_config.db_port = db_port
    db_config.db_name = db_name
    db_config.db_user = db_user
    db_config.db_password = db_password

    # Initialize and run the loader; only the parameters the controller accepts are passed.
    loader = OntologyLoaderController(
        source_ontology=source_ontology,
        output_directory=output_directory,
        generate_reports=generate_reports,
    )
    loader.run_ontology_loader()


# Alias for the ``ontology_loader.cli:main`` console-script target declared in pyproject.toml.
main = cli


if __name__ == "__main__":
    cli()
@@ -0,0 +1,21 @@
1
+ """Singleton class to store default parameters accessed from client environment or sensible defaults."""
2
+
3
+ import os
4
+
5
+
6
class MongoDBConfig:

    """Process-wide holder of MongoDB connection settings read from the environment, with defaults."""

    _instance = None

    def __new__(cls):
        """Return the shared MongoDBConfig, building it from the environment on first use."""
        if cls._instance is not None:
            return cls._instance

        # First call: create the shared instance and populate it from MONGO_* variables.
        cls._instance = config = super().__new__(cls)
        config.db_name = os.getenv("MONGO_DB", "nmdc")
        config.db_user = os.getenv("MONGO_USER", "admin")
        config.db_password = os.getenv("MONGO_PASSWORD", "")
        config.db_host = os.getenv("MONGO_HOST", "localhost")
        config.db_port = int(os.getenv("MONGO_PORT", 27018))
        return cls._instance
@@ -0,0 +1,117 @@
1
+ """Load and process ontology terms and relations into MongoDB."""
2
+
3
+ import logging
4
+ from dataclasses import asdict, fields
5
+ from typing import List, Optional
6
+
7
+ from linkml_runtime import SchemaView
8
+ from linkml_store import Client
9
+ from nmdc_schema.nmdc import OntologyClass
10
+
11
+ from src.ontology_loader.mongo_db_config import MongoDBConfig
12
+ from src.ontology_loader.reporter import Report
13
+
14
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class MongoDBLoader:

    """MongoDB Loader class to upsert OntologyClass objects and insert OntologyRelation objects into MongoDB."""

    def __init__(self, schema_view: Optional[SchemaView] = None):
        """
        Initialize MongoDB using LinkML-store's client.

        Connection settings (host, port, database, user, password) are read from the
        ``MongoDBConfig`` singleton.

        :param schema_view: LinkML SchemaView for ontology
        """
        db_config = MongoDBConfig()
        self.schema_view = schema_view
        self.db_host = db_config.db_host
        self.db_port = db_config.db_port
        self.db_name = db_config.db_name
        self.db_user = db_config.db_user
        self.db_password = db_config.db_password

        # TODO: it might be that we are providing the connection string "incorrectly" (or differently) in linkml-store
        # this exists so that the default env parameters in nmdc-runtime can be used as they are currently
        # specified.
        if self.db_host.startswith("mongodb://"):
            # e.g. mongodb://mongo:27017 -> host "mongo", port 27017
            host_parts = self.db_host.replace("mongodb://", "").split(":")
            self.db_host = host_parts[0]
            # Keep the configured port when the URL does not carry one.
            if len(host_parts) > 1:
                self.db_port = int(host_parts[1])

        self.handle = (
            f"mongodb://{self.db_user}:{self.db_password}@{self.db_host}:{self.db_port}/{self.db_name}?authSource=admin"
        )

        # Log the connection target without the password; the full handle contains the credential.
        logger.info(f"mongodb://{self.db_user}:***@{self.db_host}:{self.db_port}/{self.db_name}?authSource=admin")
        self.client = Client(handle=self.handle)

        # Explicitly set the correct database
        self.db = self.client.attach_database(
            handle=self.handle,  # Ensure correct database is used
        )
        logger.info(f"Connected to MongoDB: {self.db}")

    def upsert_ontology_classes(
        self, ontology_classes: List[OntologyClass], collection_name: str = "ontology_class_set"
    ):
        """
        Upsert each OntologyClass object into the given collection and return reports.

        An object whose ``id`` is already stored is updated only when at least one field differs
        from the stored document; an object with an unknown ``id`` is inserted.

        :param ontology_classes: A list of OntologyClass objects to upsert
        :param collection_name: The name of the MongoDB collection to upsert into.
        :return: A tuple of two Report objects: one for updates and one for insertions.
        """
        collection = self.db.create_collection(collection_name, recreate_if_exists=False)
        collection.index("id", unique=False)
        logger.info(collection_name)

        if not ontology_classes:
            logger.info("No OntologyClass objects to upsert.")
            return Report("update", [], []), Report("insert", [], [])

        updates_report = []
        insertions_report = []
        ontology_fields = [field.name for field in fields(OntologyClass)]

        for obj in ontology_classes:
            query_result = collection.find({"id": obj.id})
            existing_doc = query_result.rows[0] if query_result.num_rows > 0 else None

            if existing_doc:
                # Only the fields whose value differs from the stored document are updated.
                updated_fields = {
                    key: getattr(obj, key) for key in ontology_fields if getattr(obj, key) != existing_doc.get(key)
                }
                if updated_fields:
                    collection.upsert([asdict(obj)], filter_fields=["id"], update_fields=list(updated_fields.keys()))
                    logger.debug(f"Updated existing OntologyClass (id={obj.id}): {updated_fields}")
                    updates_report.append([obj.id] + [getattr(obj, field, "") for field in ontology_fields])
                else:
                    logger.debug(f"No changes detected for OntologyClass (id={obj.id}). Skipping update.")
            else:
                collection.upsert([asdict(obj)], filter_fields=["id"], update_fields=ontology_fields)
                logger.debug(f"Inserted new OntologyClass (id={obj.id}).")
                insertions_report.append([obj.id] + [getattr(obj, field, "") for field in ontology_fields])

        logger.info(f"Finished upserting {len(ontology_classes)} OntologyClass objects into MongoDB.")
        return Report("update", updates_report, ontology_fields), Report("insert", insertions_report, ontology_fields)

    def insert_ontology_relations(self, ontology_relations, collection_name: str = "ontology_relation_set"):
        """
        Insert each OntologyRelation object into the given collection.

        Relations are inserted as given, with no check for already-stored duplicates.

        :param ontology_relations: A list of OntologyRelation objects to insert
        :param collection_name: The name of the MongoDB collection to insert into.

        """
        collection = self.db.create_collection(collection_name, recreate_if_exists=False)
        if not ontology_relations:
            logger.info("No OntologyRelation objects to insert.")
            return

        for relation in ontology_relations:
            collection.insert(relation)
@@ -0,0 +1,72 @@
1
+ """Controller for running the ontology loader from code rather than the command line."""
2
+
3
+ import logging
4
+ import tempfile
5
+
6
+ from src.ontology_loader.mongodb_loader import MongoDBLoader
7
+ from src.ontology_loader.ontology_processor import OntologyProcessor
8
+ from src.ontology_loader.reporter import ReportWriter
9
+ from src.ontology_loader.utils import load_yaml_from_package
10
+
11
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class OntologyLoaderController:

    """
    OntologyLoader runner class for MongoDBLoader.

    This class is responsible for running the MongoDBLoader with the given parameters from code other than
    the command line support offered through the cli.py click interface.
    """

    def __init__(
        self,
        source_ontology: str = "envo",
        output_directory: str = tempfile.gettempdir(),
        generate_reports: bool = True,
    ):
        """
        Set the parameters for the OntologyLoader.

        :param source_ontology: Lowercase ontology prefix, e.g., envo, go, uberon, etc.
        :param output_directory: Directory the reports are written to.
        :param generate_reports: Whether to write the update/insert reports after loading.
        """
        self.source_ontology = source_ontology
        self.output_directory = output_directory
        self.generate_reports = generate_reports

    def run_ontology_loader(self):
        """Run the OntologyLoader and insert data into MongoDB."""
        logger.info(f"Processing ontology: {self.source_ontology}")

        # Load Schema View
        nmdc_sv = load_yaml_from_package("nmdc_schema", "nmdc_materialized_patterns.yaml")
        # Initialize the Ontology Processor
        processor = OntologyProcessor(self.source_ontology)

        # Process ontology terms and return a list of OntologyClass objects
        ontology_classes = processor.get_terms_and_metadata()

        logger.info(f"Extracted {len(ontology_classes)} ontology classes.")

        # Process ontology relations and create OntologyRelation objects
        ontology_relations = processor.get_relations_closure()

        logger.info(f"Extracted {len(ontology_relations)} ontology relations.")

        # Connect to MongoDB
        db_manager = MongoDBLoader(schema_view=nmdc_sv)
        # Insert data into MongoDB; relations are inserted exactly once so they are not duplicated.
        updates_report, insertions_report = db_manager.upsert_ontology_classes(ontology_classes)
        db_manager.insert_ontology_relations(ontology_relations)

        # Optionally write job reports
        if self.generate_reports:
            ReportWriter.write_reports(
                reports=[updates_report, insertions_report], output_format="tsv", output_directory=self.output_directory
            )

        logger.info("Processing complete. Data inserted into MongoDB.")
68
+
69
+
70
if __name__ == "__main__":
    # Script entry point: run the loader with the controller's default parameters.
    controller = OntologyLoaderController()
    controller.run_ontology_loader()
@@ -0,0 +1,109 @@
1
+ """Ontology Processor class to process ontology terms and relations."""
2
+
3
+ import gzip
4
+ import logging
5
+ import shutil
6
+
7
+ import pystow
8
+ from linkml_runtime.dumpers import json_dumper
9
+ from nmdc_schema.nmdc import OntologyClass, OntologyRelation
10
+ from oaklib import get_adapter
11
+
12
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class OntologyProcessor:

    """Extract ontology terms and their ancestor relations from a downloaded SQLite ontology."""

    def __init__(self, ontology: str):
        """
        Download the SQLite build of the ontology and open an adapter on it.

        :param ontology: The ontology prefix (e.g., "envo", "go", "uberon", etc.)

        """
        self.ontology = ontology
        self.ontology_db_path = self.download_and_prepare_ontology()
        self.adapter = get_adapter(f"sqlite:{self.ontology_db_path}")
        self.adapter.precompute_lookups()  # Optimize lookups

    def download_and_prepare_ontology(self):
        """Fetch the gzipped SQLite database for the ontology, unpack it, and return the path to the .db file."""
        logger.info(f"Preparing ontology: {self.ontology}")

        # Ontology-specific pystow directory, e.g. ~/.pystow/envo
        module_dir = pystow.module(self.ontology).base

        # Discard any previous copy of the directory, together with everything in it.
        if module_dir.exists():
            logger.info(f"Removing existing pystow directory for {self.ontology}: {module_dir}")
            shutil.rmtree(module_dir)

        # Download location of the compressed database.
        ontology_url = "https://s3.amazonaws.com/bbop-sqlite/" + self.ontology + ".db.gz"

        # Download into the module-specific directory; dropping ".gz" gives the .db path.
        archive_path = pystow.ensure(self.ontology, f"{self.ontology}.db.gz", url=ontology_url)
        db_path = archive_path.with_suffix("")

        # Unpack only when the .db file is not there yet.
        if not db_path.exists():
            logger.info(f"Extracting {archive_path} to {db_path}...")
            with gzip.open(archive_path, "rb") as f_in, open(db_path, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)

        logger.info(f"Ontology database is ready at: {db_path}")
        return db_path

    def get_terms_and_metadata(self):
        """Return an OntologyClass for every entity whose id starts with the upper-cased ontology prefix."""
        prefix = self.ontology.upper() + ":"
        return [
            OntologyClass(
                id=entity,
                type="nmdc:OntologyClass",
                alternative_names=self.adapter.entity_aliases(entity) or [],
                definition=self.adapter.definition(entity) or "",
            )
            for entity in self.adapter.entities()
            if entity.startswith(prefix)
        ]

    def get_relations_closure(self, predicates=None):
        """
        Build the ancestor closure of every term of this ontology as OntologyRelation dictionaries.

        :param predicates: List of predicates to consider (default: ["rdfs:subClassOf", "BFO:0000050"])

        """
        if predicates is None:
            predicates = ["rdfs:subClassOf", "BFO:0000050"]
        prefix = self.ontology.upper() + ":"
        relation_dicts = []

        for entity in self.adapter.entities():
            if not entity.startswith(prefix):
                continue

            # Ancestors (including the term itself), de-duplicated and limited to this ontology's prefix.
            ancestors = self.adapter.ancestors(entity, reflexive=True, predicates=predicates)
            same_ontology_ancestors = {a for a in ancestors if a.startswith(prefix)}

            for ancestor in same_ontology_ancestors:
                relation = OntologyRelation(
                    subject=entity,
                    predicate="is_a",  # TODO: fix this to the real predicate that it came with
                    object=ancestor,
                    type="nmdc:OntologyRelation",
                )
                # Store the relation as a plain dictionary.
                relation_dicts.append(json_dumper.to_dict(relation))

        return relation_dicts
@@ -0,0 +1,42 @@
1
+ """Reporting class to handle TSV dumping."""
2
+
3
+ import csv
4
+ import logging
5
+ import tempfile
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import List, Optional
9
+
10
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
@dataclass
class Report:

    """Dataclass to hold report data."""

    report_type: str  # "update" or "insert"; used to build the output file name
    records: List[List[str]]  # one row per reported object
    headers: List[str]  # column names; an "id" column is prepended when the report is written


class ReportWriter:

    """ReportWriter class to write reports to delimited text files."""

    @staticmethod
    def write_reports(reports: List[Report], output_format: str = "tsv", output_directory: Optional[str] = None):
        """
        Write each report to ``ontology_<report_type>s.<output_format>`` in the output directory.

        :param reports: The reports to write; each one produces its own file.
        :param output_format: File extension; "tsv" writes tab-delimited rows, anything else comma-delimited.
        :param output_directory: Target directory, created if missing; defaults to the system temp directory.
        """
        if output_directory is None:
            target_dir = Path(tempfile.gettempdir())
        else:
            target_dir = Path(output_directory)
        # Create the directory up front so a missing path does not abort the run after loading.
        target_dir.mkdir(parents=True, exist_ok=True)

        delimiter = "\t" if output_format == "tsv" else ","

        for report in reports:
            file_path = target_dir / f"ontology_{report.report_type}s.{output_format}"
            with file_path.open(mode="w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f, delimiter=delimiter)
                writer.writerow(["id"] + report.headers)
                writer.writerows(report.records)
            logging.info(f"Report generated: {file_path}")
@@ -0,0 +1,17 @@
1
+ """Loads a YAML file from a given package."""
2
+
3
+ import importlib.resources
4
+
5
+ from linkml_runtime.utils.schemaview import SchemaView
6
+
7
+
8
def load_yaml_from_package(package: str, filename: str) -> SchemaView:
    """
    Load a YAML schema file from a given package and wrap it in a SchemaView.

    :param package: The package where the YAML file is located (e.g., "nmdc_schema").
    :param filename: The YAML file to load (e.g., "nmdc_materialized_patterns.yaml").
    :return: A SchemaView built from the contents of the YAML file.
    """
    resource = importlib.resources.files(package).joinpath(filename)
    # Explicit encoding so reading does not depend on the platform's default locale.
    with resource.open("r", encoding="utf-8") as f:
        return SchemaView(f)