PyPI - nerdd-link - Versions diffs - 0.2.11__tar.gz - Mend

nerdd-link 0.2.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58) hide show

nerdd_link-0.2.11/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2023 Molecular Informatics Vienna
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

nerdd_link-0.2.11/PKG-INFO ADDED Viewed

@@ -0,0 +1,116 @@
+Metadata-Version: 2.2
+Name: nerdd-link
+Version: 0.2.11
+Summary: Run a NERDD module as a service
+Author-email: Steffen Hirte <steffen.hirte@univie.ac.at>
+Maintainer-email: Steffen Hirte <steffen.hirte@univie.ac.at>
+License: MIT License
+        Copyright (c) 2023 Molecular Informatics Vienna
+        Permission is hereby granted, free of charge, to any person obtaining a copy
+        of this software and associated documentation files (the "Software"), to deal
+        in the Software without restriction, including without limitation the rights
+        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+        copies of the Software, and to permit persons to whom the Software is
+        furnished to do so, subject to the following conditions:
+        The above copyright notice and this permission notice shall be included in all
+        copies or substantial portions of the Software.
+        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+        SOFTWARE.
+Project-URL: Repository, https://github.com/molinfo-vienna/nerdd-link
+Keywords: science,research,development,nerdd
+Classifier: Intended Audience :: Science/Research
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python
+Classifier: Topic :: Software Development
+Classifier: Topic :: Scientific/Engineering
+Classifier: Operating System :: Microsoft :: Windows
+Classifier: Operating System :: POSIX
+Classifier: Operating System :: Unix
+Classifier: Operating System :: MacOS
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: nerdd-module>=0.3.6
+Requires-Dist: pandas>=1.2.1
+Requires-Dist: pyyaml~=6.0
+Requires-Dist: filetype~=1.2.0
+Requires-Dist: rich-click>=1.7.1
+Requires-Dist: stringcase~=1.2.0
+Requires-Dist: numpy
+Requires-Dist: simplejson>=3
+Requires-Dist: pydantic>=2
+Requires-Dist: aiokafka>=0.12.0
+Requires-Dist: importlib-metadata>=4.6; python_version < "3.10"
+Provides-Extra: dev
+Requires-Dist: mypy; extra == "dev"
+Requires-Dist: ruff==0.8.0; extra == "dev"
+Requires-Dist: pre-commit>=2; extra == "dev"
+Provides-Extra: test
+Requires-Dist: pytest; extra == "test"
+Requires-Dist: pytest-sugar; extra == "test"
+Requires-Dist: pytest-cov; extra == "test"
+Requires-Dist: pytest-asyncio; extra == "test"
+Requires-Dist: pytest-bdd==7.3.0; extra == "test"
+Requires-Dist: pytest-mock; extra == "test"
+Requires-Dist: pytest-watcher; extra == "test"
+Requires-Dist: hypothesis; extra == "test"
+Requires-Dist: hypothesis-rdkit; extra == "test"
+Provides-Extra: docs
+Requires-Dist: mkdocs; extra == "docs"
+Requires-Dist: mkdocs-material; extra == "docs"
+Requires-Dist: mkdocstrings; extra == "docs"
+# NERDD-Link
+Run a [NERDD module](https://github.com/molinfo-vienna/nerdd-module) as a
+service that consumes input molecules and produces prediction tuples.
+## Installation
+```bash
+pip install -U nerdd-link
+```
+## Usage
+When a class inherits from ```nerdd_module.AbstractModel``` (see the
+[NERDD Module GitHub page](https://github.com/molinfo-vienna/nerdd-module)), it can be
+used to create a Kafka service.
+```bash
+# run a Kafka service for NerddModel on localhost:9092
+run_nerdd_server package.path.to.NerddModel
+# modify broker url, input topic and batch size
+run_nerdd_server package.path.to.NerddModel \
+  --broker-url my-cluster-kafka-bootstrap.kafka:9092 \
+  --input-topic examples \
+  --batch-size 10
+# more information via --help
+run_nerdd_server --help
+```
+If the model class is called ```ExamplePredictionModel```, the server will read input
+tuples from the input topic ```example-prediction-inputs``` in batches of size 100
+and write results to the ```results``` topic. The batch size specifies the number
+of input tuples that are given to the model at once.
+## Communication

nerdd_link-0.2.11/README.md ADDED Viewed

@@ -0,0 +1,39 @@
+# NERDD-Link
+Run a [NERDD module](https://github.com/molinfo-vienna/nerdd-module) as a
+service that consumes input molecules and produces prediction tuples.
+## Installation
+```bash
+pip install -U nerdd-link
+```
+## Usage
+When a class inherits from ```nerdd_module.AbstractModel``` (see the
+[NERDD Module GitHub page](https://github.com/molinfo-vienna/nerdd-module)), it can be
+used to create a Kafka service.
+```bash
+# run a Kafka service for NerddModel on localhost:9092
+run_nerdd_server package.path.to.NerddModel
+# modify broker url, input topic and batch size
+run_nerdd_server package.path.to.NerddModel \
+  --broker-url my-cluster-kafka-bootstrap.kafka:9092 \
+  --input-topic examples \
+  --batch-size 10
+# more information via --help
+run_nerdd_server --help
+```
+If the model class is called ```ExamplePredictionModel```, the server will read input
+tuples from the input topic ```example-prediction-inputs``` in batches of size 100
+and write results to the ```results``` topic. The batch size specifies the number
+of input tuples that are given to the model at once.
+## Communication

nerdd_link-0.2.11/nerdd_link/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+from .actions import *
+from .channels import *
+from .converters import *
+from .files import *
+from .input import *
+from .types import *

nerdd_link-0.2.11/nerdd_link/actions/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from .action import *
+from .predict_checkpoints_action import *
+from .process_jobs_action import *
+from .register_module_action import *
+from .serialize_job_action import *

nerdd_link-0.2.11/nerdd_link/actions/action.py ADDED Viewed

@@ -0,0 +1,41 @@
+import logging
+from abc import ABC, abstractmethod
+from asyncio import CancelledError
+from typing import Generic, TypeVar
+from stringcase import spinalcase
+from ..channels import Channel, Topic
+from ..types import Message
+logger = logging.getLogger(__name__)
+T = TypeVar("T", bound=Message)
+class Action(ABC, Generic[T]):
+    def __init__(self, input_topic: Topic[T]):
+        self._input_topic = input_topic
+    async def run(self) -> None:
+        consumer_group = spinalcase(self._get_group_name())
+        async for message in self._input_topic.receive(consumer_group):
+            try:
+                await self._process_message(message)
+            except CancelledError:
+                # the consumer was cancelled, stop processing messages
+                break
+            except Exception:
+                # log the error and continue processing the next message
+                logger.error("Error processing message", exc_info=True)
+    @abstractmethod
+    async def _process_message(self, message: T) -> None:
+        pass
+    @property
+    def channel(self) -> Channel:
+        return self._input_topic.channel
+    def _get_group_name(self) -> str:
+        return self.__class__.__name__

nerdd_link-0.2.11/nerdd_link/actions/predict_checkpoints_action.py ADDED Viewed

@@ -0,0 +1,76 @@
+import logging
+from multiprocessing import Queue
+from nerdd_module import SimpleModel
+from ..channels import Channel
+from ..delegates import ReadCheckpointModel
+from ..files import FileSystem
+from ..types import CheckpointMessage, ResultCheckpointMessage, ResultMessage
+from .action import Action
+__all__ = ["PredictCheckpointsAction"]
+logger = logging.getLogger(__name__)
+class PredictCheckpointsAction(Action[CheckpointMessage]):
+    # Accept a batch of input molecules on the "<job-type>-checkpoints" topic
+    # (generated in the previous step) and process them. Results are written to
+    # the "results" topic.
+    def __init__(self, channel: Channel, model: SimpleModel, data_dir: str) -> None:
+        super().__init__(channel.checkpoints_topic(model))
+        self._model = model
+        self._data_dir = data_dir
+    async def _process_message(self, message: CheckpointMessage) -> None:
+        job_id = message.job_id
+        checkpoint_id = message.checkpoint_id
+        params = message.params
+        logger.info(f"Predict checkpoint {checkpoint_id} of job {job_id}")
+        # The Kafka consumers and producers run in the current asyncio event loop and (by
+        # observation) it seems that calling the produce method of a Kafka producer in a different
+        # event loop / thread / process doesn't seem to work (hangs indefinitely). Therefore, we
+        # create a queue in this event loop / thread and other tasks send messages to the queue
+        # instead of directly to the Kafka producer. This event loop will wait for new messages in
+        # this queue and forward them to the Kafka producer.
+        queue: Queue = Queue()
+        file_system = FileSystem(self._data_dir)
+        # create a wrapper model that
+        # * reads the checkpoint file instead of normal input
+        # * does preprocessing, prediction, and postprocessing like the encapsulated model
+        # * does not write to the specified results file, but to the checkpoints file instead
+        # * sends the results to the results topic
+        model = ReadCheckpointModel(
+            base_model=self._model,
+            job_id=job_id,
+            file_system=file_system,
+            checkpoint_id=checkpoint_id,
+            queue=queue,
+        )
+        # predict the checkpoint
+        # assign input=None, because the checkpoint file is already provided in ReadCheckpointModel
+        model.predict(
+            input=None,
+            **params,
+        )
+        # Wait for the prediction to finish and the results to be sent.
+        while True:
+            record = queue.get()
+            if record is not None:
+                await self.channel.results_topic().send(ResultMessage(job_id=job_id, **record))
+            else:
+                await self.channel.result_checkpoints_topic().send(
+                    ResultCheckpointMessage(job_id=job_id, checkpoint_id=checkpoint_id)
+                )
+                break
+    def _get_group_name(self) -> str:
+        model_id = self._model.get_config().id
+        return f"predict-checkpoints-{model_id}"

nerdd_link-0.2.11/nerdd_link/actions/process_jobs_action.py ADDED Viewed

@@ -0,0 +1,157 @@
+import logging
+from pickle import dump
+from typing import Any
+from nerdd_module.input import DepthFirstExplorer
+from nerdd_module.model import ReadInputStep
+from rdkit.Chem import Mol
+from rdkit.Chem.PropertyMol import PropertyMol
+from ..channels import Channel
+from ..files import FileSystem
+from ..types import CheckpointMessage, JobMessage, LogMessage
+from ..utils import batched
+from .action import Action
+__all__ = ["ProcessJobsAction"]
+logger = logging.getLogger(__name__)
+class ProcessJobsAction(Action[JobMessage]):
+    # Accept new jobs (on the "<job_type>-jobs" topic). For each job, the program
+    # iterates through all molecules in the input (files), writes them as batches
+    # into checkpoint files and sends checkpoint messages (for each batch) to the
+    # "<job_type>-checkpoints" topic. Also, the number of molecules read is
+    # reported to the topic "job-sizes".
+    def __init__(
+        self,
+        channel: Channel,
+        checkpoint_size: int,
+        max_num_molecules: int,
+        num_test_entries: int,
+        ratio_valid_entries: float,
+        maximum_depth: int,
+        max_num_lines_mol_block: int,
+        data_dir: str,
+    ) -> None:
+        super().__init__(channel.jobs_topic())
+        # relevant for chunking
+        self._checkpoint_size = checkpoint_size
+        self._max_num_molecules = max_num_molecules
+        # parameters of DepthFirstExplorer
+        self._num_test_entries = num_test_entries
+        self._ratio_valid_entries = ratio_valid_entries
+        self._maximum_depth = maximum_depth
+        # used as kwargs in DepthFirstExplorer
+        self._max_num_lines_mol_block = max_num_lines_mol_block
+        self._file_system = FileSystem(data_dir)
+    async def _process_message(self, message: JobMessage) -> None:
+        job_id = message.id
+        job_type = message.job_type
+        logger.info(f"Received a new job {job_id} of type {job_type}")
+        # the input file to the job is stored in a designated sources directory
+        # (the file is allowed to reference other files, but setting the data_dir
+        # to the sources directory ensures that we never read files outside of the
+        # sources directory)
+        data_dir = self._file_system.get_sources_dir()
+        # create a reader (explorer) for the input file
+        explorer = DepthFirstExplorer(
+            num_test_entries=self._num_test_entries,
+            threshold=self._ratio_valid_entries,
+            maximum_depth=self._maximum_depth,
+            # extra args
+            max_num_lines_mol_block=self._max_num_lines_mol_block,
+            data_dir=data_dir,
+        )
+        read_input_step = ReadInputStep(explorer, message.source_id)
+        # read the input file
+        entries = read_input_step()
+        # iterate through the entries
+        # create batches of size checkpoint_size
+        # limit the number of molecules to max_num_molecules
+        batches = batched(entries, self._checkpoint_size)
+        num_entries = 0
+        num_checkpoints = 0
+        for i, batch in enumerate(batches):
+            # max_num_molecules might be reached within the batch
+            num_store = min(len(batch), self._max_num_molecules - num_entries)
+            # store batch in data_dir
+            with self._file_system.get_checkpoint_file_handle(job_id, i, "wb") as f:
+                results = list(batch[:num_store])
+                # TODO: use a model for storing the batches
+                # check all items for mol values and use PropertyMol for those
+                # in order to keep molecular properties (thanks, RDKit! :/ )
+                def _check_value(value: Any) -> Any:
+                    if isinstance(value, Mol):
+                        return PropertyMol(value)
+                    return value
+                def _check_item(item: dict) -> dict:
+                    return {key: _check_value(value) for key, value in item.items()}
+                results = [_check_item(item) for item in results]
+                dump(results, f)
+            # send a tuple to topic cypstrate-checkpoints
+            await self.channel.checkpoints_topic(job_type).send(
+                CheckpointMessage(
+                    job_id=job_id,
+                    checkpoint_id=i,
+                    params=message.params,
+                )
+            )
+            num_entries += num_store
+            num_checkpoints += 1
+            if num_entries >= self._max_num_molecules:
+                break
+        logger.info(f"Wrote {i+1} checkpoints containing {num_entries} entries for job {job_id}")
+        # send a warning message if there were more molecules in the job than allowed
+        too_many_molecules = num_store < len(batch)
+        try:
+            # try to get another entry
+            next(entries)
+            # if we get here, there was another entry and we need to send a warning
+            too_many_molecules = True
+        except StopIteration:
+            pass
+        if too_many_molecules:
+            await self.channel.logs_topic().send(
+                LogMessage(
+                    job_id=job_id,
+                    message_type="warning",
+                    message=(
+                        f"The provided job contains more than "
+                        f"{self._max_num_molecules} input structures. Only the "
+                        f"first {self._max_num_molecules} will be processed."
+                    ),
+                )
+            )
+        # at the end, send a tuple to topic job-sizes with the overall size
+        # of the job
+        await self.channel.logs_topic().send(
+            LogMessage(
+                job_id=job_id,
+                message_type="report_job_size",
+                num_entries=num_entries,
+                num_checkpoints=num_checkpoints,
+            )
+        )

nerdd_link-0.2.11/nerdd_link/actions/register_module_action.py ADDED Viewed

@@ -0,0 +1,37 @@
+import json
+import logging
+from nerdd_module import Model
+from ..channels import Channel
+from ..files import FileSystem
+from ..types import ModuleMessage, SystemMessage
+from .action import Action
+__all__ = ["RegisterModuleAction"]
+logger = logging.getLogger(__name__)
+class RegisterModuleAction(Action[SystemMessage]):
+    def __init__(self, channel: Channel, model: Model, data_dir: str):
+        super().__init__(channel.system_topic())
+        # TODO: do this differently
+        assert hasattr(model, "get_config")
+        self._model = model
+        self._file_system = FileSystem(data_dir)
+    async def _process_message(self, message: SystemMessage) -> None:
+        config = self._model.get_config()
+        logger.info(f"Registering module with id {config.id}")
+        # save module as json to file
+        module_file = self._file_system.get_module_file_path(config.id)
+        json.dump(config.model_dump(), open(module_file, "w"))
+        # send the initialization message
+        await self.channel.modules_topic().send(ModuleMessage(id=config.id))
+    def _get_group_name(self) -> str:
+        model_id = self._model.get_config().id
+        return f"register-module-{model_id}"

nerdd_link-0.2.11/nerdd_link/actions/serialize_job_action.py ADDED Viewed

@@ -0,0 +1,64 @@
+import json
+import logging
+from nerdd_module import OutputStep
+from ..channels import Channel
+from ..delegates import ReadPickleStep, SerializeJobModel
+from ..files import FileSystem
+from ..types import SerializationRequestMessage, SerializationResultMessage
+from .action import Action
+__all__ = ["SerializeJobAction"]
+logger = logging.getLogger(__name__)
+class SerializeJobAction(Action[SerializationRequestMessage]):
+    def __init__(self, channel: Channel, data_dir: str) -> None:
+        super().__init__(channel.serialization_requests_topic())
+        self._file_system = FileSystem(data_dir)
+    async def _process_message(self, message: SerializationRequestMessage) -> None:
+        job_id = message.job_id
+        job_type = message.job_type
+        params = message.params
+        output_format = message.output_format
+        logger.info(f"Write output for job {job_id} in format {output_format}")
+        # remove specific parameter keys that could induce vulnerabilities
+        params.pop("output_file", None)
+        params.pop("output_format", None)
+        # obtain output file
+        output_file = self._file_system.get_output_file(job_id, output_format)
+        # get the configuration for the job_type
+        config_file = self._file_system.get_module_file_path(job_type)
+        config = json.load(open(config_file, "r"))
+        # create a fake model instance to get the postprocessing steps
+        model = SerializeJobModel(config)
+        steps = [
+            # read the result checkpoint files in the correct order
+            ReadPickleStep(self._file_system.iter_results_file_handles(job_id)),
+            # don't preprocess, don't do prediction, only post-process
+            *model._get_postprocessing_steps(output_format, output_file=output_file, **params),
+        ]
+        output_step = steps[-1]
+        assert isinstance(output_step, OutputStep), "The last step must be an OutputStep."
+        # build the pipeline from the list of steps
+        pipeline = None
+        for t in steps:
+            pipeline = t(pipeline)
+        # run the pipeline by calling the get_result method of the last step
+        output_step.get_result()
+        await self.channel.serialization_results_topic().send(
+            SerializationResultMessage(job_id=job_id, output_format=output_format)
+        )

nerdd_link-0.2.11/nerdd_link/channels/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .channel import *
+from .kafka_channel import *
+from .memory_channel import *