PyPI - nerdd-module - Version diff - 0.3.48 (tar.gz) → 0.3.50 (tar.gz) - Mend

nerdd-module 0.3.48 (tar.gz) → 0.3.50 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (103)

{nerdd_module-0.3.48 → nerdd_module-0.3.50}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nerdd-module
-Version: 0.3.48
+Version: 0.3.50
 Summary: Base package to create NERDD modules
 Author-email: Steffen Hirte <steffen.hirte@univie.ac.at>
 Maintainer-email: Steffen Hirte <steffen.hirte@univie.ac.at>

{nerdd_module-0.3.48 → nerdd_module-0.3.50}/nerdd_module/input/depth_first_explorer.py RENAMED Viewed

@@ -8,16 +8,17 @@ __all__ = ["DepthFirstExplorer"]
 class InvalidInputReader(Reader):
-    def __init__(self) -> None:
+    def __init__(self, message: str = "Invalid input") -> None:
         super().__init__()
+        self.message = message
     def read(self, input: Any, explore: ExploreCallable) -> Iterator[MoleculeEntry]:
         yield MoleculeEntry(
             raw_input=input,
             input_type="unknown",
-            source=("input",),
+            source=("raw_input",),
             mol=None,
-            errors=[Problem("invalid_input", "Invalid input")],
+            errors=[Problem("invalid_input", self.message)],
         )
     def __repr__(self) -> str:
@@ -120,6 +121,8 @@ class DepthFirstExplorer(Explorer):
             if best_mode == "builtin":
                 parent["first_guess"].append(best_reader)
+        # In order to get more fine-grained error messages, we do not handle exceptions here and
+        # rely on the readers to do so.
         yield from sample
         yield from generator

{nerdd_module-0.3.48 → nerdd_module-0.3.50}/nerdd_module/input/gzip_reader.py RENAMED Viewed

@@ -1,6 +1,7 @@
 import gzip
 from typing import Any, Iterator
+from ..problem import Problem
 from .reader import ExploreCallable, MoleculeEntry, Reader
 __all__ = ["GzipReader"]
@@ -22,7 +23,16 @@ class GzipReader(Reader):
             f.read(1)
             f.seek(0)
-            yield from explore(f)
+            try:
+                yield from explore(f)
+            except Exception as e:
+                yield MoleculeEntry(
+                    raw_input="<gzip>",
+                    input_type="gzip",
+                    source=("raw_input",),
+                    mol=None,
+                    errors=[Problem("invalid_input", f"Invalid gzip file: {e}")],
+                )
     def __repr__(self) -> str:
         return "GzipReader()"

{nerdd_module-0.3.48 → nerdd_module-0.3.50}/nerdd_module/input/list_reader.py RENAMED Viewed

@@ -1,6 +1,7 @@
 from io import IOBase
 from typing import Any, Iterable, Iterator
+from ..problem import Problem
 from .reader import ExploreCallable, MoleculeEntry, Reader
 __all__ = ["ListReader"]
@@ -16,7 +17,19 @@ class ListReader(Reader):
         ), f"input must be an iterable, but is {type(input_iterable)}"
         for entry in input_iterable:
-            yield from explore(entry)
+            try:
+                yield from explore(entry)
+            except Exception as e:
+                raw_input = str(entry)
+                if len(raw_input) > 100:
+                    raw_input = raw_input[:97] + "..."
+                yield MoleculeEntry(
+                    raw_input=raw_input,
+                    input_type="unknown",
+                    source=(),
+                    mol=None,
+                    errors=[Problem("invalid_list_entry", f"Could not read list entry: {e}")],
+                )
     def __repr__(self) -> str:
         return "ListReader()"

{nerdd_module-0.3.48 → nerdd_module-0.3.50}/nerdd_module/input/sdf_reader.py RENAMED Viewed

@@ -11,7 +11,7 @@ __all__ = ["SdfReader"]
 class SdfReader(StreamReader):
-    def __init__(self, max_num_lines_mol_block: int = 10_000) -> None:
+    def __init__(self, max_num_lines_mol_block: int = 100_000) -> None:
         super().__init__()
         self.max_num_lines_mol_block = max_num_lines_mol_block

{nerdd_module-0.3.48 → nerdd_module-0.3.50}/nerdd_module/input/stream_reader.py RENAMED Viewed

@@ -1,6 +1,6 @@
 from abc import abstractmethod
 from codecs import getreader
-from typing import Any, Iterator
+from typing import Any, Iterator, Optional
 import chardet
@@ -10,8 +10,9 @@ __all__ = ["StreamReader"]
 class StreamReader(Reader):
-    def __init__(self) -> None:
+    def __init__(self, encoding: Optional[str] = "utf-8-sig") -> None:
         super().__init__()
+        self.encoding = encoding
     def read(self, input_stream: Any, explore: ExploreCallable) -> Iterator[MoleculeEntry]:
         if not hasattr(input_stream, "read") or not hasattr(input_stream, "seek"):
@@ -20,24 +21,28 @@ class StreamReader(Reader):
         input_stream.seek(0)
         #
-        # detect file encoding
+        # detect file encoding (if not provided)
         #
         # read a portion of the file's content
-        sample = input_stream.read(1_000_000)
-        result = chardet.detect(sample)
-        if result["confidence"] > 0.5 and result["encoding"] is not None:
-            encoding = result["encoding"]
+        if self.encoding is None:
+            sample = input_stream.read(1_000_000)
+            result = chardet.detect(sample)
+            if result["confidence"] > 0.5 and result["encoding"] is not None:
+                encoding = result["encoding"]
+            else:
+                encoding = "utf-8-sig"
+            input_stream.seek(0)
         else:
-            encoding = "utf-8"
-        input_stream.seek(0)
+            encoding = self.encoding
         #
         # read file
         #
         StreamReader = getreader(encoding)
-        reader = StreamReader(input_stream)
+        # errors="replace": replace invalid characters instead of failing
+        reader = StreamReader(input_stream, "replace")
         return self._read_stream(reader, explore)
     @abstractmethod

nerdd_module-0.3.50/nerdd_module/input/tar_reader.py ADDED Viewed

@@ -0,0 +1,50 @@
+import tarfile
+from typing import Any, Iterator, Tuple
+from ..problem import Problem
+from .reader import ExploreCallable, MoleculeEntry, Reader
+__all__ = ["TarReader"]
+class TarReader(Reader):
+    def __init__(self) -> None:
+        super().__init__()
+    def read(self, input_stream: Any, explore: ExploreCallable) -> Iterator[MoleculeEntry]:
+        if not hasattr(input_stream, "read") or not hasattr(input_stream, "seek"):
+            raise TypeError("input must be a stream-like object")
+        input_stream.seek(0)
+        with tarfile.open(fileobj=input_stream, mode="r") as tar:
+            for member in tar.getmembers():
+                if not member.isfile():
+                    continue
+                try:
+                    for entry in explore(tar.extractfile(member)):
+                        # the underlying reader only sees the file content as a stream
+                        # -> it might believe that the source is "raw_input"
+                        # -> we need to correct that here
+                        if len(entry.source) == 1 and entry.source[0] == "raw_input":
+                            source: Tuple[str, ...] = tuple()
+                        else:
+                            source = entry.source
+                        yield entry._replace(source=(member.name, *source))
+                except Exception as e:
+                    yield MoleculeEntry(
+                        raw_input="<tar>",
+                        input_type="unknown",
+                        source=(member.name,),
+                        mol=None,
+                        errors=[
+                            Problem(
+                                "invalid_tar_member",
+                                f"Could not read tar member '{member.name}': {e}",
+                            )
+                        ],
+                    )
+    def __repr__(self) -> str:
+        return "TarReader()"

nerdd_module-0.3.50/nerdd_module/input/zip_reader.py ADDED Viewed

@@ -0,0 +1,52 @@
+import zipfile
+from typing import Any, Iterator, Tuple
+from ..problem import Problem
+from .reader import ExploreCallable, MoleculeEntry, Reader
+__all__ = ["ZipReader"]
+class ZipReader(Reader):
+    def __init__(self) -> None:
+        super().__init__()
+    def read(self, input_stream: Any, explore: ExploreCallable) -> Iterator[MoleculeEntry]:
+        if not hasattr(input_stream, "read") or not hasattr(input_stream, "seek"):
+            raise TypeError("input must be a stream-like object")
+        input_stream.seek(0)
+        with zipfile.ZipFile(input_stream, "r") as zipf:
+            for member in zipf.namelist():
+                # check if the member is a file
+                if member.endswith("/"):
+                    continue
+                try:
+                    with zipf.open(member, "r") as f:
+                        for entry in explore(f):
+                            # the underlying reader only sees the file content as a stream
+                            # -> it might believe that the source is "raw_input"
+                            # -> we need to correct that here
+                            if len(entry.source) == 1 and entry.source[0] == "raw_input":
+                                source: Tuple[str, ...] = tuple()
+                            else:
+                                source = entry.source
+                            yield entry._replace(source=(member, *source))
+                except Exception as e:
+                    yield MoleculeEntry(
+                        raw_input="<zip>",
+                        input_type="unknown",
+                        source=(member,),
+                        mol=None,
+                        errors=[
+                            Problem(
+                                "invalid_zip_member", f"Could not read zip member '{member}': {e}"
+                            )
+                        ],
+                    )
+    def __repr__(self) -> str:
+        return "ZipReader()"

{nerdd_module-0.3.48 → nerdd_module-0.3.50}/nerdd_module/preprocessing/sanitize.py RENAMED Viewed

@@ -1,7 +1,13 @@
 import logging
 from typing import List, Optional, Tuple
-from rdkit.Chem import AtomKekulizeException, KekulizeException, Mol, SanitizeMol
+from rdkit.Chem import (
+    AtomKekulizeException,
+    AtomValenceException,
+    KekulizeException,
+    Mol,
+    SanitizeMol,
+)
 from ..problem import Problem
 from .preprocessing_step import PreprocessingStep
@@ -26,6 +32,8 @@ class Sanitize(PreprocessingStep):
             return None, [
                 Problem("atom_kekulization_error", "Failed kekulizing an atom in the molecule.")
             ]
+        except AtomValenceException as e:
+            return None, [Problem("valence_error", str(e))]
         except Exception as e:
             logger.exception(e)
             return None, [Problem("sanitization_error", "Failed sanitizing the molecule.")]

{nerdd_module-0.3.48 → nerdd_module-0.3.50}/nerdd_module.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nerdd-module
-Version: 0.3.48
+Version: 0.3.50
 Summary: Base package to create NERDD modules
 Author-email: Steffen Hirte <steffen.hirte@univie.ac.at>
 Maintainer-email: Steffen Hirte <steffen.hirte@univie.ac.at>

{nerdd_module-0.3.48 → nerdd_module-0.3.50}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "nerdd-module"
-version = "0.3.48"
+version = "0.3.50"
 description = "Base package to create NERDD modules"
 readme = "README.md"
 license = "BSD-3-Clause"

nerdd_module-0.3.48/nerdd_module/input/tar_reader.py DELETED Viewed

@@ -1,27 +0,0 @@
-import tarfile
-from typing import Any, Iterator
-from .reader import ExploreCallable, MoleculeEntry, Reader
-__all__ = ["TarReader"]
-class TarReader(Reader):
-    def __init__(self) -> None:
-        super().__init__()
-    def read(self, input_stream: Any, explore: ExploreCallable) -> Iterator[MoleculeEntry]:
-        if not hasattr(input_stream, "read") or not hasattr(input_stream, "seek"):
-            raise TypeError("input must be a stream-like object")
-        input_stream.seek(0)
-        with tarfile.open(fileobj=input_stream, mode="r") as tar:
-            for member in tar.getmembers():
-                if not member.isfile():
-                    continue
-                for entry in explore(tar.extractfile(member)):
-                    yield entry._replace(source=(member.name, *entry.source))
-    def __repr__(self) -> str:
-        return "TarReader()"

nerdd_module-0.3.48/nerdd_module/input/zip_reader.py DELETED Viewed

@@ -1,29 +0,0 @@
-import zipfile
-from typing import Any, Iterator
-from .reader import ExploreCallable, MoleculeEntry, Reader
-__all__ = ["ZipReader"]
-class ZipReader(Reader):
-    def __init__(self) -> None:
-        super().__init__()
-    def read(self, input_stream: Any, explore: ExploreCallable) -> Iterator[MoleculeEntry]:
-        if not hasattr(input_stream, "read") or not hasattr(input_stream, "seek"):
-            raise TypeError("input must be a stream-like object")
-        input_stream.seek(0)
-        with zipfile.ZipFile(input_stream, "r") as zipf:
-            for member in zipf.namelist():
-                # check if the member is a file
-                if member.endswith("/"):
-                    continue
-                with zipf.open(member, "r") as f:
-                    for entry in explore(f):
-                        yield entry._replace(source=(member, *entry.source))
-    def __repr__(self) -> str:
-        return "ZipReader()"