fhir-pyrate 0.2.0b9__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fhir_pyrate/__init__.py +1 -1
- fhir_pyrate/ahoy.py +14 -7
- fhir_pyrate/dicom_downloader.py +140 -68
- fhir_pyrate/miner.py +17 -18
- fhir_pyrate/pirate.py +106 -80
- fhir_pyrate/util/__init__.py +2 -6
- fhir_pyrate/util/bundle_processing_templates.py +7 -4
- fhir_pyrate/util/fhirobj.py +2 -2
- fhir_pyrate/util/imports.py +3 -3
- fhir_pyrate/util/token_auth.py +27 -23
- fhir_pyrate/util/util.py +9 -5
- {fhir_pyrate-0.2.0b9.dist-info → fhir_pyrate-0.2.2.dist-info}/METADATA +79 -24
- fhir_pyrate-0.2.2.dist-info/RECORD +15 -0
- {fhir_pyrate-0.2.0b9.dist-info → fhir_pyrate-0.2.2.dist-info}/WHEEL +1 -1
- fhir_pyrate-0.2.0b9.dist-info/RECORD +0 -15
- {fhir_pyrate-0.2.0b9.dist-info → fhir_pyrate-0.2.2.dist-info}/LICENSE +0 -0
fhir_pyrate/__init__.py
CHANGED
fhir_pyrate/ahoy.py
CHANGED
@@ -33,18 +33,22 @@ class Ahoy:
     :param token_refresh_delta: Either a timedelta object that tells us how often the token
     should be refreshed, or a number of minutes; this does not need to be specified for JWT tokens
     that contain the expiry date
+    :param session: The session that can be used for the authentication. This is particularly
+    useful if you have some particular requirements for your authentication (e.g. you need to
+    support for cusum self-signed certificates).
     """

     def __init__(
         self,
-        auth_url: str = None,
+        auth_url: Optional[str] = None,
         auth_type: Optional[str] = "token",
-        refresh_url: str = None,
-        username: str = None,
+        refresh_url: Optional[str] = None,
+        username: Optional[str] = None,
         auth_method: Optional[str] = "password",
-        token: str = None,
+        token: Optional[str] = None,
         max_login_attempts: int = 5,
-        token_refresh_delta: Union[int, timedelta] = None,
+        token_refresh_delta: Optional[Union[int, timedelta]] = None,
+        session: Optional[requests.Session] = None,
     ) -> None:
         self.auth_type = auth_type
         self.auth_method = auth_method
@@ -54,7 +58,10 @@ class Ahoy:
         self._user_env_name = "FHIR_USER"
         self._pass_env_name = "FHIR_PASSWORD"
         self.token = token
+        if session is None:
+            self.session = requests.Session()
+        else:
+            self.session = session
         self.max_login_attempts = max_login_attempts
         self.token_refresh_delta = token_refresh_delta
         if self.auth_type is not None and self.auth_method is not None:
@@ -75,7 +82,7 @@ class Ahoy:
         self.close()

     def change_environment_variable_name(
-        self, user_env: str = None, pass_env: str = None
+        self, user_env: Optional[str] = None, pass_env: Optional[str] = None
     ) -> None:
         """
         Change the name of the variables used to retrieve username and password.
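The main functional change in ahoy.py is the new `session` argument, which lets the caller hand in a pre-configured `requests.Session` (for example one that trusts a self-signed certificate) instead of the default session created internally. A minimal usage sketch based only on the signature shown above; the URL and certificate path are placeholders, not values from the package, and the remaining constructor defaults are assumed to apply:

    import requests

    from fhir_pyrate import Ahoy

    # Assumption: the FHIR server uses a self-signed certificate stored locally.
    session = requests.Session()
    session.verify = "/path/to/self-signed-ca.pem"

    auth = Ahoy(
        auth_url="https://fhir.example.com/auth",  # placeholder URL
        session=session,  # new in 0.2.2: custom session used for the authentication requests
    )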
fhir_pyrate/dicom_downloader.py
CHANGED
@@ -1,17 +1,31 @@
 import hashlib
 import io
 import logging
+import multiprocessing
 import os
 import pathlib
 import platform
 import shutil
+import signal
 import sys
 import tempfile
 import traceback
 import warnings
 from contextlib import contextmanager
+from functools import partial
 from types import TracebackType
-from typing import
+from typing import (
+    ClassVar,
+    Dict,
+    FrozenSet,
+    Generator,
+    List,
+    Optional,
+    TextIO,
+    Tuple,
+    Type,
+    Union,
+)

 import pandas as pd
 import pydicom
@@ -64,7 +78,7 @@ def fileno(file_or_fd: TextIO) -> Optional[int]:
 @contextmanager
 def stdout_redirected(
     to: Union[str, TextIO] = os.devnull, stdout: Optional[TextIO] = None
-) -> Generator:
+) -> Generator[Optional[TextIO], None, None]:
     if platform.system() == "Windows":
         yield None
         return
@@ -130,25 +144,28 @@ class DicomDownloader:
     study will always end up in the same folder.
     :param retry: This flag will set the retry parameter of the DicomWebClient, which activates
     HTTP retrying.
+    :param num_processes: The number of processes to run for downloading
     """

-    ACCEPTED_FORMATS =
+    ACCEPTED_FORMATS: ClassVar[FrozenSet[str]] = frozenset(
+        {
+            ".dcm",
+            ".nia",
+            ".nii",
+            ".nii.gz",
+            ".hdr",
+            ".img",
+            ".img.gz",
+            ".tif",
+            ".TIF",
+            ".tiff",
+            ".TIFF",
+            ".mha",
+            ".mhd",
+            ".nrrd",
+            ".nhdr",
+        }
+    )

     def __init__(
         self,
@@ -160,6 +177,7 @@ class DicomDownloader:
         turn_off_checks: bool = False,
         always_download_in_study_folder: bool = False,
         retry: bool = False,
+        num_processes: int = 1,
     ):
         self.dicom_web_url = dicom_web_url
         self._close_session_on_exit = False
@@ -185,6 +203,7 @@ class DicomDownloader:
         self.turn_off_checks = turn_off_checks
         self.always_download_in_study_folder = always_download_in_study_folder
         self.hierarchical_storage = hierarchical_storage
+        self.num_processes = num_processes

     def set_output_format(self, new_output_format: str) -> None:
         """
@@ -233,7 +252,7 @@ class DicomDownloader:
     @staticmethod
     def get_download_id(
         study_uid: str,
-        series_uid: str = None,
+        series_uid: Optional[str] = None,
         always_download_in_study_folder: bool = False,
     ) -> str:
         """
@@ -253,7 +272,7 @@ class DicomDownloader:

     def get_download_path(self, download_id: str) -> pathlib.Path:
         """
+        Build the folder hierarchy where the data will be stored. The hierarchy depends on the
         `hierarchical_storage` parameter. Given a download ID
         263a1dad02916f5eca3c4eec51dc9d281735b47b8eb8bc2343c56e6ccd and `hierarchical_storage` = 2,
         the data will be stored in 26/3a/1dad02916f5eca3c4eec51dc9d281735b47b8eb8bc2343c56e6ccd.
@@ -271,13 +290,13 @@ class DicomDownloader:
     def download_data(
         self,
         study_uid: str,
-        series_uid: str = None,
+        series_uid: Optional[str] = None,
         output_dir: Union[str, pathlib.Path] = "out",
         save_metadata: bool = True,
         existing_ids: Optional[List[str]] = None,
     ) -> Tuple[List[Dict[str, str]], List[Dict[str, str]]]:
         """
+        Download the data related to the StudyInstanceUID and SeriesInstanceUID (if given,
         otherwise the entire study will be downloaded).

         :param study_uid: The StudyInstanceUID
@@ -327,7 +346,7 @@ class DicomDownloader:
         base_dict[self.series_instance_uid_field] = series_uid

         # Init the readers/writers
-        series_reader = sitk.ImageSeriesReader()
+        series_reader = sitk.ImageSeriesReader()
         with tempfile.TemporaryDirectory() as tmp_dir:
             # Create the download dir
             current_tmp_dir = pathlib.Path(tmp_dir)
@@ -355,11 +374,11 @@ class DicomDownloader:
             progress_bar.close()

             # Get Series ID names from folder
-            series_uids = sitk.ImageSeriesReader.GetGDCMSeriesIDs(str(current_tmp_dir))
+            series_uids = sitk.ImageSeriesReader.GetGDCMSeriesIDs(str(current_tmp_dir))
             logger.info(f"Study ID has {len(series_uids)} series.")
             for series in series_uids:
                 # Get the DICOMs corresponding to the series
-                files = series_reader.GetGDCMSeriesFileNames(
+                files = series_reader.GetGDCMSeriesFileNames(
                     str(current_tmp_dir), series
                 )
                 current_dict = base_dict.copy()
@@ -368,11 +387,12 @@ class DicomDownloader:
                 )
                 try:
                     # Read the series
-                    with
-                    f,
+                    with (
+                        simpleitk_warning_file.open("w") as f,
+                        stdout_redirected(f, stdout=sys.stderr),
                     ):
-                        series_reader.SetFileNames(files)
-                        image = series_reader.Execute()
+                        series_reader.SetFileNames(files)
+                        image = series_reader.Execute()
                     with simpleitk_warning_file.open("r") as f:
                         content = f.read()
                         if "warning" in content.lower():
@@ -425,9 +445,9 @@ class DicomDownloader:
                         series_download_dir / f"{series}_meta.dcm",
                     )
                     dcm_info = pydicom.dcmread(str(files[0]), stop_before_pixels=True)
-                    current_dict[
+                    current_dict[self.deid_study_instance_uid_field] = (
+                        dcm_info.StudyInstanceUID
+                    )
                     current_dict[self.deid_series_instance_uid_field] = series
                     downloaded_series_info.append(current_dict)

@@ -436,7 +456,7 @@ class DicomDownloader:
     def fix_mapping_dataframe(
         self,
         df: pd.DataFrame,
-        mapping_df: pd.DataFrame = None,
+        mapping_df: Optional[pd.DataFrame] = None,
         output_dir: Union[str, pathlib.Path] = "out",
         study_uid_col: str = "study_instance_uid",
         series_uid_col: str = "series_instance_uid",
@@ -458,7 +478,8 @@ class DicomDownloader:
         output_dir = pathlib.Path(output_dir)
         if not output_dir.exists() or not len(list(output_dir.glob("*"))):
             warnings.warn(
-                "Cannot fix the mapping file if the output directory does not exist."
+                "Cannot fix the mapping file if the output directory does not exist.",
+                stacklevel=2,
             )
             return None
         if mapping_df is None:
@@ -503,13 +524,45 @@ class DicomDownloader:
         new_df = pd.concat([mapping_df, pd.DataFrame(csv_rows)])
         return new_df

+    def _download_helper(
+        self,
+        uids: Tuple[str, Optional[str]],
+        existing_ids: Optional[List[str]],
+        output_dir: pathlib.Path,
+        save_metadata: bool = True,
+    ) -> Tuple[Optional[List[Dict[str, str]]], Optional[List[Dict[str, str]]]]:
+        study_uid, series_uid = uids
+        with logging_redirect_tqdm():
+            try:
+                download_info, error_info = self.download_data(
+                    study_uid=study_uid,
+                    series_uid=series_uid,
+                    output_dir=output_dir,
+                    save_metadata=save_metadata,
+                    existing_ids=existing_ids,
+                )
+                return download_info, error_info
+            except Exception:
+                # If any error happens that is not caught, just go to the next one
+                logger.error(traceback.format_exc())
+                return None, [
+                    {
+                        self.study_instance_uid_field: study_uid,
+                        self.series_instance_uid_field: series_uid
+                        if series_uid
+                        else "",
+                        self.error_type_field: "Other Error",
+                        self.traceback_field: traceback.format_exc(),
+                    }
+                ]
+
     def download_data_from_dataframe(
         self,
         df: pd.DataFrame,
         output_dir: Union[str, pathlib.Path] = "out",
         study_uid_col: str = "study_instance_uid",
         series_uid_col: Optional[str] = "series_instance_uid",
-        mapping_df: pd.DataFrame = None,
+        mapping_df: Optional[pd.DataFrame] = None,
         download_full_study: bool = False,
         save_metadata: bool = True,
     ) -> Tuple[pd.DataFrame, pd.DataFrame]:
@@ -555,45 +608,64 @@ class DicomDownloader:
             warnings.warn(
                 "download_full_study = False will only download a specified series but "
                 "have not provided a valid Series UID column of the DataFrame, "
-                "as a result the full study will be downloaded."
+                "as a result the full study will be downloaded.",
+                stacklevel=2,
             )

         # Create list of rows
         csv_rows = []
         error_rows = []
+
+        func = partial(
+            self._download_helper,
+            existing_ids=existing_ids,
+            output_dir=output_dir,
+            save_metadata=save_metadata,
+        )
+
+        rows = [
+            [
+                getattr(row, study_uid_col),
+                getattr(row, series_uid_col)
+                if not download_full_study and series_uid_col is not None
+                else None,
+            ]
+            for row in df.itertuples(index=False)
+        ]
+        if self.num_processes > 1:
+            with multiprocessing.Pool(
+                self.num_processes,
+                initializer=signal.signal,
+                initargs=(signal.SIGINT, signal.SIG_IGN),
+            ) as pool:
                 try:
-                    download_info, error_info
+                    for download_info, error_info in tqdm(
+                        pool.imap_unordered(func, rows),
+                        total=len(df),
+                        desc="Downloading Rows",
+                    ):
+                        if download_info is not None:
+                            csv_rows += download_info
+                        if error_info is not None:
+                            error_rows += error_info
                 except KeyboardInterrupt:
-                    csv_rows += download_info
-                    error_rows += error_info
+                    logger.info("Keyboard Interrupt, terminating the pool.")
+        else:
+            try:
+                for row in tqdm(
+                    rows,
+                    total=len(df),
+                    desc="Downloading Rows",
+                ):
+                    download_info, error_info = func(row)
+                    if download_info is not None:
+                        csv_rows += download_info
+                    if error_info is not None:
+                        error_rows += error_info
+            except KeyboardInterrupt:
+                logger.info("Keyboard Interrupt, terminating the pool.")
+
         new_mapping_df = pd.concat([mapping_df, pd.DataFrame(csv_rows)])
         error_df = pd.DataFrame(error_rows)
+
         return new_mapping_df, error_df
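The dicom_downloader.py changes move the per-row download logic into `_download_helper` and add a `num_processes` option, so that `download_data_from_dataframe` can fan rows out over a `multiprocessing.Pool` when it is greater than 1. A minimal usage sketch based only on the parameters visible in this diff; the DICOMweb URL and UIDs are placeholders, and the constructor may accept further arguments (e.g. an authenticated session) that are not shown here:

    import pandas as pd

    from fhir_pyrate import DicomDownloader

    # One row per series to download; the column names are the defaults from the diff.
    df = pd.DataFrame(
        {
            "study_instance_uid": ["1.2.3.4.5"],     # placeholder UID
            "series_instance_uid": ["1.2.3.4.5.6"],  # placeholder UID
        }
    )

    downloader = DicomDownloader(
        dicom_web_url="https://pacs.example.com/dicom-web",  # placeholder URL
        num_processes=4,  # new in 0.2.2: >1 downloads rows in a multiprocessing.Pool
    )
    mapping_df, error_df = downloader.download_data_from_dataframe(
        df,
        output_dir="out",
        save_metadata=True,
    )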
fhir_pyrate/miner.py
CHANGED
@@ -29,9 +29,9 @@ class Miner:
     def __init__(
         self,
         target_regex: str,
-        negation_regex: str = None,
-        regex_flags: Union[int, re.RegexFlag] = None,
-        decode_text: Callable = None,
+        negation_regex: Optional[str] = None,
+        regex_flags: Optional[Union[int, re.RegexFlag]] = None,
+        decode_text: Optional[Callable[[str], str]] = None,
         nlp_lib: str = "de_core_news_sm",
         num_processes: int = 1,
     ) -> None:
@@ -49,6 +49,7 @@ class Miner:
                 "this will probably not work, because it needs access to your home "
                 "directory. Please run python -m spacy download {nlp_lib} in your "
                 "docker file.",
+                stacklevel=2,
             )
             subprocess.run(
                 f"python3 -m spacy download {nlp_lib}".split(" "),
@@ -66,7 +67,7 @@ class Miner:
     @staticmethod
     def _remove_header(sentences: List[Span], main_document_keyword: str) -> List[Span]:
         """
+        Remove all sentences that come before a sentence that contains the `main_document_keyword`.
         This is useful when a document has a header, and we know what the first viable word of a
         document is, or we know that we are interested in some particular part of the
         document that comes after a certain keyword.
@@ -86,10 +87,10 @@ class Miner:
     def _check_diagnostic_report(
         self,
         report_text: str,
-        main_document_keyword: str = "",
+        main_document_keyword: Optional[str] = "",
     ) -> Optional[List[Span]]:
         """
+        Check whether a report contains the relevant RegEx and does not contain the negation
         RegEx (if specified).

         :param report_text: The text to be searched
@@ -103,7 +104,7 @@ class Miner:
         contains_target = re.search(self.target_regex, report_text, self.regex_flags)
         relevant_sentences = []
         if contains_target:
-            sentences =
+            sentences = list(self.nlp(report_text).sents)
             if main_document_keyword is not None:
                 sentences = self._remove_header(sentences, main_document_keyword)

@@ -129,10 +130,10 @@ class Miner:
         df: pd.DataFrame,
         text_column_name: str,
         new_column_name: str = "text_found",
-        main_document_keyword: str = None,
+        main_document_keyword: Optional[str] = None,
     ) -> pd.DataFrame:
         """
+        Search the strings contained in `text_column_name` for the selected RegEx, and adds two
         columns to the DataFrame with the output of the NLP search. The negation RegEx can be
         used to exclude sentences. Additionally, it is possible to define a `main_document_keyword`,
         which is a string that can be used to filter out the header of the document.
@@ -151,31 +152,29 @@ class Miner:
             self._check_diagnostic_report,
             main_document_keyword=main_document_keyword,
         )
-        texts =
+        texts = list(df[text_column_name].values)
         tqdm_text = f"Searching for Sentences with {self.target_regex}"
         if self.negation_regex is not None:
             tqdm_text += f" and without {self.negation_regex}"
         if self.num_processes > 1:
             pool = multiprocessing.Pool(self.num_processes)
-            results =
-            for result in tqdm(
+            results = list(
+                tqdm(
                     pool.imap(func, texts),
                     total=len(df),
                     desc=tqdm_text,
                 )
+            )
             pool.close()
             pool.join()
         else:
-            results =
-            for result in tqdm(
+            results = list(
+                tqdm(
                     [func(text) for text in texts],
                     total=len(df),
                     desc=tqdm_text,
                 )
+            )

         df[new_column_name + "_sentences"] = results
         df[new_column_name] = ~df[new_column_name + "_sentences"].isna()
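The miner.py changes are mostly typing fixes (explicit `Optional[...]` defaults), restored docstrings, and collecting the tqdm iterators into `results = list(tqdm(...))`. For context, a usage sketch of the DataFrame search described in the docstrings above; the method name `nlp_on_dataframe`, the regexes, and the example texts are assumptions for illustration and are not taken from this diff:

    import pandas as pd

    from fhir_pyrate import Miner

    reports = pd.DataFrame(
        {"report_text": ["Es zeigt sich eine Fraktur.", "Kein Anhalt für eine Fraktur."]}
    )

    miner = Miner(
        target_regex="Fraktur",
        negation_regex="[Kk]ein",    # hypothetical negation pattern
        nlp_lib="de_core_news_sm",   # default German spaCy model from the diff
        num_processes=1,
    )
    # Assumed entry point; per the docstring, adds a "<new_column_name>_sentences"
    # column with the matching sentences and a boolean "<new_column_name>" column.
    result_df = miner.nlp_on_dataframe(
        reports,
        text_column_name="report_text",
        new_column_name="text_found",
    )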
|