fhir-pyrate 0.2.0b9__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fhir_pyrate/__init__.py CHANGED
@@ -24,7 +24,7 @@ DicomDownloader, _ = fhir_pyrate.util.imports.optional_import(
 
 __all__ = [
     "Ahoy",
+    "DicomDownloader",
     "Miner",
     "Pirate",
-    "DicomDownloader",
 ]
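
For illustration only (not part of the diff): a minimal sketch of the package-level imports affected by the `__all__` change above; per the hunk header, `DicomDownloader` is resolved through `optional_import`, so the assumption here is that its optional imaging dependencies are installed.

    # Sketch: the four classes re-exported in fhir_pyrate/__init__.py.
    # DicomDownloader only resolves if the optional imaging extras are installed.
    from fhir_pyrate import Ahoy, DicomDownloader, Miner, Pirate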
fhir_pyrate/ahoy.py CHANGED
@@ -33,18 +33,22 @@ class Ahoy:
     :param token_refresh_delta: Either a timedelta object that tells us how often the token
     should be refreshed, or a number of minutes; this does not need to be specified for JWT tokens
     that contain the expiry date
+    :param session: The session that can be used for the authentication. This is particularly
+    useful if you have some particular requirements for your authentication (e.g. you need to
+    support custom self-signed certificates).
     """

     def __init__(
         self,
-        auth_url: str = None,
+        auth_url: Optional[str] = None,
         auth_type: Optional[str] = "token",
-        refresh_url: str = None,
-        username: str = None,
+        refresh_url: Optional[str] = None,
+        username: Optional[str] = None,
         auth_method: Optional[str] = "password",
-        token: str = None,
+        token: Optional[str] = None,
         max_login_attempts: int = 5,
-        token_refresh_delta: Union[int, timedelta] = None,
+        token_refresh_delta: Optional[Union[int, timedelta]] = None,
+        session: Optional[requests.Session] = None,
     ) -> None:
         self.auth_type = auth_type
         self.auth_method = auth_method
@@ -54,7 +58,10 @@ class Ahoy:
         self._user_env_name = "FHIR_USER"
         self._pass_env_name = "FHIR_PASSWORD"
         self.token = token
-        self.session = requests.Session()
+        if session is None:
+            self.session = requests.Session()
+        else:
+            self.session = session
         self.max_login_attempts = max_login_attempts
         self.token_refresh_delta = token_refresh_delta
         if self.auth_type is not None and self.auth_method is not None:
@@ -75,7 +82,7 @@ class Ahoy:
         self.close()

     def change_environment_variable_name(
-        self, user_env: str = None, pass_env: str = None
+        self, user_env: Optional[str] = None, pass_env: Optional[str] = None
     ) -> None:
         """
         Change the name of the variables used to retrieve username and password.
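
For illustration only (not part of the diff): a hedged usage sketch of the new `session` parameter added to `Ahoy` above. The URL, username, and certificate path are hypothetical; the argument names come from the signature shown in the diff.

    import requests
    from fhir_pyrate import Ahoy

    # Hypothetical session customized with a self-signed CA bundle; any other
    # requests.Session setup (proxies, extra headers, ...) would work the same way.
    session = requests.Session()
    session.verify = "/path/to/internal-ca.pem"

    auth = Ahoy(
        auth_url="https://fhir.example.com/auth",  # hypothetical endpoint
        auth_type="token",
        username="fhir-user",                      # hypothetical user
        session=session,                           # new in this release
    )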
fhir_pyrate/dicom_downloader.py CHANGED
@@ -1,17 +1,31 @@
 import hashlib
 import io
 import logging
+import multiprocessing
 import os
 import pathlib
 import platform
 import shutil
+import signal
 import sys
 import tempfile
 import traceback
 import warnings
 from contextlib import contextmanager
+from functools import partial
 from types import TracebackType
-from typing import Dict, Generator, List, Optional, TextIO, Tuple, Type, Union
+from typing import (
+    ClassVar,
+    Dict,
+    FrozenSet,
+    Generator,
+    List,
+    Optional,
+    TextIO,
+    Tuple,
+    Type,
+    Union,
+)

 import pandas as pd
 import pydicom
@@ -64,7 +78,7 @@ def fileno(file_or_fd: TextIO) -> Optional[int]:
 @contextmanager
 def stdout_redirected(
     to: Union[str, TextIO] = os.devnull, stdout: Optional[TextIO] = None
-) -> Generator:
+) -> Generator[Optional[TextIO], None, None]:
     if platform.system() == "Windows":
         yield None
         return
@@ -130,25 +144,28 @@ class DicomDownloader:
     study will always end up in the same folder.
     :param retry: This flag will set the retry parameter of the DicomWebClient, which activates
     HTTP retrying.
+    :param num_processes: The number of processes to run for downloading
     """

-    ACCEPTED_FORMATS = {
-        ".dcm",
-        ".nia",
-        ".nii",
-        ".nii.gz",
-        ".hdr",
-        ".img",
-        ".img.gz",
-        ".tif",
-        ".TIF",
-        ".tiff",
-        ".TIFF",
-        ".mha",
-        ".mhd",
-        ".nrrd",
-        ".nhdr",
-    }
+    ACCEPTED_FORMATS: ClassVar[FrozenSet[str]] = frozenset(
+        {
+            ".dcm",
+            ".nia",
+            ".nii",
+            ".nii.gz",
+            ".hdr",
+            ".img",
+            ".img.gz",
+            ".tif",
+            ".TIF",
+            ".tiff",
+            ".TIFF",
+            ".mha",
+            ".mhd",
+            ".nrrd",
+            ".nhdr",
+        }
+    )

     def __init__(
         self,
@@ -160,6 +177,7 @@ class DicomDownloader:
         turn_off_checks: bool = False,
         always_download_in_study_folder: bool = False,
         retry: bool = False,
+        num_processes: int = 1,
     ):
         self.dicom_web_url = dicom_web_url
         self._close_session_on_exit = False
@@ -185,6 +203,7 @@ class DicomDownloader:
         self.turn_off_checks = turn_off_checks
         self.always_download_in_study_folder = always_download_in_study_folder
         self.hierarchical_storage = hierarchical_storage
+        self.num_processes = num_processes

     def set_output_format(self, new_output_format: str) -> None:
         """
@@ -233,7 +252,7 @@ class DicomDownloader:
     @staticmethod
     def get_download_id(
         study_uid: str,
-        series_uid: str = None,
+        series_uid: Optional[str] = None,
         always_download_in_study_folder: bool = False,
     ) -> str:
         """
@@ -253,7 +272,7 @@ class DicomDownloader:

     def get_download_path(self, download_id: str) -> pathlib.Path:
         """
-        Builds the folder hierarchy where the data will be stored. The hierarchy depends on the
+        Build the folder hierarchy where the data will be stored. The hierarchy depends on the
         `hierarchical_storage` parameter. Given a download ID
         263a1dad02916f5eca3c4eec51dc9d281735b47b8eb8bc2343c56e6ccd and `hierarchical_storage` = 2,
         the data will be stored in 26/3a/1dad02916f5eca3c4eec51dc9d281735b47b8eb8bc2343c56e6ccd.
@@ -271,13 +290,13 @@ class DicomDownloader:
     def download_data(
         self,
         study_uid: str,
-        series_uid: str = None,
+        series_uid: Optional[str] = None,
         output_dir: Union[str, pathlib.Path] = "out",
         save_metadata: bool = True,
         existing_ids: Optional[List[str]] = None,
     ) -> Tuple[List[Dict[str, str]], List[Dict[str, str]]]:
         """
-        Downloads the data related to the StudyInstanceUID and SeriesInstanceUID (if given,
+        Download the data related to the StudyInstanceUID and SeriesInstanceUID (if given,
         otherwise the entire study will be downloaded).

         :param study_uid: The StudyInstanceUID
@@ -327,7 +346,7 @@ class DicomDownloader:
         base_dict[self.series_instance_uid_field] = series_uid

         # Init the readers/writers
-        series_reader = sitk.ImageSeriesReader()  # type: ignore
+        series_reader = sitk.ImageSeriesReader()
         with tempfile.TemporaryDirectory() as tmp_dir:
             # Create the download dir
             current_tmp_dir = pathlib.Path(tmp_dir)
@@ -355,11 +374,11 @@ class DicomDownloader:
             progress_bar.close()

             # Get Series ID names from folder
-            series_uids = sitk.ImageSeriesReader.GetGDCMSeriesIDs(str(current_tmp_dir))  # type: ignore
+            series_uids = sitk.ImageSeriesReader.GetGDCMSeriesIDs(str(current_tmp_dir))
             logger.info(f"Study ID has {len(series_uids)} series.")
             for series in series_uids:
                 # Get the DICOMs corresponding to the series
-                files = series_reader.GetGDCMSeriesFileNames(  # type: ignore
+                files = series_reader.GetGDCMSeriesFileNames(
                     str(current_tmp_dir), series
                 )
                 current_dict = base_dict.copy()
@@ -368,11 +387,12 @@ class DicomDownloader:
                 )
                 try:
                     # Read the series
-                    with simpleitk_warning_file.open("w") as f, stdout_redirected(
-                        f, stdout=sys.stderr
+                    with (
+                        simpleitk_warning_file.open("w") as f,
+                        stdout_redirected(f, stdout=sys.stderr),
                     ):
-                        series_reader.SetFileNames(files)  # type: ignore
-                        image = series_reader.Execute()  # type: ignore
+                        series_reader.SetFileNames(files)
+                        image = series_reader.Execute()
                     with simpleitk_warning_file.open("r") as f:
                         content = f.read()
                         if "warning" in content.lower():
@@ -425,9 +445,9 @@ class DicomDownloader:
                         series_download_dir / f"{series}_meta.dcm",
                     )
                     dcm_info = pydicom.dcmread(str(files[0]), stop_before_pixels=True)
-                    current_dict[
-                        self.deid_study_instance_uid_field
-                    ] = dcm_info.StudyInstanceUID
+                    current_dict[self.deid_study_instance_uid_field] = (
+                        dcm_info.StudyInstanceUID
+                    )
                     current_dict[self.deid_series_instance_uid_field] = series
                     downloaded_series_info.append(current_dict)

@@ -436,7 +456,7 @@ class DicomDownloader:
     def fix_mapping_dataframe(
         self,
         df: pd.DataFrame,
-        mapping_df: pd.DataFrame = None,
+        mapping_df: Optional[pd.DataFrame] = None,
         output_dir: Union[str, pathlib.Path] = "out",
         study_uid_col: str = "study_instance_uid",
         series_uid_col: str = "series_instance_uid",
@@ -458,7 +478,8 @@ class DicomDownloader:
         output_dir = pathlib.Path(output_dir)
         if not output_dir.exists() or not len(list(output_dir.glob("*"))):
             warnings.warn(
-                "Cannot fix the mapping file if the output directory does not exist."
+                "Cannot fix the mapping file if the output directory does not exist.",
+                stacklevel=2,
             )
             return None
         if mapping_df is None:
@@ -503,13 +524,45 @@ class DicomDownloader:
         new_df = pd.concat([mapping_df, pd.DataFrame(csv_rows)])
         return new_df

+    def _download_helper(
+        self,
+        uids: Tuple[str, Optional[str]],
+        existing_ids: Optional[List[str]],
+        output_dir: pathlib.Path,
+        save_metadata: bool = True,
+    ) -> Tuple[Optional[List[Dict[str, str]]], Optional[List[Dict[str, str]]]]:
+        study_uid, series_uid = uids
+        with logging_redirect_tqdm():
+            try:
+                download_info, error_info = self.download_data(
+                    study_uid=study_uid,
+                    series_uid=series_uid,
+                    output_dir=output_dir,
+                    save_metadata=save_metadata,
+                    existing_ids=existing_ids,
+                )
+                return download_info, error_info
+            except Exception:
+                # If any error happens that is not caught, just go to the next one
+                logger.error(traceback.format_exc())
+                return None, [
+                    {
+                        self.study_instance_uid_field: study_uid,
+                        self.series_instance_uid_field: series_uid
+                        if series_uid
+                        else "",
+                        self.error_type_field: "Other Error",
+                        self.traceback_field: traceback.format_exc(),
+                    }
+                ]
+
     def download_data_from_dataframe(
         self,
         df: pd.DataFrame,
         output_dir: Union[str, pathlib.Path] = "out",
         study_uid_col: str = "study_instance_uid",
         series_uid_col: Optional[str] = "series_instance_uid",
-        mapping_df: pd.DataFrame = None,
+        mapping_df: Optional[pd.DataFrame] = None,
         download_full_study: bool = False,
         save_metadata: bool = True,
     ) -> Tuple[pd.DataFrame, pd.DataFrame]:
@@ -555,45 +608,64 @@ class DicomDownloader:
             warnings.warn(
                 "download_full_study = False will only download a specified series but "
                 "have not provided a valid Series UID column of the DataFrame, "
-                "as a result the full study will be downloaded."
+                "as a result the full study will be downloaded.",
+                stacklevel=2,
             )

         # Create list of rows
         csv_rows = []
         error_rows = []
-        for row in tqdm(
-            df.itertuples(index=False), total=len(df), desc="Downloading Rows"
-        ):
-            with logging_redirect_tqdm():
+
+        func = partial(
+            self._download_helper,
+            existing_ids=existing_ids,
+            output_dir=output_dir,
+            save_metadata=save_metadata,
+        )
+
+        rows = [
+            [
+                getattr(row, study_uid_col),
+                getattr(row, series_uid_col)
+                if not download_full_study and series_uid_col is not None
+                else None,
+            ]
+            for row in df.itertuples(index=False)
+        ]
+        if self.num_processes > 1:
+            with multiprocessing.Pool(
+                self.num_processes,
+                initializer=signal.signal,
+                initargs=(signal.SIGINT, signal.SIG_IGN),
+            ) as pool:
                 try:
-                    download_info, error_info = self.download_data(
-                        study_uid=getattr(row, study_uid_col),
-                        series_uid=getattr(row, series_uid_col)
-                        if not download_full_study and series_uid_col is not None
-                        else None,
-                        output_dir=output_dir,
-                        save_metadata=save_metadata,
-                        existing_ids=existing_ids,
-                    )
+                    for download_info, error_info in tqdm(
+                        pool.imap_unordered(func, rows),
+                        total=len(df),
+                        desc="Downloading Rows",
+                    ):
+                        if download_info is not None:
+                            csv_rows += download_info
+                        if error_info is not None:
+                            error_rows += error_info
                 except KeyboardInterrupt:
-                    break
-                except Exception:
-                    # If any error happens that is not caught, just go to the next one
-                    error_rows += [
-                        {
-                            self.study_instance_uid_field: getattr(row, study_uid_col),
-                            self.series_instance_uid_field: getattr(row, series_uid_col)
-                            if isinstance(series_uid_col, str)
-                            and getattr(row, series_uid_col) is not None
-                            else None,
-                            self.error_type_field: "Other Error",
-                            self.traceback_field: traceback.format_exc(),
-                        }
-                    ]
-                    logger.error(traceback.format_exc())
-                    continue
-                csv_rows += download_info
-                error_rows += error_info
+                    logger.info("Keyboard Interrupt, terminating the pool.")
+        else:
+            try:
+                for row in tqdm(
+                    rows,
+                    total=len(df),
+                    desc="Downloading Rows",
+                ):
+                    download_info, error_info = func(row)
+                    if download_info is not None:
+                        csv_rows += download_info
+                    if error_info is not None:
+                        error_rows += error_info
+            except KeyboardInterrupt:
+                logger.info("Keyboard Interrupt, terminating the pool.")
+
         new_mapping_df = pd.concat([mapping_df, pd.DataFrame(csv_rows)])
         error_df = pd.DataFrame(error_rows)
+
         return new_mapping_df, error_df
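
For illustration only (not part of the diff): a rough sketch of the new parallel download path shown above. The `auth` keyword, the endpoint URLs, and the example UIDs are assumptions; the column names, `num_processes`, and `download_data_from_dataframe` arguments mirror the signatures visible in the diff.

    import pandas as pd
    from fhir_pyrate import Ahoy, DicomDownloader

    auth = Ahoy(auth_url="https://fhir.example.com/auth")      # hypothetical
    downloader = DicomDownloader(
        auth=auth,                                              # assumed keyword
        dicom_web_url="https://pacs.example.com/dicomweb",      # hypothetical endpoint
        num_processes=4,  # new: > 1 switches to the multiprocessing.Pool branch
    )

    # Hypothetical UIDs; the column names match the defaults in the diff.
    df = pd.DataFrame(
        {
            "study_instance_uid": ["1.2.840.113619.2.1"],
            "series_instance_uid": ["1.2.840.113619.2.1.1"],
        }
    )
    mapping_df, error_df = downloader.download_data_from_dataframe(
        df,
        output_dir="out",
        save_metadata=True,
    )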
fhir_pyrate/miner.py CHANGED
@@ -29,9 +29,9 @@ class Miner:
     def __init__(
         self,
         target_regex: str,
-        negation_regex: str = None,
-        regex_flags: Union[int, re.RegexFlag] = None,
-        decode_text: Callable = None,
+        negation_regex: Optional[str] = None,
+        regex_flags: Optional[Union[int, re.RegexFlag]] = None,
+        decode_text: Optional[Callable[[str], str]] = None,
         nlp_lib: str = "de_core_news_sm",
         num_processes: int = 1,
     ) -> None:
@@ -49,6 +49,7 @@ class Miner:
                 "this will probably not work, because it needs access to your home "
                 "directory. Please run python -m spacy download {nlp_lib} in your "
                 "docker file.",
+                stacklevel=2,
             )
             subprocess.run(
                 f"python3 -m spacy download {nlp_lib}".split(" "),
@@ -66,7 +67,7 @@ class Miner:
     @staticmethod
     def _remove_header(sentences: List[Span], main_document_keyword: str) -> List[Span]:
         """
-        Removes all sentences that come before a sentence that contains the `main_document_keyword`.
+        Remove all sentences that come before a sentence that contains the `main_document_keyword`.
         This is useful when a document has a header, and we know what the first viable word of a
         document is, or we know that we are interested in some particular part of the
         document that comes after a certain keyword.
@@ -86,10 +87,10 @@ class Miner:
     def _check_diagnostic_report(
         self,
         report_text: str,
-        main_document_keyword: str = "",
+        main_document_keyword: Optional[str] = "",
     ) -> Optional[List[Span]]:
         """
-        Checks whether a report contains the relevant RegEx and does not contain the negation
+        Check whether a report contains the relevant RegEx and does not contain the negation
         RegEx (if specified).

         :param report_text: The text to be searched
@@ -103,7 +104,7 @@ class Miner:
         contains_target = re.search(self.target_regex, report_text, self.regex_flags)
         relevant_sentences = []
         if contains_target:
-            sentences = [i for i in self.nlp(report_text).sents]
+            sentences = list(self.nlp(report_text).sents)
             if main_document_keyword is not None:
                 sentences = self._remove_header(sentences, main_document_keyword)

@@ -129,10 +130,10 @@ class Miner:
         df: pd.DataFrame,
         text_column_name: str,
         new_column_name: str = "text_found",
-        main_document_keyword: str = None,
+        main_document_keyword: Optional[str] = None,
     ) -> pd.DataFrame:
         """
-        Searches the strings contained in `text_column_name` for the selected RegEx, and adds two
+        Search the strings contained in `text_column_name` for the selected RegEx, and adds two
         columns to the DataFrame with the output of the NLP search. The negation RegEx can be
         used to exclude sentences. Additionally, it is possible to define a `main_document_keyword`,
         which is a string that can be used to filter out the header of the document.
@@ -151,31 +152,29 @@ class Miner:
             self._check_diagnostic_report,
             main_document_keyword=main_document_keyword,
         )
-        texts = [row for row in df[text_column_name].values]
+        texts = list(df[text_column_name].values)
         tqdm_text = f"Searching for Sentences with {self.target_regex}"
         if self.negation_regex is not None:
             tqdm_text += f" and without {self.negation_regex}"
         if self.num_processes > 1:
             pool = multiprocessing.Pool(self.num_processes)
-            results = [
-                result
-                for result in tqdm(
+            results = list(
+                tqdm(
                     pool.imap(func, texts),
                     total=len(df),
                     desc=tqdm_text,
                 )
-            ]
+            )
             pool.close()
             pool.join()
         else:
-            results = [
-                result
-                for result in tqdm(
+            results = list(
+                tqdm(
                     [func(text) for text in texts],
                     total=len(df),
                     desc=tqdm_text,
                 )
-            ]
+            )

         df[new_column_name + "_sentences"] = results
         df[new_column_name] = ~df[new_column_name + "_sentences"].isna()
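
For illustration only (not part of the diff): a short sketch of how the retyped `Miner` arguments fit together. The regex patterns, the example report, and the `nlp_on_dataframe` entry point are assumptions; the parameter names come from the signatures visible above.

    import pandas as pd
    from fhir_pyrate import Miner

    miner = Miner(
        target_regex="Pneumothorax",   # hypothetical target pattern
        negation_regex="kein",         # hypothetical negation pattern
        nlp_lib="de_core_news_sm",
        num_processes=2,               # > 1 uses the multiprocessing.Pool branch
    )

    reports = pd.DataFrame({"report_text": ["Kein Pneumothorax nachweisbar."]})
    # Assumed entry point; it adds the `text_found` and `text_found_sentences` columns.
    result = miner.nlp_on_dataframe(
        reports,
        text_column_name="report_text",
        new_column_name="text_found",
    )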