fhir-pyrate 0.2.0b9__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fhir_pyrate/__init__.py +1 -1
- fhir_pyrate/ahoy.py +14 -7
- fhir_pyrate/dicom_downloader.py +140 -68
- fhir_pyrate/miner.py +17 -18
- fhir_pyrate/pirate.py +106 -80
- fhir_pyrate/util/__init__.py +2 -6
- fhir_pyrate/util/bundle_processing_templates.py +7 -4
- fhir_pyrate/util/fhirobj.py +2 -2
- fhir_pyrate/util/imports.py +3 -3
- fhir_pyrate/util/token_auth.py +27 -23
- fhir_pyrate/util/util.py +9 -5
- {fhir_pyrate-0.2.0b9.dist-info → fhir_pyrate-0.2.2.dist-info}/METADATA +79 -24
- fhir_pyrate-0.2.2.dist-info/RECORD +15 -0
- {fhir_pyrate-0.2.0b9.dist-info → fhir_pyrate-0.2.2.dist-info}/WHEEL +1 -1
- fhir_pyrate-0.2.0b9.dist-info/RECORD +0 -15
- {fhir_pyrate-0.2.0b9.dist-info → fhir_pyrate-0.2.2.dist-info}/LICENSE +0 -0
fhir_pyrate/__init__.py
CHANGED
fhir_pyrate/ahoy.py
CHANGED
@@ -33,18 +33,22 @@ class Ahoy:
     :param token_refresh_delta: Either a timedelta object that tells us how often the token
     should be refreshed, or a number of minutes; this does not need to be specified for JWT tokens
     that contain the expiry date
+    :param session: The session that can be used for the authentication. This is particularly
+    useful if you have some particular requirements for your authentication (e.g. you need to
+    support for cusum self-signed certificates).
     """

     def __init__(
         self,
-        auth_url: str = None,
+        auth_url: Optional[str] = None,
         auth_type: Optional[str] = "token",
-        refresh_url: str = None,
-        username: str = None,
+        refresh_url: Optional[str] = None,
+        username: Optional[str] = None,
         auth_method: Optional[str] = "password",
-        token: str = None,
+        token: Optional[str] = None,
         max_login_attempts: int = 5,
-        token_refresh_delta: Union[int, timedelta] = None,
+        token_refresh_delta: Optional[Union[int, timedelta]] = None,
+        session: Optional[requests.Session] = None,
     ) -> None:
         self.auth_type = auth_type
         self.auth_method = auth_method
@@ -54,7 +58,10 @@ class Ahoy:
         self._user_env_name = "FHIR_USER"
         self._pass_env_name = "FHIR_PASSWORD"
         self.token = token
+        if session is None:
+            self.session = requests.Session()
+        else:
+            self.session = session
         self.max_login_attempts = max_login_attempts
         self.token_refresh_delta = token_refresh_delta
         if self.auth_type is not None and self.auth_method is not None:
@@ -75,7 +82,7 @@ class Ahoy:
         self.close()

     def change_environment_variable_name(
-        self, user_env: str = None, pass_env: str = None
+        self, user_env: Optional[str] = None, pass_env: Optional[str] = None
     ) -> None:
         """
         Change the name of the variables used to retrieve username and password.
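The main functional change in ahoy.py is the new `session` argument, which lets the caller hand in a pre-configured `requests.Session` (for example one that trusts a self-signed certificate) instead of the default session created internally. A minimal usage sketch based only on the signature shown above; the URL and certificate path are placeholders, not values from the package, and the remaining constructor defaults are assumed to apply:

    import requests

    from fhir_pyrate import Ahoy

    # Assumption: the FHIR server uses a self-signed certificate stored locally.
    session = requests.Session()
    session.verify = "/path/to/self-signed-ca.pem"

    auth = Ahoy(
        auth_url="https://fhir.example.com/auth",  # placeholder URL
        session=session,  # new in 0.2.2: custom session used for the authentication requests
    )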
fhir_pyrate/dicom_downloader.py
CHANGED
@@ -1,17 +1,31 @@
 import hashlib
 import io
 import logging
+import multiprocessing
 import os
 import pathlib
 import platform
 import shutil
+import signal
 import sys
 import tempfile
 import traceback
 import warnings
 from contextlib import contextmanager
+from functools import partial
 from types import TracebackType
-from typing import
+from typing import (
+    ClassVar,
+    Dict,
+    FrozenSet,
+    Generator,
+    List,
+    Optional,
+    TextIO,
+    Tuple,
+    Type,
+    Union,
+)

 import pandas as pd
 import pydicom
@@ -64,7 +78,7 @@ def fileno(file_or_fd: TextIO) -> Optional[int]:
 @contextmanager
 def stdout_redirected(
     to: Union[str, TextIO] = os.devnull, stdout: Optional[TextIO] = None
-) -> Generator:
+) -> Generator[Optional[TextIO], None, None]:
     if platform.system() == "Windows":
         yield None
         return
@@ -130,25 +144,28 @@ class DicomDownloader:
     study will always end up in the same folder.
     :param retry: This flag will set the retry parameter of the DicomWebClient, which activates
     HTTP retrying.
+    :param num_processes: The number of processes to run for downloading
     """

-    ACCEPTED_FORMATS =
+    ACCEPTED_FORMATS: ClassVar[FrozenSet[str]] = frozenset(
+        {
+            ".dcm",
+            ".nia",
+            ".nii",
+            ".nii.gz",
+            ".hdr",
+            ".img",
+            ".img.gz",
+            ".tif",
+            ".TIF",
+            ".tiff",
+            ".TIFF",
+            ".mha",
+            ".mhd",
+            ".nrrd",
+            ".nhdr",
+        }
+    )

     def __init__(
         self,
@@ -160,6 +177,7 @@ class DicomDownloader:
         turn_off_checks: bool = False,
         always_download_in_study_folder: bool = False,
         retry: bool = False,
+        num_processes: int = 1,
     ):
         self.dicom_web_url = dicom_web_url
         self._close_session_on_exit = False
@@ -185,6 +203,7 @@ class DicomDownloader:
         self.turn_off_checks = turn_off_checks
         self.always_download_in_study_folder = always_download_in_study_folder
         self.hierarchical_storage = hierarchical_storage
+        self.num_processes = num_processes

     def set_output_format(self, new_output_format: str) -> None:
         """
@@ -233,7 +252,7 @@ class DicomDownloader:
     @staticmethod
     def get_download_id(
         study_uid: str,
-        series_uid: str = None,
+        series_uid: Optional[str] = None,
         always_download_in_study_folder: bool = False,
     ) -> str:
         """
@@ -253,7 +272,7 @@ class DicomDownloader:

     def get_download_path(self, download_id: str) -> pathlib.Path:
         """
+        Build the folder hierarchy where the data will be stored. The hierarchy depends on the
         `hierarchical_storage` parameter. Given a download ID
         263a1dad02916f5eca3c4eec51dc9d281735b47b8eb8bc2343c56e6ccd and `hierarchical_storage` = 2,
         the data will be stored in 26/3a/1dad02916f5eca3c4eec51dc9d281735b47b8eb8bc2343c56e6ccd.
@@ -271,13 +290,13 @@ class DicomDownloader:
     def download_data(
         self,
         study_uid: str,
-        series_uid: str = None,
+        series_uid: Optional[str] = None,
         output_dir: Union[str, pathlib.Path] = "out",
         save_metadata: bool = True,
         existing_ids: Optional[List[str]] = None,
     ) -> Tuple[List[Dict[str, str]], List[Dict[str, str]]]:
         """
+        Download the data related to the StudyInstanceUID and SeriesInstanceUID (if given,
         otherwise the entire study will be downloaded).

         :param study_uid: The StudyInstanceUID
@@ -327,7 +346,7 @@ class DicomDownloader:
         base_dict[self.series_instance_uid_field] = series_uid

         # Init the readers/writers
-        series_reader = sitk.ImageSeriesReader()
+        series_reader = sitk.ImageSeriesReader()
         with tempfile.TemporaryDirectory() as tmp_dir:
             # Create the download dir
             current_tmp_dir = pathlib.Path(tmp_dir)
@@ -355,11 +374,11 @@ class DicomDownloader:
             progress_bar.close()

             # Get Series ID names from folder
-            series_uids = sitk.ImageSeriesReader.GetGDCMSeriesIDs(str(current_tmp_dir))
+            series_uids = sitk.ImageSeriesReader.GetGDCMSeriesIDs(str(current_tmp_dir))
             logger.info(f"Study ID has {len(series_uids)} series.")
             for series in series_uids:
                 # Get the DICOMs corresponding to the series
-                files = series_reader.GetGDCMSeriesFileNames(
+                files = series_reader.GetGDCMSeriesFileNames(
                     str(current_tmp_dir), series
                 )
                 current_dict = base_dict.copy()
@@ -368,11 +387,12 @@ class DicomDownloader:
                 )
                 try:
                     # Read the series
-                    with
-                    f,
+                    with (
+                        simpleitk_warning_file.open("w") as f,
+                        stdout_redirected(f, stdout=sys.stderr),
                     ):
-                        series_reader.SetFileNames(files)
-                        image = series_reader.Execute()
+                        series_reader.SetFileNames(files)
+                        image = series_reader.Execute()
                     with simpleitk_warning_file.open("r") as f:
                         content = f.read()
                         if "warning" in content.lower():
@@ -425,9 +445,9 @@ class DicomDownloader:
                         series_download_dir / f"{series}_meta.dcm",
                     )
                     dcm_info = pydicom.dcmread(str(files[0]), stop_before_pixels=True)
-                    current_dict[
+                    current_dict[self.deid_study_instance_uid_field] = (
+                        dcm_info.StudyInstanceUID
+                    )
                     current_dict[self.deid_series_instance_uid_field] = series
                     downloaded_series_info.append(current_dict)

@@ -436,7 +456,7 @@ class DicomDownloader:
     def fix_mapping_dataframe(
         self,
         df: pd.DataFrame,
-        mapping_df: pd.DataFrame = None,
+        mapping_df: Optional[pd.DataFrame] = None,
         output_dir: Union[str, pathlib.Path] = "out",
         study_uid_col: str = "study_instance_uid",
         series_uid_col: str = "series_instance_uid",
@@ -458,7 +478,8 @@ class DicomDownloader:
         output_dir = pathlib.Path(output_dir)
         if not output_dir.exists() or not len(list(output_dir.glob("*"))):
             warnings.warn(
-                "Cannot fix the mapping file if the output directory does not exist."
+                "Cannot fix the mapping file if the output directory does not exist.",
+                stacklevel=2,
             )
             return None
         if mapping_df is None:
@@ -503,13 +524,45 @@ class DicomDownloader:
         new_df = pd.concat([mapping_df, pd.DataFrame(csv_rows)])
         return new_df

+    def _download_helper(
+        self,
+        uids: Tuple[str, Optional[str]],
+        existing_ids: Optional[List[str]],
+        output_dir: pathlib.Path,
+        save_metadata: bool = True,
+    ) -> Tuple[Optional[List[Dict[str, str]]], Optional[List[Dict[str, str]]]]:
+        study_uid, series_uid = uids
+        with logging_redirect_tqdm():
+            try:
+                download_info, error_info = self.download_data(
+                    study_uid=study_uid,
+                    series_uid=series_uid,
+                    output_dir=output_dir,
+                    save_metadata=save_metadata,
+                    existing_ids=existing_ids,
+                )
+                return download_info, error_info
+            except Exception:
+                # If any error happens that is not caught, just go to the next one
+                logger.error(traceback.format_exc())
+                return None, [
+                    {
+                        self.study_instance_uid_field: study_uid,
+                        self.series_instance_uid_field: series_uid
+                        if series_uid
+                        else "",
+                        self.error_type_field: "Other Error",
+                        self.traceback_field: traceback.format_exc(),
+                    }
+                ]
+
     def download_data_from_dataframe(
         self,
         df: pd.DataFrame,
         output_dir: Union[str, pathlib.Path] = "out",
         study_uid_col: str = "study_instance_uid",
         series_uid_col: Optional[str] = "series_instance_uid",
-        mapping_df: pd.DataFrame = None,
+        mapping_df: Optional[pd.DataFrame] = None,
         download_full_study: bool = False,
         save_metadata: bool = True,
     ) -> Tuple[pd.DataFrame, pd.DataFrame]:
@@ -555,45 +608,64 @@ class DicomDownloader:
             warnings.warn(
                 "download_full_study = False will only download a specified series but "
                 "have not provided a valid Series UID column of the DataFrame, "
-                "as a result the full study will be downloaded."
+                "as a result the full study will be downloaded.",
+                stacklevel=2,
             )

         # Create list of rows
         csv_rows = []
         error_rows = []
+
+        func = partial(
+            self._download_helper,
+            existing_ids=existing_ids,
+            output_dir=output_dir,
+            save_metadata=save_metadata,
+        )
+
+        rows = [
+            [
+                getattr(row, study_uid_col),
+                getattr(row, series_uid_col)
+                if not download_full_study and series_uid_col is not None
+                else None,
+            ]
+            for row in df.itertuples(index=False)
+        ]
+        if self.num_processes > 1:
+            with multiprocessing.Pool(
+                self.num_processes,
+                initializer=signal.signal,
+                initargs=(signal.SIGINT, signal.SIG_IGN),
+            ) as pool:
                 try:
-                    download_info, error_info
+                    for download_info, error_info in tqdm(
+                        pool.imap_unordered(func, rows),
+                        total=len(df),
+                        desc="Downloading Rows",
+                    ):
+                        if download_info is not None:
+                            csv_rows += download_info
+                        if error_info is not None:
+                            error_rows += error_info
                 except KeyboardInterrupt:
-                    csv_rows += download_info
-                    error_rows += error_info
+                    logger.info("Keyboard Interrupt, terminating the pool.")
+        else:
+            try:
+                for row in tqdm(
+                    rows,
+                    total=len(df),
+                    desc="Downloading Rows",
+                ):
+                    download_info, error_info = func(row)
+                    if download_info is not None:
+                        csv_rows += download_info
+                    if error_info is not None:
+                        error_rows += error_info
+            except KeyboardInterrupt:
+                logger.info("Keyboard Interrupt, terminating the pool.")
+
         new_mapping_df = pd.concat([mapping_df, pd.DataFrame(csv_rows)])
         error_df = pd.DataFrame(error_rows)
+
         return new_mapping_df, error_df
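The dicom_downloader.py changes move the per-row download logic into `_download_helper` and add a `num_processes` option, so that `download_data_from_dataframe` can fan rows out over a `multiprocessing.Pool` when it is greater than 1. A minimal usage sketch based only on the parameters visible in this diff; the DICOMweb URL and UIDs are placeholders, and the constructor may accept further arguments (e.g. an authenticated session) that are not shown here:

    import pandas as pd

    from fhir_pyrate import DicomDownloader

    # One row per series to download; the column names are the defaults from the diff.
    df = pd.DataFrame(
        {
            "study_instance_uid": ["1.2.3.4.5"],     # placeholder UID
            "series_instance_uid": ["1.2.3.4.5.6"],  # placeholder UID
        }
    )

    downloader = DicomDownloader(
        dicom_web_url="https://pacs.example.com/dicom-web",  # placeholder URL
        num_processes=4,  # new in 0.2.2: >1 downloads rows in a multiprocessing.Pool
    )
    mapping_df, error_df = downloader.download_data_from_dataframe(
        df,
        output_dir="out",
        save_metadata=True,
    )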
fhir_pyrate/miner.py
CHANGED
@@ -29,9 +29,9 @@ class Miner:
     def __init__(
         self,
         target_regex: str,
-        negation_regex: str = None,
-        regex_flags: Union[int, re.RegexFlag] = None,
-        decode_text: Callable = None,
+        negation_regex: Optional[str] = None,
+        regex_flags: Optional[Union[int, re.RegexFlag]] = None,
+        decode_text: Optional[Callable[[str], str]] = None,
         nlp_lib: str = "de_core_news_sm",
         num_processes: int = 1,
     ) -> None:
@@ -49,6 +49,7 @@ class Miner:
                 "this will probably not work, because it needs access to your home "
                 "directory. Please run python -m spacy download {nlp_lib} in your "
                 "docker file.",
+                stacklevel=2,
             )
             subprocess.run(
                 f"python3 -m spacy download {nlp_lib}".split(" "),
@@ -66,7 +67,7 @@ class Miner:
     @staticmethod
     def _remove_header(sentences: List[Span], main_document_keyword: str) -> List[Span]:
         """
+        Remove all sentences that come before a sentence that contains the `main_document_keyword`.
         This is useful when a document has a header, and we know what the first viable word of a
         document is, or we know that we are interested in some particular part of the
         document that comes after a certain keyword.
@@ -86,10 +87,10 @@ class Miner:
     def _check_diagnostic_report(
         self,
         report_text: str,
-        main_document_keyword: str = "",
+        main_document_keyword: Optional[str] = "",
     ) -> Optional[List[Span]]:
         """
+        Check whether a report contains the relevant RegEx and does not contain the negation
         RegEx (if specified).

         :param report_text: The text to be searched
@@ -103,7 +104,7 @@ class Miner:
         contains_target = re.search(self.target_regex, report_text, self.regex_flags)
         relevant_sentences = []
         if contains_target:
-            sentences =
+            sentences = list(self.nlp(report_text).sents)
             if main_document_keyword is not None:
                 sentences = self._remove_header(sentences, main_document_keyword)

@@ -129,10 +130,10 @@ class Miner:
         df: pd.DataFrame,
         text_column_name: str,
         new_column_name: str = "text_found",
-        main_document_keyword: str = None,
+        main_document_keyword: Optional[str] = None,
     ) -> pd.DataFrame:
         """
+        Search the strings contained in `text_column_name` for the selected RegEx, and adds two
         columns to the DataFrame with the output of the NLP search. The negation RegEx can be
         used to exclude sentences. Additionally, it is possible to define a `main_document_keyword`,
         which is a string that can be used to filter out the header of the document.
@@ -151,31 +152,29 @@ class Miner:
             self._check_diagnostic_report,
             main_document_keyword=main_document_keyword,
         )
-        texts =
+        texts = list(df[text_column_name].values)
         tqdm_text = f"Searching for Sentences with {self.target_regex}"
         if self.negation_regex is not None:
             tqdm_text += f" and without {self.negation_regex}"
         if self.num_processes > 1:
             pool = multiprocessing.Pool(self.num_processes)
-            results =
-            for result in tqdm(
+            results = list(
+                tqdm(
                     pool.imap(func, texts),
                     total=len(df),
                     desc=tqdm_text,
                 )
+            )
             pool.close()
             pool.join()
         else:
-            results =
-            for result in tqdm(
+            results = list(
+                tqdm(
                     [func(text) for text in texts],
                     total=len(df),
                     desc=tqdm_text,
                 )
+            )

         df[new_column_name + "_sentences"] = results
         df[new_column_name] = ~df[new_column_name + "_sentences"].isna()
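The miner.py changes are mostly typing fixes (explicit `Optional[...]` defaults), restored docstrings, and collecting the tqdm iterators into `results = list(tqdm(...))`. For context, a usage sketch of the DataFrame search described in the docstrings above; the method name `nlp_on_dataframe`, the regexes, and the example texts are assumptions for illustration and are not taken from this diff:

    import pandas as pd

    from fhir_pyrate import Miner

    reports = pd.DataFrame(
        {"report_text": ["Es zeigt sich eine Fraktur.", "Kein Anhalt für eine Fraktur."]}
    )

    miner = Miner(
        target_regex="Fraktur",
        negation_regex="[Kk]ein",    # hypothetical negation pattern
        nlp_lib="de_core_news_sm",   # default German spaCy model from the diff
        num_processes=1,
    )
    # Assumed entry point; per the docstring, adds a "<new_column_name>_sentences"
    # column with the matching sentences and a boolean "<new_column_name>" column.
    result_df = miner.nlp_on_dataframe(
        reports,
        text_column_name="report_text",
        new_column_name="text_found",
    )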
|