pathling 8.0.0.dev1__tar.gz → 8.0.0.dev3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pathling-8.0.0.dev1/pathling.egg-info → pathling-8.0.0.dev3}/PKG-INFO +1 -1
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/fhir_view.py +0 -1
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/_version.py +1 -1
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/bulk.py +106 -30
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/context.py +1 -109
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/datasink.py +12 -19
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/datasource.py +33 -13
- pathling-8.0.0.dev3/pathling/spark.py +89 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3/pathling.egg-info}/PKG-INFO +1 -1
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling.egg-info/SOURCES.txt +3 -3
- pathling-8.0.0.dev3/tests/test_bulk.py +62 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/tests/test_datasource.py +68 -4
- pathling-8.0.0.dev3/tests/test_spark.py +39 -0
- pathling-8.0.0.dev1/examples/member_of_old.py +0 -42
- pathling-8.0.0.dev1/examples/subsumes_old.py +0 -49
- pathling-8.0.0.dev1/examples/translate_old.py +0 -36
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/LICENSE +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/MANIFEST.in +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/README.md +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/bulk.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/data/bundles/Bennett146_Swaniawski813_704c9750-f6e6-473b-ee83-fbd48e07fe3f.json +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/data/bundles/Dino214_Parisian75_40d82b80-b682-cd8b-da6d-396809878641.json +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/data/resources/Condition.ndjson +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/data/resources/Patient.ndjson +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/designation.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/display.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/encode_bundles.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/encode_resources.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/member_of.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/property_of.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/subsumes.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/translate.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/__init__.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/coding.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/core.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/fhir.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/functions.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/udfs.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling.egg-info/dependency_links.txt +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling.egg-info/requires.txt +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling.egg-info/top_level.txt +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/setup.cfg +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/setup.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/tests/test_encoders.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/tests/test_functions.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/tests/test_udfs.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/tests/test_view.py +0 -0
pathling/bulk.py

@@ -13,16 +13,80 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from
-from
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from typing import List, Optional, Tuple, Callable
 
-from
+from py4j.java_gateway import JavaObject, JVMView
+from pyspark.sql import SparkSession
+
+
+@dataclass
+class FileResult:
+    """
+    Represents the result of a single file export operation.
+    """
+    source: str
+    """
+    The source URL of the exported file.
+    """
+    destination: str
+    """
+    The destination URL where the file was saved.
+    """
+    size: int
+    """
+    The size of the exported file in bytes.
+    """
+
+
+@dataclass
+class ExportResult:
+    """
+    Represents the result of a bulk export operation.
+    """
+    transaction_time: datetime
+    """
+    The time at which the transaction was processed at the server.
+    Corresponds to `transactionTime` in the bulk export response.
+    """
+    results: List[FileResult]
+    """
+    A list of FileResult objects representing the exported files.
+    """
+
+    @classmethod
+    def from_java(cls, java_result: JavaObject) -> 'ExportResult':
+        """
+        Create an ExportResult from a Java export result object.
+
+        :param java_result: The Java export result object
+        :return: A Python ExportResult object
+        """
+        # Convert transaction time from Java Instant to Python datetime
+        transaction_time = datetime.fromtimestamp(
+            java_result.getTransactionTime().toEpochMilli() / 1000.0, tz=timezone.utc)
+
+        # Convert file results
+        file_results = [
+            FileResult(
+                source=str(java_file_result.getSource()),
+                destination=str(java_file_result.getDestination()),
+                size=java_file_result.getSize())
+            for java_file_result in java_result.getResults()
+        ]
+
+        return cls(
+            transaction_time=transaction_time,
+            results=file_results
+        )
 
 
 class BulkExportClient:
     """
     A client for exporting data from the FHIR Bulk Data Access API.
     """
+
     def __init__(self, java_client):
         """
         Create a new BulkExportClient that wraps a Java BulkExportClient.

@@ -31,26 +95,27 @@ class BulkExportClient:
         """
         self._java_client = java_client
 
-    def export(self):
+    def export(self) -> ExportResult:
         """
         Export data from the FHIR server.
 
-        :return: The result of the export operation
+        :return: The result of the export operation as a Python ExportResult object
         """
-
+        java_result = self._java_client.export()
+        return ExportResult.from_java(java_result)
 
     @classmethod
     def _configure_builder(cls, jvm, builder, fhir_endpoint_url: str, output_dir: str,
-
-
-
-
-
-
-
-
-
-
+                           output_format: str = "application/fhir+ndjson",
+                           since: Optional[datetime] = None,
+                           types: Optional[List[str]] = None,
+                           elements: Optional[List[str]] = None,
+                           include_associated_data: Optional[List[str]] = None,
+                           type_filters: Optional[List[str]] = None,
+                           output_extension: str = "ndjson",
+                           timeout: Optional[int] = None,
+                           max_concurrent_downloads: int = 10,
+                           auth_config: Optional[dict] = None):
         """
         Configure common builder parameters.
 

@@ -119,7 +184,7 @@ class BulkExportClient:
 
         if auth_config is not None:
             auth_builder = jvm.au.csiro.fhir.auth.AuthConfig.builder()
-
+
             # Set defaults to match Java class
             auth_builder.enabled(False)
             auth_builder.useSMART(True)

@@ -150,11 +215,11 @@ class BulkExportClient:
         builder.withAuthConfig(auth_config_obj)
 
     @classmethod
-    def for_system(cls,
+    def for_system(cls, spark, *args, **kwargs) -> 'BulkExportClient':
         """
         Create a builder for a system-level export.
 
-        :param
+        :param spark: The SparkSession instance
         :param fhir_endpoint_url: The URL of the FHIR server to export from
         :param output_dir: The directory to write the output files to
         :param output_format: The format of the output data

@@ -169,18 +234,17 @@ class BulkExportClient:
         :param auth_config: Optional authentication configuration dictionary
         :return: A BulkExportClient configured for system-level export
         """
-
-        builder = client_class.systemBuilder()  # Returns a builder directly
+        builder, jvm = cls._create_builder(spark, lambda bc: bc.systemBuilder())
         cls._configure_builder(jvm, builder, *args, **kwargs)
         return cls(builder.build())
 
-    @classmethod
-    def for_group(cls,
+    @classmethod
+    def for_group(cls, spark, fhir_endpoint_url: str, output_dir: str,
                   group_id: str, *args, **kwargs) -> 'BulkExportClient':
         """
         Create a builder for a group-level export.
 
-        :param
+        :param spark: The SparkSession instance
         :param fhir_endpoint_url: The URL of the FHIR server to export from
         :param output_dir: The directory to write the output files to
         :param group_id: The ID of the group to export

@@ -196,19 +260,18 @@ class BulkExportClient:
         :param auth_config: Optional authentication configuration dictionary
         :return: A BulkExportClient configured for group-level export
         """
-        client_class = jvm.au.csiro.fhir.export.BulkExportClient
         # Pass group_id directly to groupBuilder
-        builder =
+        builder, jvm = cls._create_builder(spark, lambda bc: bc.groupBuilder(group_id))
         cls._configure_builder(jvm, builder, fhir_endpoint_url, output_dir, *args, **kwargs)
         return cls(builder.build())
 
     @classmethod
-    def for_patient(cls,
+    def for_patient(cls, spark, fhir_endpoint_url: str, output_dir: str,
                     patients: Optional[List[str]] = None, *args, **kwargs) -> 'BulkExportClient':
         """
         Create a builder for a patient-level export.
 
-        :param
+        :param spark: The SparkSession instance
         :param fhir_endpoint_url: The URL of the FHIR server to export from
         :param output_dir: The directory to write the output files to
         :param patients: List of patient references to include

@@ -224,11 +287,24 @@ class BulkExportClient:
         :param auth_config: Optional authentication configuration dictionary
         :return: A BulkExportClient configured for patient-level export
         """
-
-        builder = client_class.patientBuilder()  # Returns a builder directly
+        builder, jvm = cls._create_builder(spark, lambda bc: bc.patientBuilder())
         if patients is not None:
             for patient in patients:
                 ref = jvm.au.csiro.fhir.model.Reference.of(patient)
                 builder.withPatient(ref)
         cls._configure_builder(jvm, builder, fhir_endpoint_url, output_dir, *args, **kwargs)
         return cls(builder.build())
+
+    @classmethod
+    def _create_builder(cls,
+                        spark: SparkSession,
+                        factory_f: Callable[[JavaObject], JavaObject]) -> Tuple[
+        JavaObject, JVMView]:
+
+        jvm: JVMView = spark._jvm
+        client_class = jvm.au.csiro.fhir.export.BulkExportClient
+        builder: JavaObject = factory_f(client_class)
+        builder = builder.withFileStoreFactory(
+            jvm.au.csiro.filestore.hdfs.HdfsFileStoreFactory(spark._jsc.sc().hadoopConfiguration())
+        )
+        return (builder, jvm)
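The hunks above add the FileResult and ExportResult dataclasses, thread a SparkSession through the builder factory methods, and route file writing through an HDFS-backed file store factory. A minimal usage sketch of this API, inferred from the signatures above (the endpoint URL and output directory are illustrative placeholders, not values from the diff):

    from pyspark.sql import SparkSession
    from pathling.bulk import BulkExportClient

    spark = SparkSession.builder.getOrCreate()

    # System-level export; keyword arguments are forwarded to _configure_builder.
    result = BulkExportClient.for_system(
        spark,
        fhir_endpoint_url="https://example.org/fhir",  # placeholder
        output_dir="/tmp/bulk-export",                 # placeholder
        types=["Patient", "Condition"],
    ).export()

    # export() now returns a Python ExportResult rather than a Java object.
    print(result.transaction_time)
    for file_result in result.results:
        print(file_result.source, "->", file_result.destination, file_result.size)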
pathling/context.py

@@ -15,9 +15,8 @@
 
 # noinspection PyPackageRequirements
 
-from deprecated import deprecated
 from py4j.java_gateway import JavaObject
-from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql import DataFrame, SparkSession
 from typing import Optional, Sequence, TYPE_CHECKING
 
 from pathling._version import (

@@ -26,7 +25,6 @@ from pathling._version import (
     __delta_version__,
     __hadoop_version__,
 )
-from pathling.coding import Coding
 from pathling.fhir import MimeType
 
 if TYPE_CHECKING:

@@ -34,8 +32,6 @@ if TYPE_CHECKING:
 
 __all__ = ["PathlingContext"]
 
-EQ_EQUIVALENT = "equivalent"
-
 
 class StorageType:
     MEMORY: str = "memory"

@@ -356,110 +352,6 @@ class PathlingContext:
         )
     )
 
-    @deprecated(reason="You should use the 'udfs.member_of' UDF instead")
-    def member_of(
-        self,
-        df: DataFrame,
-        coding_column: Column,
-        value_set_uri: str,
-        output_column_name: str,
-    ):
-        """
-        Takes a dataframe with a Coding column as input. A new column is created which contains a
-        Boolean value, indicating whether the input Coding is a member of the specified FHIR
-        ValueSet.
-
-        :param df: a DataFrame containing the input data
-        :param coding_column: a Column containing a struct representation of a Coding
-        :param value_set_uri: an identifier for a FHIR ValueSet
-        :param output_column_name: the name of the result column
-        :return: A new dataframe with an additional column containing the result of the operation
-        """
-        return self._wrap_df(
-            self._jpc.memberOf(
-                df._jdf, coding_column._jc, value_set_uri, output_column_name
-            )
-        )
-
-    @deprecated(reason="You should use the 'udfs.translate' UDF instead")
-    def translate(
-        self,
-        df: DataFrame,
-        coding_column: Column,
-        concept_map_uri: str,
-        reverse: Optional[bool] = False,
-        equivalence: Optional[str] = EQ_EQUIVALENT,
-        target: Optional[str] = None,
-        output_column_name: Optional[str] = "result",
-    ):
-        """
-        Takes a dataframe with a Coding column as input. A new column is created which contains
-        the array of Codings value with translation targets from the specified FHIR ConceptMap.
-        There may be more than one target concept for each input concept.
-
-        :param df: a DataFrame containing the input data
-        :param coding_column: a Column containing a struct representation of a Coding
-        :param concept_map_uri: an identifier for a FHIR ConceptMap
-        :param reverse: the direction to traverse the map - false results in "source to target"
-            mappings, while true results in "target to source"
-        :param equivalence: a comma-delimited set of values from the ConceptMapEquivalence ValueSet
-        :param target: identifies the value set in which a translation is sought. If there is no
-            target specified, the server should return all known translations.
-        :param output_column_name: the name of the result column
-        :return: A new dataframe with an additional column containing the result of the operation.
-        """
-        return self._wrap_df(
-            self._jpc.translate(
-                df._jdf,
-                coding_column._jc,
-                concept_map_uri,
-                reverse,
-                equivalence,
-                target,
-                output_column_name,
-            )
-        )
-
-    @deprecated(reason="You should use the 'udfs.subsumes' UDF instead")
-    def subsumes(
-        self,
-        df: DataFrame,
-        output_column_name: str,
-        left_coding_column: Optional[Column] = None,
-        right_coding_column: Optional[Column] = None,
-        left_coding: Optional[Coding] = None,
-        right_coding: Optional[Coding] = None,
-    ):
-        """
-        Takes a dataframe with two Coding columns. A new column is created which contains a
-        Boolean value, indicating whether the left Coding subsumes the right Coding.
-
-        :param df: a DataFrame containing the input data
-        :param left_coding_column: a Column containing a struct representation of a Coding,
-            for the left-hand side of the subsumption test
-        :param right_coding_column: a Column containing a struct representation of a Coding,
-            for the right-hand side of the subsumption test
-        :param left_coding: a Coding object for the left-hand side of the subsumption test
-        :param right_coding: a Coding object for the right-hand side of the subsumption test
-        :param output_column_name: the name of the result column
-        :return: A new dataframe with an additional column containing the result of the operation.
-        """
-        if (left_coding_column is None and left_coding is None) or (
-            right_coding_column is None and right_coding is None
-        ):
-            raise ValueError(
-                "Must provide either left_coding_column or left_coding, and either "
-                "right_coding_column or right_coding"
-            )
-        left_column = left_coding.to_literal() if left_coding else left_coding_column
-        right_column = (
-            right_coding.to_literal() if right_coding else right_coding_column
-        )
-        return self._wrap_df(
-            self._jpc.subsumes(
-                df._jdf, left_column._jc, right_column._jc, output_column_name
-            )
-        )
 
     @property
     def read(self) -> "DataSources":
pathling/datasink.py

@@ -21,15 +21,6 @@ if TYPE_CHECKING:
     from pathling.datasource import DataSource
 
 
-class ImportMode:
-    """
-    Constants that represent the different import modes.
-    """
-
-    OVERWRITE: str = "overwrite"
-    MERGE: str = "merge"
-
-
 class SaveMode:
     """
     Constants that represent the different save modes.

@@ -38,12 +29,14 @@ class SaveMode:
     APPEND: Append the new data to the existing data.
     IGNORE: Only save the data if the file does not already exist.
     ERROR: Raise an error if the file already exists.
+    MERGE: Merge the new data with the existing data based on resource ID.
     """
 
     OVERWRITE: str = "overwrite"
     APPEND: str = "append"
     IGNORE: str = "ignore"
     ERROR: str = "error"
+    MERGE: str = "merge"
 
 
 class DataSinks(SparkConversionsMixin):

@@ -82,9 +75,9 @@ class DataSinks(SparkConversionsMixin):
             wrapped_mapper = StringMapper(
                 self.spark._jvm._gateway_client, file_name_mapper
             )
-            self._datasinks.ndjson(path,
+            self._datasinks.saveMode(save_mode).ndjson(path, wrapped_mapper)
         else:
-            self._datasinks.ndjson(path
+            self._datasinks.saveMode(save_mode).ndjson(path)
 
     def parquet(self, path: str, save_mode: Optional[str] = SaveMode.ERROR) -> None:
         """

@@ -97,35 +90,35 @@ class DataSinks(SparkConversionsMixin):
         - "ignore" will only save the data if the file does not already exist.
         - "error" will raise an error if the file already exists.
         """
-        self._datasinks.parquet(path
+        self._datasinks.saveMode(save_mode).parquet(path)
 
     def delta(
-        self, path: str,
+        self, path: str, save_mode: Optional[str] = SaveMode.OVERWRITE
     ) -> None:
         """
         Writes the data to a directory of Delta files.
 
         :param path: The URI of the directory to write the files to.
-        :param
+        :param save_mode: The save mode to use when writing the data - "overwrite" will
         overwrite any existing data, "merge" will merge the new data with the existing data based
         on resource ID.
         """
-        self._datasinks.delta(path
+        self._datasinks.saveMode(save_mode).delta(path)
 
     def tables(
         self,
         schema: Optional[str] = None,
-
+        save_mode: Optional[str] = SaveMode.OVERWRITE,
     ) -> None:
         """
         Writes the data to a set of tables in the Spark catalog.
 
         :param schema: The name of the schema to write the tables to.
-        :param
+        :param save_mode: The save mode to use when writing the data - "overwrite" will
        overwrite any existing data, "merge" will merge the new data with the existing data based
        on resource ID.
        """
        if schema:
-            self._datasinks.tables(
+            self._datasinks.saveMode(save_mode).tables(schema)
        else:
-            self._datasinks.tables(
+            self._datasinks.saveMode(save_mode).tables()
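With ImportMode removed, its "merge" behaviour now lives on SaveMode, and each sink call first applies saveMode(save_mode) on the Java side. A hedged sketch of how the new save_mode parameter might be used, based on the signatures above (paths are placeholders, and the availability of .write on an NDJSON-backed data source is assumed by analogy with the Delta test shown later in this diff):

    from pathling import PathlingContext

    pc = PathlingContext.create()
    data = pc.read.ndjson("/data/ndjson")  # placeholder path

    # delta() and tables() default to SaveMode.OVERWRITE; "merge" merges on resource ID.
    data.write.delta("/data/delta", save_mode="merge")

    # parquet() defaults to SaveMode.ERROR, so an explicit mode is needed to overwrite.
    data.write.parquet("/data/parquet", save_mode="overwrite")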
pathling/datasource.py

@@ -25,6 +25,7 @@ from pyspark.sql import DataFrame
 from pathling import PathlingContext
 from pathling.core import StringToStringSetMapper, SparkConversionsMixin
 from pathling.fhir import MimeType
+from pathling.spark import Dfs
 
 if TYPE_CHECKING:
     from pathling.datasink import DataSinks

@@ -109,6 +110,10 @@ class DataSources(SparkConversionsMixin):
     A factory for creating data sources.
     """
 
+    # Default extension and MIME type for NDJSON files
+    NDJSON_EXTENSION = "ndjson"
+    NDJSON_MIMETYPE = "application/fhir+ndjson"
+
     def __init__(self, pathling: PathlingContext):
         SparkConversionsMixin.__init__(self, pathling.spark)
         self._pc = pathling

@@ -120,7 +125,7 @@ class DataSources(SparkConversionsMixin):
     def ndjson(
         self,
         path,
-        extension: Optional[str] =
+        extension: Optional[str] = None,
         file_name_mapper: Callable[[str], Sequence[str]] = None,
     ) -> DataSource:
         """

@@ -134,6 +139,9 @@ class DataSources(SparkConversionsMixin):
             types that it contains.
         :return: A DataSource object that can be used to run queries against the data.
         """
+
+        extension = extension or DataSources.NDJSON_EXTENSION
+
         if file_name_mapper:
             wrapped_mapper = StringToStringSetMapper(
                 self.spark._jvm._gateway_client, file_name_mapper

@@ -221,34 +229,35 @@ class DataSources(SparkConversionsMixin):
     def bulk(
         self,
         fhir_endpoint_url: str,
-        output_dir: str,
+        output_dir: Optional[str] = None,
+        overwrite: bool = True,
         group_id: Optional[str] = None,
         patients: Optional[List[str]] = None,
-        output_format: str = "application/fhir+ndjson",
         since: Optional[datetime] = None,
         types: Optional[List[str]] = None,
         elements: Optional[List[str]] = None,
         include_associated_data: Optional[List[str]] = None,
         type_filters: Optional[List[str]] = None,
-        output_extension: str = "ndjson",
         timeout: Optional[int] = None,
         max_concurrent_downloads: int = 10,
         auth_config: Optional[Dict] = None
     ) -> DataSource:
         """
-        Creates a data source from a FHIR Bulk Data Access API endpoint.
-
+        Creates a data source from a FHIR Bulk Data Access API endpoint.
+        Currently only supports bulk export in the ndjson format.
+
         :param fhir_endpoint_url: The URL of the FHIR server to export from
-        :param output_dir: The directory to write the output files to
+        :param output_dir: The directory to write the output files to.
+            This should be a valid path in the Spark's filesystem.
+            If set to `None`, a temporary directory will be used instead.
+        :param overwrite: Whether to overwrite the output directory if it already exists. Defaults to True.
         :param group_id: Optional group ID for group-level export
         :param patients: Optional list of patient references for patient-level export
-        :param output_format: The format of the output data
         :param since: Only include resources modified after this timestamp
         :param types: List of FHIR resource types to include
         :param elements: List of FHIR elements to include
         :param include_associated_data: Pre-defined set of FHIR resources to include
         :param type_filters: FHIR search queries to filter resources
-        :param output_extension: File extension for output files. Defaults to "ndjson"
         :param timeout: Optional timeout duration in seconds
         :param max_concurrent_downloads: Maximum number of concurrent downloads. Defaults to 10
         :param auth_config: Optional authentication configuration dictionary with the following possible keys:

@@ -265,10 +274,21 @@ class DataSources(SparkConversionsMixin):
         """
         from pathling.bulk import BulkExportClient
 
+        dfs = Dfs(self._pc.spark)
+
+        # If `output_dir` is not provided, create a temporary directory
+        output_dir = output_dir or dfs.get_temp_dir_path(prefix="tmp-bulk-export", qualified=True)
+        # If `overwrite`, then ensure the output directory does not exist
+        if overwrite and dfs.exists(output_dir):
+            dfs.delete(output_dir, recursive=True)
+
+        output_format = DataSources.NDJSON_MIMETYPE
+        output_extension = DataSources.NDJSON_EXTENSION
+
         # Create appropriate client based on parameters
         if group_id is not None:
             client = BulkExportClient.for_group(
-                self.spark
+                self.spark,
                 fhir_endpoint_url=fhir_endpoint_url,
                 output_dir=output_dir,
                 group_id=group_id,

@@ -285,7 +305,7 @@ class DataSources(SparkConversionsMixin):
             )
         elif patients is not None:
             client = BulkExportClient.for_patient(
-                self.spark
+                self.spark,
                 fhir_endpoint_url=fhir_endpoint_url,
                 output_dir=output_dir,
                 patients=patients,

@@ -302,7 +322,7 @@ class DataSources(SparkConversionsMixin):
             )
         else:
             client = BulkExportClient.for_system(
-                self.spark
+                self.spark,
                 fhir_endpoint_url=fhir_endpoint_url,
                 output_dir=output_dir,
                 output_format=output_format,

@@ -318,7 +338,7 @@ class DataSources(SparkConversionsMixin):
             )
 
         # Perform the export
-
+        client.export()
 
         # Return a DataSource that reads from the exported files
         return self.ndjson(output_dir)
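bulk() now treats output_dir as optional: when it is omitted, a temporary directory is obtained from the new Dfs helper, and the overwrite flag clears an existing directory before the export runs. A sketch of both call styles, with placeholder URLs and paths (not values from the diff):

    from pathling import PathlingContext

    pc = PathlingContext.create()

    # No output_dir: the export lands in a temporary directory in Spark's filesystem.
    temp_source = pc.read.bulk(fhir_endpoint_url="https://example.org/fhir")

    # Explicit output_dir: with overwrite=True (the default) an existing directory
    # is deleted before the export starts.
    persistent_source = pc.read.bulk(
        fhir_endpoint_url="https://example.org/fhir",
        output_dir="/data/bulk-export",
        types=["Patient", "Condition"],
    )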
pathling/spark.py (new file)

@@ -0,0 +1,89 @@
+# Copyright 2025 Commonwealth Scientific and Industrial Research
+# Organisation (CSIRO) ABN 41 687 119 230.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import uuid
+
+from py4j.java_gateway import JavaObject, JVMView
+from pyspark import SparkContext
+from pyspark.sql import SparkSession
+
+
+class Dfs:
+    """A class for interacting with the Hadoop Distributed File System (HDFS) in Spark."""
+
+    def __init__(self, spark: SparkSession):
+        """
+        Initialize the Dfs class with a SparkSession.
+
+        :param spark: SparkSession instance
+        """
+        if not spark:
+            raise ValueError("SparkSession must be provided")
+        sc: SparkContext = spark.sparkContext
+        self._jvm: JVMView = sc._jvm
+        self._hadoop_conf: JavaObject = sc._jsc.hadoopConfiguration()
+        self._fs = self._jvm.org.apache.hadoop.fs.FileSystem.get(self._hadoop_conf)
+
+    def get_temp_dir_path(self, prefix: str = "tmp-app", qualified=True) -> str:
+        """
+        Returns a unique path for a temporary directory in Spark's filesystem.
+
+        The path is constructed by appending a UUID to the base temporary directory,
+        ensuring uniqueness for each call.
+        The directory itself is not created, only the path is returned.
+
+        :param prefix: String to insert between the base directory and the UUID (default: "tmp-app").
+        :param qualified: If True, returns a fully qualified Hadoop path; if False, returns a raw path string.
+        :return: String representing the unique temporary directory path.
+        """
+        base_tmp_dir = self._hadoop_conf.get("hadoop.tmp.dir")
+        if not base_tmp_dir:
+            raise ValueError("`hadoop.tmp.dir` must be set in Hadoop configuration.")
+        uuid_suffix = str(uuid.uuid4())
+        base_tmp_path = self._jvm.org.apache.hadoop.fs.Path(base_tmp_dir)
+        tmp_path = self._jvm.org.apache.hadoop.fs.Path(base_tmp_path, f"{prefix}-{uuid_suffix}")
+        return self._fs.makeQualified(tmp_path).toString() if qualified else tmp_path.toString()
+
+    def exists(self, path: str) -> bool:
+        """
+        Check if a given path exists in the filesystem.
+
+        :param path: Path to check for existence.
+        :return: True if the path exists, False otherwise.
+        """
+        hadoop_path = self._jvm.org.apache.hadoop.fs.Path(path)
+        return self._fs.exists(hadoop_path)
+
+    def delete(self, path: str, recursive: bool = False) -> bool:
+        """
+        Delete a file or directory at the specified path.
+
+        :param path: Path to the file or directory to delete.
+        :param recursive: If True, delete directories and their contents recursively.
+        :return: True if deletion was successful, False otherwise.
+        """
+        hadoop_path = self._jvm.org.apache.hadoop.fs.Path(path)
+        return self._fs.delete(hadoop_path, recursive)
+
+    def mkdirs(self, path: str) -> bool:
+        """
+        Create a directory at the specified path.
+
+        :param path: Path to the directory to create.
+        :return: True if the directory was created successfully, False otherwise.
+        """
+        hadoop_path = self._jvm.org.apache.hadoop.fs.Path(path)
+        return self._fs.mkdirs(hadoop_path)
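The new Dfs class wraps Hadoop FileSystem operations obtained through py4j, and is what bulk() uses for its temporary directories. A small sketch of its use, assuming an existing Spark session:

    from pyspark.sql import SparkSession
    from pathling.spark import Dfs

    spark = SparkSession.builder.getOrCreate()
    dfs = Dfs(spark)

    # A unique, fully qualified path under hadoop.tmp.dir; nothing is created yet.
    path = dfs.get_temp_dir_path(prefix="tmp-bulk-export", qualified=True)
    assert not dfs.exists(path)

    dfs.mkdirs(path)                  # create the directory
    dfs.delete(path, recursive=True)  # remove it again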
pathling.egg-info/SOURCES.txt

@@ -10,12 +10,9 @@ examples/encode_bundles.py
 examples/encode_resources.py
 examples/fhir_view.py
 examples/member_of.py
-examples/member_of_old.py
 examples/property_of.py
 examples/subsumes.py
-examples/subsumes_old.py
 examples/translate.py
-examples/translate_old.py
 examples/data/bundles/Bennett146_Swaniawski813_704c9750-f6e6-473b-ee83-fbd48e07fe3f.json
 examples/data/bundles/Dino214_Parisian75_40d82b80-b682-cd8b-da6d-396809878641.json
 examples/data/resources/Condition.ndjson

@@ -30,14 +27,17 @@ pathling/datasink.py
 pathling/datasource.py
 pathling/fhir.py
 pathling/functions.py
+pathling/spark.py
 pathling/udfs.py
 pathling.egg-info/PKG-INFO
 pathling.egg-info/SOURCES.txt
 pathling.egg-info/dependency_links.txt
 pathling.egg-info/requires.txt
 pathling.egg-info/top_level.txt
+tests/test_bulk.py
 tests/test_datasource.py
 tests/test_encoders.py
 tests/test_functions.py
+tests/test_spark.py
 tests/test_udfs.py
 tests/test_view.py
tests/test_bulk.py (new file)

@@ -0,0 +1,62 @@
+# Copyright 2023 Commonwealth Scientific and Industrial Research
+# Organisation (CSIRO) ABN 41 687 119 230.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import os
+
+from flask import Response
+
+from pathling.bulk import BulkExportClient
+
+
+def test_bulk_client(pathling_ctx, mock_server, temp_dir):
+    @mock_server.route("/fhir/$export", methods=["GET"])
+    def export():
+        resp = Response(status=202)
+        resp.headers["content-location"] = mock_server.url("/pool")
+        return resp
+
+    @mock_server.route("/pool", methods=["GET"])
+    def pool():
+        return dict(
+            transactionTime="1970-01-01T01:02:03.004Z",
+            output=[
+                dict(type="Patient", url=mock_server.url("/download"), count=1),
+            ],
+        )
+
+    @mock_server.route("/download", methods=["GET"])
+    def download():
+        return '{"id":"123"}'
+
+    output_dir = os.path.join(temp_dir, "export-output")
+
+    with mock_server.run():
+        result = BulkExportClient.for_system(
+            pathling_ctx.spark,
+            fhir_endpoint_url=mock_server.url("/fhir"),
+            output_dir=output_dir
+        ).export()
+
+    assert os.path.isdir(output_dir)
+    assert os.path.exists(os.path.join(output_dir, "_SUCCESS"))
+    assert os.path.exists(os.path.join(output_dir, "Patient.0000.ndjson"))
+    with open(os.path.join(output_dir, "Patient.0000.ndjson")) as f:
+        assert f.read() == '{"id":"123"}'
+    assert result.transaction_time.isoformat() == "1970-01-01T01:02:03.004000+00:00"
+    assert 1 == len(result.results)
+    file_result = result.results[0]
+    assert 12 == file_result.size
+    assert os.path.join(output_dir, "Patient.0000.ndjson") == file_result.destination
+    assert mock_server.url("/download") == file_result.source
tests/test_datasource.py

@@ -14,16 +14,27 @@
 # limitations under the License.
 
 import os
-from datetime import datetime, timezone
 from tempfile import TemporaryDirectory
-from unittest.mock import Mock, patch
 
+from flask import Response
 from pyspark.sql import Row, DataFrame
 from pytest import fixture
 
 from pathling.datasource import DataSource
 
 
+@fixture(scope="function", autouse=True)
+def func_temp_dir(temp_dir):
+    """
+    Fixture to create a temporary directory for each test function.
+    :param temp_dir:
+    :return: existing temporary directory for each test function.
+    """
+    temp_ndjson_dir = TemporaryDirectory(dir=temp_dir, prefix="function")
+    yield temp_ndjson_dir.name
+    temp_ndjson_dir.cleanup()
+
+
 @fixture(scope="module")
 def ndjson_test_data_dir(test_data_dir):
     return os.path.join(test_data_dir, "ndjson")

@@ -65,6 +76,32 @@ def temp_delta_dir(temp_dir):
     temp_delta_dir.cleanup()
 
 
+@fixture(scope="function")
+def bulk_server(mock_server, ndjson_test_data_dir):
+    @mock_server.route("/fhir/$export", methods=["GET"])
+    def export():
+        resp = Response(status=202)
+        resp.headers["content-location"] = mock_server.url("/pool")
+        return resp
+
+    @mock_server.route("/pool", methods=["GET"])
+    def pool():
+        return dict(
+            transactionTime="1970-01-01T00:00:00.000Z",
+            output=[
+                dict(type=resource, url=mock_server.url(f"/download/{resource}"), count=1) for
+                resource in ["Patient", "Condition"]
+            ],
+        )
+
+    @mock_server.route("/download/<resource>", methods=["GET"])
+    def download(resource):
+        with open(os.path.join(ndjson_test_data_dir, f"{resource}.ndjson"), "r") as f:
+            return f.read()
+
+    return mock_server
+
+
 ResultRow = Row("count")
 
 

@@ -160,7 +197,7 @@ def test_datasource_delta(delta_test_data_dir, temp_delta_dir, pathling_ctx):
 
 def test_datasource_delta_merge(delta_test_data_dir, temp_delta_dir, pathling_ctx):
     pathling_ctx.read.delta(delta_test_data_dir).write.delta(
-        temp_delta_dir,
+        temp_delta_dir, save_mode="merge"
     )
     data_source = pathling_ctx.read.delta(temp_delta_dir)
 

@@ -193,6 +230,34 @@ def test_datasource_tables_schema(ndjson_test_data_dir, pathling_ctx):
     ]
 
 
+def test_datasource_bulk_with_temp_dir(pathling_ctx, bulk_server):
+    # !!! this directory cannot exist for the datasource to work
+    with bulk_server.run():
+        data_source = pathling_ctx.read.bulk(
+            fhir_endpoint_url=bulk_server.url("/fhir")
+        )
+        result = ndjson_query(data_source)
+        assert result.columns == list(ResultRow)
+        assert result.collect() == [
+            ResultRow(71),
+        ]
+
+
+def test_datasource_bulk_with_existing_dir(pathling_ctx, bulk_server, func_temp_dir):
+    assert os.path.exists(func_temp_dir)
+    with bulk_server.run():
+        data_source = pathling_ctx.read.bulk(
+            fhir_endpoint_url=bulk_server.url("/fhir"),
+            output_dir=func_temp_dir,
+            overwrite=True  # default anyway, but explicit for clarity
+        )
+        result = ndjson_query(data_source)
+        assert result.columns == list(ResultRow)
+        assert result.collect() == [
+            ResultRow(71),
+        ]
+
+
 def ndjson_query(data_source: DataSource) -> DataFrame:
     return data_source.view(
         resource='Condition',

@@ -219,7 +284,6 @@ def bundles_query(data_source: DataSource) -> DataFrame:
     ).groupby().count()
 
 
-
 def parquet_query(data_source: DataSource) -> DataFrame:
     return ndjson_query(data_source)
 
tests/test_spark.py (new file)

@@ -0,0 +1,39 @@
+# Copyright 2023 Commonwealth Scientific and Industrial Research
+# Organisation (CSIRO) ABN 41 687 119 230.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import re
+from pathling.spark import Dfs
+
+
+def test_dfs_temp_dir(pathling_ctx):
+    dfs = Dfs(pathling_ctx.spark)
+    temp_path = dfs.get_temp_dir_path(prefix="test", qualified=True)
+    # In local setup the path should be something like:
+    # file:/tmp/hadoop-username/test-8e4756c1-46e4-44a5-b36d-d6afff1b168a
+
+    # Validate the format of the temp path using regex
+    regex_pattern = r'^file:/tmp/hadoop-[^/]+/test-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$'
+    assert re.match(regex_pattern, temp_path), f"Temp path {temp_path} does not match expected format"
+
+
+def test_dfs_operations(pathling_ctx):
+    dfs = Dfs(pathling_ctx.spark)
+    temp_path = dfs.get_temp_dir_path(prefix="test", qualified=True)
+    # Check if the temporary directory exists (it should not exist yet)
+    assert not dfs.exists(temp_path), f"Temporary path {temp_path} should not exist before creation"
+    assert dfs.mkdirs(temp_path), f"Temporary path {temp_path} can be created"
+    assert dfs.exists(temp_path), f"Temporary path {temp_path} should exist after creation"
+    dfs.delete(temp_path, recursive=True)
+    assert not dfs.exists(temp_path), f"Temporary path {temp_path} should not exist after deletion"
examples/member_of_old.py (deleted)

@@ -1,42 +0,0 @@
-# Copyright 2023 Commonwealth Scientific and Industrial Research
-# Organisation (CSIRO) ABN 41 687 119 230.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-from pathling import PathlingContext
-from pathling.functions import to_coding, to_ecl_value_set
-
-HERE = os.path.abspath(os.path.dirname(__file__))
-
-pc = PathlingContext.create()
-
-csv = pc.spark.read.options(header=True).csv(
-    f'file://{os.path.join(HERE, "data/csv/conditions.csv")}'
-)
-
-result = pc.member_of(
-    csv,
-    to_coding(csv.CODE, "http://snomed.info/sct"),
-    to_ecl_value_set(
-        """
-        << 64572001|Disease| : (
-          << 370135005|Pathological process| = << 441862004|Infectious process|,
-          << 246075003|Causative agent| = << 49872002|Virus|
-        )
-        """
-    ),
-    "VIRAL_INFECTION",
-)
-result.select("CODE", "DESCRIPTION", "VIRAL_INFECTION").show()
examples/subsumes_old.py (deleted)

@@ -1,49 +0,0 @@
-# Copyright 2023 Commonwealth Scientific and Industrial Research
-# Organisation (CSIRO) ABN 41 687 119 230.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-from pathling import PathlingContext
-from pathling.coding import Coding
-from pathling.functions import to_coding
-
-HERE = os.path.abspath(os.path.dirname(__file__))
-
-pc = PathlingContext.create()
-
-csv = pc.spark.read.options(header=True).csv(
-    f'file://{os.path.join(HERE, "data/csv/conditions.csv")}'
-)
-first_3 = csv.limit(3)
-cross_join = first_3.selectExpr(
-    "CODE as LEFT", "DESCRIPTION as LEFT_DESCRIPTION"
-).crossJoin(first_3.selectExpr("CODE as RIGHT", "DESCRIPTION as RIGHT_DESCRIPTION"))
-
-result_1 = pc.subsumes(
-    cross_join,
-    "SUBSUMES",
-    left_coding_column=to_coding(cross_join.LEFT, "http://snomed.info/sct"),
-    right_coding_column=to_coding(cross_join.RIGHT, "http://snomed.info/sct"),
-)
-result_2 = pc.subsumes(
-    result_1,
-    "LEFT_IS_ENT",
-    # 232208008 |Ear, nose and throat disorder|
-    left_coding=Coding("http://snomed.info/sct", "232208008"),
-    right_coding_column=to_coding(cross_join.LEFT, "http://snomed.info/sct"),
-)
-result_2.select(
-    "LEFT", "RIGHT", "LEFT_DESCRIPTION", "RIGHT_DESCRIPTION", "SUBSUMES", "LEFT_IS_ENT"
-).show()
examples/translate_old.py (deleted)

@@ -1,36 +0,0 @@
-# Copyright 2023 Commonwealth Scientific and Industrial Research
-# Organisation (CSIRO) ABN 41 687 119 230.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-from pathling import PathlingContext, to_coding
-
-HERE = os.path.abspath(os.path.dirname(__file__))
-
-pc = PathlingContext.create()
-
-csv = pc.spark.read.options(header=True).csv(
-    f'file://{os.path.join(HERE, "data/csv/conditions.csv")}'
-)
-
-# Translate codings to Read CTV3 using the map that ships with SNOMED CT.
-result = pc.translate(
-    csv,
-    to_coding(csv.CODE, "http://snomed.info/sct"),
-    "http://snomed.info/sct/900000000000207008?fhir_cm=900000000000497000",
-    output_column_name="READ_CODE",
-)
-result = result.withColumn("READ_CODE", result.READ_CODE.code)
-result.select("CODE", "DESCRIPTION", "READ_CODE").show()
The remaining files listed above with +0 -0 are unchanged between the two versions.