pathling 8.0.0.dev1__tar.gz → 8.0.0.dev3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. {pathling-8.0.0.dev1/pathling.egg-info → pathling-8.0.0.dev3}/PKG-INFO +1 -1
  2. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/fhir_view.py +0 -1
  3. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/_version.py +1 -1
  4. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/bulk.py +106 -30
  5. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/context.py +1 -109
  6. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/datasink.py +12 -19
  7. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/datasource.py +33 -13
  8. pathling-8.0.0.dev3/pathling/spark.py +89 -0
  9. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3/pathling.egg-info}/PKG-INFO +1 -1
  10. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling.egg-info/SOURCES.txt +3 -3
  11. pathling-8.0.0.dev3/tests/test_bulk.py +62 -0
  12. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/tests/test_datasource.py +68 -4
  13. pathling-8.0.0.dev3/tests/test_spark.py +39 -0
  14. pathling-8.0.0.dev1/examples/member_of_old.py +0 -42
  15. pathling-8.0.0.dev1/examples/subsumes_old.py +0 -49
  16. pathling-8.0.0.dev1/examples/translate_old.py +0 -36
  17. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/LICENSE +0 -0
  18. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/MANIFEST.in +0 -0
  19. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/README.md +0 -0
  20. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/bulk.py +0 -0
  21. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/data/bundles/Bennett146_Swaniawski813_704c9750-f6e6-473b-ee83-fbd48e07fe3f.json +0 -0
  22. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/data/bundles/Dino214_Parisian75_40d82b80-b682-cd8b-da6d-396809878641.json +0 -0
  23. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/data/resources/Condition.ndjson +0 -0
  24. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/data/resources/Patient.ndjson +0 -0
  25. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/designation.py +0 -0
  26. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/display.py +0 -0
  27. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/encode_bundles.py +0 -0
  28. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/encode_resources.py +0 -0
  29. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/member_of.py +0 -0
  30. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/property_of.py +0 -0
  31. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/subsumes.py +0 -0
  32. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/translate.py +0 -0
  33. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/__init__.py +0 -0
  34. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/coding.py +0 -0
  35. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/core.py +0 -0
  36. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/fhir.py +0 -0
  37. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/functions.py +0 -0
  38. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/udfs.py +0 -0
  39. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling.egg-info/dependency_links.txt +0 -0
  40. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling.egg-info/requires.txt +0 -0
  41. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling.egg-info/top_level.txt +0 -0
  42. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/setup.cfg +0 -0
  43. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/setup.py +0 -0
  44. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/tests/test_encoders.py +0 -0
  45. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/tests/test_functions.py +0 -0
  46. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/tests/test_udfs.py +0 -0
  47. {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/tests/test_view.py +0 -0
--- pathling-8.0.0.dev1/pathling.egg-info/PKG-INFO
+++ pathling-8.0.0.dev3/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pathling
-Version: 8.0.0.dev1
+Version: 8.0.0.dev3
 Summary: Python API for Pathling
 Home-page: https://github.com/aehrc/pathling
 Author: Australian e-Health Research Centre, CSIRO
--- pathling-8.0.0.dev1/examples/fhir_view.py
+++ pathling-8.0.0.dev3/examples/fhir_view.py
@@ -56,4 +56,3 @@ view_ds = datasource.view(
 )
 
 view_ds.show()
-view_ds.explain(True)
--- pathling-8.0.0.dev1/pathling/_version.py
+++ pathling-8.0.0.dev3/pathling/_version.py
@@ -2,7 +2,7 @@
 # Auto generated from POM project version.
 # Please do not modify.
 #
-__version__="8.0.0.dev1"
+__version__="8.0.0.dev3"
 __java_version__="8.0.0-SNAPSHOT"
 __scala_version__="2.12"
 __delta_version__="3.3.2"
--- pathling-8.0.0.dev1/pathling/bulk.py
+++ pathling-8.0.0.dev3/pathling/bulk.py
@@ -13,16 +13,80 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from datetime import datetime
-from typing import List, Optional
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from typing import List, Optional, Tuple, Callable
 
-from pathling import PathlingContext
+from py4j.java_gateway import JavaObject, JVMView
+from pyspark.sql import SparkSession
+
+
+@dataclass
+class FileResult:
+    """
+    Represents the result of a single file export operation.
+    """
+    source: str
+    """
+    The source URL of the exported file.
+    """
+    destination: str
+    """
+    The destination URL where the file was saved.
+    """
+    size: int
+    """
+    The size of the exported file in bytes.
+    """
+
+
+@dataclass
+class ExportResult:
+    """
+    Represents the result of a bulk export operation.
+    """
+    transaction_time: datetime
+    """
+    The time at which the transaction was processed at the server.
+    Corresponds to `transactionTime` in the bulk export response.
+    """
+    results: List[FileResult]
+    """
+    A list of FileResult objects representing the exported files.
+    """
+
+    @classmethod
+    def from_java(cls, java_result: JavaObject) -> 'ExportResult':
+        """
+        Create an ExportResult from a Java export result object.
+
+        :param java_result: The Java export result object
+        :return: A Python ExportResult object
+        """
+        # Convert transaction time from Java Instant to Python datetime
+        transaction_time = datetime.fromtimestamp(
+            java_result.getTransactionTime().toEpochMilli() / 1000.0, tz=timezone.utc)
+
+        # Convert file results
+        file_results = [
+            FileResult(
+                source=str(java_file_result.getSource()),
+                destination=str(java_file_result.getDestination()),
+                size=java_file_result.getSize())
+            for java_file_result in java_result.getResults()
+        ]
+
+        return cls(
+            transaction_time=transaction_time,
+            results=file_results
+        )
 
 
 class BulkExportClient:
     """
     A client for exporting data from the FHIR Bulk Data Access API.
     """
+
     def __init__(self, java_client):
         """
         Create a new BulkExportClient that wraps a Java BulkExportClient.
@@ -31,26 +95,27 @@ class BulkExportClient:
         """
         self._java_client = java_client
 
-    def export(self):
+    def export(self) -> ExportResult:
         """
         Export data from the FHIR server.
 
-        :return: The result of the export operation
+        :return: The result of the export operation as a Python ExportResult object
         """
-        return self._java_client.export()
+        java_result = self._java_client.export()
+        return ExportResult.from_java(java_result)
 
     @classmethod
     def _configure_builder(cls, jvm, builder, fhir_endpoint_url: str, output_dir: str,
-                           output_format: str = "application/fhir+ndjson",
-                           since: Optional[datetime] = None,
-                           types: Optional[List[str]] = None,
-                           elements: Optional[List[str]] = None,
-                           include_associated_data: Optional[List[str]] = None,
-                           type_filters: Optional[List[str]] = None,
-                           output_extension: str = "ndjson",
-                           timeout: Optional[int] = None,
-                           max_concurrent_downloads: int = 10,
-                           auth_config: Optional[dict] = None):
+                               output_format: str = "application/fhir+ndjson",
+                               since: Optional[datetime] = None,
+                               types: Optional[List[str]] = None,
+                               elements: Optional[List[str]] = None,
+                               include_associated_data: Optional[List[str]] = None,
+                               type_filters: Optional[List[str]] = None,
+                               output_extension: str = "ndjson",
+                               timeout: Optional[int] = None,
+                               max_concurrent_downloads: int = 10,
+                               auth_config: Optional[dict] = None):
         """
         Configure common builder parameters.
 
@@ -119,7 +184,7 @@ class BulkExportClient:
 
         if auth_config is not None:
             auth_builder = jvm.au.csiro.fhir.auth.AuthConfig.builder()
-
+
             # Set defaults to match Java class
             auth_builder.enabled(False)
             auth_builder.useSMART(True)
@@ -150,11 +215,11 @@ class BulkExportClient:
             builder.withAuthConfig(auth_config_obj)
 
     @classmethod
-    def for_system(cls, jvm, *args, **kwargs) -> 'BulkExportClient':
+    def for_system(cls, spark, *args, **kwargs) -> 'BulkExportClient':
        """
        Create a builder for a system-level export.
 
-        :param jvm: The JVM instance
+        :param spark: The SparkSession instance
        :param fhir_endpoint_url: The URL of the FHIR server to export from
        :param output_dir: The directory to write the output files to
        :param output_format: The format of the output data
@@ -169,18 +234,17 @@ class BulkExportClient:
        :param auth_config: Optional authentication configuration dictionary
        :return: A BulkExportClient configured for system-level export
        """
-        client_class = jvm.au.csiro.fhir.export.BulkExportClient
-        builder = client_class.systemBuilder()  # Returns a builder directly
+        builder, jvm = cls._create_builder(spark, lambda bc: bc.systemBuilder())
        cls._configure_builder(jvm, builder, *args, **kwargs)
        return cls(builder.build())
 
-    @classmethod
-    def for_group(cls, jvm, fhir_endpoint_url: str, output_dir: str,
+    @classmethod
+    def for_group(cls, spark, fhir_endpoint_url: str, output_dir: str,
                  group_id: str, *args, **kwargs) -> 'BulkExportClient':
        """
        Create a builder for a group-level export.
 
-        :param jvm: The JVM instance
+        :param spark: The SparkSession instance
        :param fhir_endpoint_url: The URL of the FHIR server to export from
        :param output_dir: The directory to write the output files to
        :param group_id: The ID of the group to export
@@ -196,19 +260,18 @@ class BulkExportClient:
        :param auth_config: Optional authentication configuration dictionary
        :return: A BulkExportClient configured for group-level export
        """
-        client_class = jvm.au.csiro.fhir.export.BulkExportClient
        # Pass group_id directly to groupBuilder
-        builder = client_class.groupBuilder(group_id)
+        builder, jvm = cls._create_builder(spark, lambda bc: bc.groupBuilder(group_id))
        cls._configure_builder(jvm, builder, fhir_endpoint_url, output_dir, *args, **kwargs)
        return cls(builder.build())
 
     @classmethod
-    def for_patient(cls, jvm, fhir_endpoint_url: str, output_dir: str,
+    def for_patient(cls, spark, fhir_endpoint_url: str, output_dir: str,
                    patients: Optional[List[str]] = None, *args, **kwargs) -> 'BulkExportClient':
        """
        Create a builder for a patient-level export.
 
-        :param jvm: The JVM instance
+        :param spark: The SparkSession instance
        :param fhir_endpoint_url: The URL of the FHIR server to export from
        :param output_dir: The directory to write the output files to
        :param patients: List of patient references to include
@@ -224,11 +287,24 @@ class BulkExportClient:
        :param auth_config: Optional authentication configuration dictionary
        :return: A BulkExportClient configured for patient-level export
        """
-        client_class = jvm.au.csiro.fhir.export.BulkExportClient
-        builder = client_class.patientBuilder()  # Returns a builder directly
+        builder, jvm = cls._create_builder(spark, lambda bc: bc.patientBuilder())
        if patients is not None:
            for patient in patients:
                ref = jvm.au.csiro.fhir.model.Reference.of(patient)
                builder.withPatient(ref)
        cls._configure_builder(jvm, builder, fhir_endpoint_url, output_dir, *args, **kwargs)
        return cls(builder.build())
+
+    @classmethod
+    def _create_builder(cls,
+                        spark: SparkSession,
+                        factory_f: Callable[[JavaObject], JavaObject]) -> Tuple[
+        JavaObject, JVMView]:
+
+        jvm: JVMView = spark._jvm
+        client_class = jvm.au.csiro.fhir.export.BulkExportClient
+        builder: JavaObject = factory_f(client_class)
+        builder = builder.withFileStoreFactory(
+            jvm.au.csiro.filestore.hdfs.HdfsFileStoreFactory(spark._jsc.sc().hadoopConfiguration())
+        )
+        return (builder, jvm)
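
Taken together, the bulk.py changes mean the builder factories now accept a SparkSession rather than a raw JVM handle, and export() returns a typed ExportResult instead of a Java object. A minimal sketch of the new calling convention, modelled on tests/test_bulk.py (shown later in this diff); the endpoint URL and output directory below are placeholders, not values from the package:

    from pathling import PathlingContext
    from pathling.bulk import BulkExportClient

    pc = PathlingContext.create()

    # System-level export; the URL and output directory are illustrative only.
    result = BulkExportClient.for_system(
        pc.spark,
        fhir_endpoint_url="https://example.org/fhir",
        output_dir="/tmp/bulk-export",
    ).export()

    # ExportResult is a dataclass: a transaction time plus one FileResult per downloaded file.
    print(result.transaction_time.isoformat())
    for file_result in result.results:
        print(file_result.source, "->", file_result.destination, file_result.size)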
--- pathling-8.0.0.dev1/pathling/context.py
+++ pathling-8.0.0.dev3/pathling/context.py
@@ -15,9 +15,8 @@
 
 # noinspection PyPackageRequirements
 
-from deprecated import deprecated
 from py4j.java_gateway import JavaObject
-from pyspark.sql import DataFrame, SparkSession, Column
+from pyspark.sql import DataFrame, SparkSession
 from typing import Optional, Sequence, TYPE_CHECKING
 
 from pathling._version import (
@@ -26,7 +25,6 @@ from pathling._version import (
     __delta_version__,
     __hadoop_version__,
 )
-from pathling.coding import Coding
 from pathling.fhir import MimeType
 
 if TYPE_CHECKING:
@@ -34,8 +32,6 @@ if TYPE_CHECKING:
 
 __all__ = ["PathlingContext"]
 
-EQ_EQUIVALENT = "equivalent"
-
 
 class StorageType:
     MEMORY: str = "memory"
@@ -356,110 +352,6 @@ class PathlingContext:
             )
         )
 
-    @deprecated(reason="You should use the 'udfs.member_of' UDF instead")
-    def member_of(
-        self,
-        df: DataFrame,
-        coding_column: Column,
-        value_set_uri: str,
-        output_column_name: str,
-    ):
-        """
-        Takes a dataframe with a Coding column as input. A new column is created which contains a
-        Boolean value, indicating whether the input Coding is a member of the specified FHIR
-        ValueSet.
-
-        :param df: a DataFrame containing the input data
-        :param coding_column: a Column containing a struct representation of a Coding
-        :param value_set_uri: an identifier for a FHIR ValueSet
-        :param output_column_name: the name of the result column
-        :return: A new dataframe with an additional column containing the result of the operation
-        """
-        return self._wrap_df(
-            self._jpc.memberOf(
-                df._jdf, coding_column._jc, value_set_uri, output_column_name
-            )
-        )
-
-    @deprecated(reason="You should use the 'udfs.translate' UDF instead")
-    def translate(
-        self,
-        df: DataFrame,
-        coding_column: Column,
-        concept_map_uri: str,
-        reverse: Optional[bool] = False,
-        equivalence: Optional[str] = EQ_EQUIVALENT,
-        target: Optional[str] = None,
-        output_column_name: Optional[str] = "result",
-    ):
-        """
-        Takes a dataframe with a Coding column as input. A new column is created which contains
-        the array of Codings value with translation targets from the specified FHIR ConceptMap.
-        There may be more than one target concept for each input concept.
-
-        :param df: a DataFrame containing the input data
-        :param coding_column: a Column containing a struct representation of a Coding
-        :param concept_map_uri: an identifier for a FHIR ConceptMap
-        :param reverse: the direction to traverse the map - false results in "source to target"
-            mappings, while true results in "target to source"
-        :param equivalence: a comma-delimited set of values from the ConceptMapEquivalence ValueSet
-        :param target: identifies the value set in which a translation is sought. If there is no
-            target specified, the server should return all known translations.
-        :param output_column_name: the name of the result column
-        :return: A new dataframe with an additional column containing the result of the operation.
-        """
-        return self._wrap_df(
-            self._jpc.translate(
-                df._jdf,
-                coding_column._jc,
-                concept_map_uri,
-                reverse,
-                equivalence,
-                target,
-                output_column_name,
-            )
-        )
-
-    @deprecated(reason="You should use the 'udfs.subsumes' UDF instead")
-    def subsumes(
-        self,
-        df: DataFrame,
-        output_column_name: str,
-        left_coding_column: Optional[Column] = None,
-        right_coding_column: Optional[Column] = None,
-        left_coding: Optional[Coding] = None,
-        right_coding: Optional[Coding] = None,
-    ):
-        """
-        Takes a dataframe with two Coding columns. A new column is created which contains a
-        Boolean value, indicating whether the left Coding subsumes the right Coding.
-
-        :param df: a DataFrame containing the input data
-        :param left_coding_column: a Column containing a struct representation of a Coding,
-            for the left-hand side of the subsumption test
-        :param right_coding_column: a Column containing a struct representation of a Coding,
-            for the right-hand side of the subsumption test
-        :param left_coding: a Coding object for the left-hand side of the subsumption test
-        :param right_coding: a Coding object for the right-hand side of the subsumption test
-        :param output_column_name: the name of the result column
-        :return: A new dataframe with an additional column containing the result of the operation.
-        """
-        if (left_coding_column is None and left_coding is None) or (
-            right_coding_column is None and right_coding is None
-        ):
-            raise ValueError(
-                "Must provide either left_coding_column or left_coding, and either "
-                "right_coding_column or right_coding"
-            )
-        left_column = left_coding.to_literal() if left_coding else left_coding_column
-        right_column = (
-            right_coding.to_literal() if right_coding else right_coding_column
-        )
-        return self._wrap_df(
-            self._jpc.subsumes(
-                df._jdf, left_column._jc, right_column._jc, output_column_name
-            )
-        )
 
     @property
     def read(self) -> "DataSources":
--- pathling-8.0.0.dev1/pathling/datasink.py
+++ pathling-8.0.0.dev3/pathling/datasink.py
@@ -21,15 +21,6 @@ if TYPE_CHECKING:
     from pathling.datasource import DataSource
 
 
-class ImportMode:
-    """
-    Constants that represent the different import modes.
-    """
-
-    OVERWRITE: str = "overwrite"
-    MERGE: str = "merge"
-
-
 class SaveMode:
     """
     Constants that represent the different save modes.
@@ -38,12 +29,14 @@ class SaveMode:
     APPEND: Append the new data to the existing data.
     IGNORE: Only save the data if the file does not already exist.
     ERROR: Raise an error if the file already exists.
+    MERGE: Merge the new data with the existing data based on resource ID.
     """
 
     OVERWRITE: str = "overwrite"
     APPEND: str = "append"
     IGNORE: str = "ignore"
     ERROR: str = "error"
+    MERGE: str = "merge"
 
 
 class DataSinks(SparkConversionsMixin):
@@ -82,9 +75,9 @@ class DataSinks(SparkConversionsMixin):
             wrapped_mapper = StringMapper(
                 self.spark._jvm._gateway_client, file_name_mapper
             )
-            self._datasinks.ndjson(path, save_mode, wrapped_mapper)
+            self._datasinks.saveMode(save_mode).ndjson(path, wrapped_mapper)
         else:
-            self._datasinks.ndjson(path, save_mode)
+            self._datasinks.saveMode(save_mode).ndjson(path)
 
     def parquet(self, path: str, save_mode: Optional[str] = SaveMode.ERROR) -> None:
         """
@@ -97,35 +90,35 @@ class DataSinks(SparkConversionsMixin):
         - "ignore" will only save the data if the file does not already exist.
         - "error" will raise an error if the file already exists.
         """
-        self._datasinks.parquet(path, save_mode)
+        self._datasinks.saveMode(save_mode).parquet(path)
 
     def delta(
-        self, path: str, import_mode: Optional[str] = ImportMode.OVERWRITE
+        self, path: str, save_mode: Optional[str] = SaveMode.OVERWRITE
     ) -> None:
         """
         Writes the data to a directory of Delta files.
 
         :param path: The URI of the directory to write the files to.
-        :param import_mode: The import mode to use when writing the data - "overwrite" will
+        :param save_mode: The save mode to use when writing the data - "overwrite" will
          overwrite any existing data, "merge" will merge the new data with the existing data based
          on resource ID.
         """
-        self._datasinks.delta(path, import_mode)
+        self._datasinks.saveMode(save_mode).delta(path)
 
     def tables(
         self,
         schema: Optional[str] = None,
-        import_mode: Optional[str] = ImportMode.OVERWRITE,
+        save_mode: Optional[str] = SaveMode.OVERWRITE,
     ) -> None:
         """
         Writes the data to a set of tables in the Spark catalog.
 
         :param schema: The name of the schema to write the tables to.
-        :param import_mode: The import mode to use when writing the data - "overwrite" will
+        :param save_mode: The save mode to use when writing the data - "overwrite" will
          overwrite any existing data, "merge" will merge the new data with the existing data based
          on resource ID.
         """
         if schema:
-            self._datasinks.tables(import_mode, schema)
+            self._datasinks.saveMode(save_mode).tables(schema)
         else:
-            self._datasinks.tables(import_mode)
+            self._datasinks.saveMode(save_mode).tables()
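
These datasink.py changes fold the old ImportMode constants into SaveMode and route every output format through saveMode(...) on the Java side, so a merge write is now requested via the save_mode argument. A short sketch of the Python-facing call, mirroring test_datasource_delta_merge further below in this diff; the paths are placeholders:

    from pathling import PathlingContext
    from pathling.datasink import SaveMode

    pc = PathlingContext.create()
    data = pc.read.ndjson("/tmp/fhir-ndjson")  # placeholder input path

    # "merge" now lives on SaveMode; the removed ImportMode class is no longer needed.
    data.write.delta("/tmp/fhir-delta", save_mode=SaveMode.MERGE)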
--- pathling-8.0.0.dev1/pathling/datasource.py
+++ pathling-8.0.0.dev3/pathling/datasource.py
@@ -25,6 +25,7 @@ from pyspark.sql import DataFrame
 from pathling import PathlingContext
 from pathling.core import StringToStringSetMapper, SparkConversionsMixin
 from pathling.fhir import MimeType
+from pathling.spark import Dfs
 
 if TYPE_CHECKING:
     from pathling.datasink import DataSinks
@@ -109,6 +110,10 @@ class DataSources(SparkConversionsMixin):
     A factory for creating data sources.
     """
 
+    # Default extension and MIME type for NDJSON files
+    NDJSON_EXTENSION = "ndjson"
+    NDJSON_MIMETYPE = "application/fhir+ndjson"
+
     def __init__(self, pathling: PathlingContext):
         SparkConversionsMixin.__init__(self, pathling.spark)
         self._pc = pathling
@@ -120,7 +125,7 @@ class DataSources(SparkConversionsMixin):
     def ndjson(
         self,
         path,
-        extension: Optional[str] = "ndjson",
+        extension: Optional[str] = None,
         file_name_mapper: Callable[[str], Sequence[str]] = None,
     ) -> DataSource:
         """
@@ -134,6 +139,9 @@ class DataSources(SparkConversionsMixin):
         types that it contains.
         :return: A DataSource object that can be used to run queries against the data.
         """
+
+        extension = extension or DataSources.NDJSON_EXTENSION
+
         if file_name_mapper:
             wrapped_mapper = StringToStringSetMapper(
                 self.spark._jvm._gateway_client, file_name_mapper
@@ -221,34 +229,35 @@ class DataSources(SparkConversionsMixin):
     def bulk(
         self,
         fhir_endpoint_url: str,
-        output_dir: str,
+        output_dir: Optional[str] = None,
+        overwrite: bool = True,
         group_id: Optional[str] = None,
         patients: Optional[List[str]] = None,
-        output_format: str = "application/fhir+ndjson",
         since: Optional[datetime] = None,
         types: Optional[List[str]] = None,
         elements: Optional[List[str]] = None,
         include_associated_data: Optional[List[str]] = None,
         type_filters: Optional[List[str]] = None,
-        output_extension: str = "ndjson",
         timeout: Optional[int] = None,
         max_concurrent_downloads: int = 10,
         auth_config: Optional[Dict] = None
     ) -> DataSource:
         """
-        Creates a data source from a FHIR Bulk Data Access API endpoint.
-
+        Creates a data source from a FHIR Bulk Data Access API endpoint.
+        Currently only supports bulk export in the ndjson format.
+
         :param fhir_endpoint_url: The URL of the FHIR server to export from
-        :param output_dir: The directory to write the output files to
+        :param output_dir: The directory to write the output files to.
+            This should be a valid path in the Spark's filesystem.
+            If set to `None`, a temporary directory will be used instead.
+        :param overwrite: Whether to overwrite the output directory if it already exists. Defaults to True.
         :param group_id: Optional group ID for group-level export
         :param patients: Optional list of patient references for patient-level export
-        :param output_format: The format of the output data
         :param since: Only include resources modified after this timestamp
         :param types: List of FHIR resource types to include
         :param elements: List of FHIR elements to include
         :param include_associated_data: Pre-defined set of FHIR resources to include
         :param type_filters: FHIR search queries to filter resources
-        :param output_extension: File extension for output files. Defaults to "ndjson"
         :param timeout: Optional timeout duration in seconds
         :param max_concurrent_downloads: Maximum number of concurrent downloads. Defaults to 10
         :param auth_config: Optional authentication configuration dictionary with the following possible keys:
@@ -265,10 +274,21 @@ class DataSources(SparkConversionsMixin):
         """
         from pathling.bulk import BulkExportClient
 
+        dfs = Dfs(self._pc.spark)
+
+        # If `output_dir` is not provided, create a temporary directory
+        output_dir = output_dir or dfs.get_temp_dir_path(prefix="tmp-bulk-export", qualified=True)
+        # If `overwrite`, then ensure the output directory does not exist
+        if overwrite and dfs.exists(output_dir):
+            dfs.delete(output_dir, recursive=True)
+
+        output_format = DataSources.NDJSON_MIMETYPE
+        output_extension = DataSources.NDJSON_EXTENSION
+
         # Create appropriate client based on parameters
         if group_id is not None:
             client = BulkExportClient.for_group(
-                self.spark._jvm,
+                self.spark,
                 fhir_endpoint_url=fhir_endpoint_url,
                 output_dir=output_dir,
                 group_id=group_id,
@@ -285,7 +305,7 @@ class DataSources(SparkConversionsMixin):
             )
         elif patients is not None:
             client = BulkExportClient.for_patient(
-                self.spark._jvm,
+                self.spark,
                 fhir_endpoint_url=fhir_endpoint_url,
                 output_dir=output_dir,
                 patients=patients,
@@ -302,7 +322,7 @@ class DataSources(SparkConversionsMixin):
             )
         else:
             client = BulkExportClient.for_system(
-                self.spark._jvm,
+                self.spark,
                 fhir_endpoint_url=fhir_endpoint_url,
                 output_dir=output_dir,
                 output_format=output_format,
@@ -318,7 +338,7 @@ class DataSources(SparkConversionsMixin):
             )
 
         # Perform the export
-        result = client.export()
+        client.export()
 
         # Return a DataSource that reads from the exported files
         return self.ndjson(output_dir)
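
With output_dir now optional and the format pinned to NDJSON, a data source can be created directly from a Bulk Data endpoint and is read back via the same ndjson() path shown above. A minimal sketch following the pattern in the new tests; the endpoint URL, directory, and group ID are placeholders:

    from pathling import PathlingContext

    pc = PathlingContext.create()

    # System-level export; with output_dir omitted, a temporary directory under
    # Spark's filesystem is used and then read back as NDJSON.
    data = pc.read.bulk(fhir_endpoint_url="https://example.org/fhir")

    # Group-level export into an explicit directory, overwriting any previous run.
    group_data = pc.read.bulk(
        fhir_endpoint_url="https://example.org/fhir",
        output_dir="/tmp/bulk-group",
        group_id="example-group",
        overwrite=True,
    )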
--- /dev/null
+++ pathling-8.0.0.dev3/pathling/spark.py
@@ -0,0 +1,89 @@
+# Copyright 2025 Commonwealth Scientific and Industrial Research
+# Organisation (CSIRO) ABN 41 687 119 230.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import uuid
+
+from py4j.java_gateway import JavaObject, JVMView
+from pyspark import SparkContext
+from pyspark.sql import SparkSession
+
+
+class Dfs:
+    """A class for interacting with the Hadoop Distributed File System (HDFS) in Spark."""
+
+    def __init__(self, spark: SparkSession):
+        """
+        Initialize the Dfs class with a SparkSession.
+
+        :param spark: SparkSession instance
+        """
+        if not spark:
+            raise ValueError("SparkSession must be provided")
+        sc: SparkContext = spark.sparkContext
+        self._jvm: JVMView = sc._jvm
+        self._hadoop_conf: JavaObject = sc._jsc.hadoopConfiguration()
+        self._fs = self._jvm.org.apache.hadoop.fs.FileSystem.get(self._hadoop_conf)
+
+    def get_temp_dir_path(self, prefix: str = "tmp-app", qualified=True) -> str:
+        """
+        Returns a unique path for a temporary directory in Spark's filesystem.
+
+        The path is constructed by appending a UUID to the base temporary directory,
+        ensuring uniqueness for each call.
+        The directory itself is not created, only the path is returned.
+
+        :param prefix: String to insert between the base directory and the UUID (default: "tmp-app").
+        :param qualified: If True, returns a fully qualified Hadoop path; if False, returns a raw path string.
+        :return: String representing the unique temporary directory path.
+        """
+        base_tmp_dir = self._hadoop_conf.get("hadoop.tmp.dir")
+        if not base_tmp_dir:
+            raise ValueError("`hadoop.tmp.dir` must be set in Hadoop configuration.")
+        uuid_suffix = str(uuid.uuid4())
+        base_tmp_path = self._jvm.org.apache.hadoop.fs.Path(base_tmp_dir)
+        tmp_path = self._jvm.org.apache.hadoop.fs.Path(base_tmp_path, f"{prefix}-{uuid_suffix}")
+        return self._fs.makeQualified(tmp_path).toString() if qualified else tmp_path.toString()
+
+    def exists(self, path: str) -> bool:
+        """
+        Check if a given path exists in the filesystem.
+
+        :param path: Path to check for existence.
+        :return: True if the path exists, False otherwise.
+        """
+        hadoop_path = self._jvm.org.apache.hadoop.fs.Path(path)
+        return self._fs.exists(hadoop_path)
+
+    def delete(self, path: str, recursive: bool = False) -> bool:
+        """
+        Delete a file or directory at the specified path.
+
+        :param path: Path to the file or directory to delete.
+        :param recursive: If True, delete directories and their contents recursively.
+        :return: True if deletion was successful, False otherwise.
+        """
+        hadoop_path = self._jvm.org.apache.hadoop.fs.Path(path)
+        return self._fs.delete(hadoop_path, recursive)
+
+    def mkdirs(self, path: str) -> bool:
+        """
+        Create a directory at the specified path.
+
+        :param path: Path to the directory to create.
+        :return: True if the directory was created successfully, False otherwise.
+        """
+        hadoop_path = self._jvm.org.apache.hadoop.fs.Path(path)
+        return self._fs.mkdirs(hadoop_path)
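
The new Dfs helper wraps Hadoop FileSystem operations through py4j; tests/test_spark.py below exercises it directly. A short sketch of standalone usage, assuming a local Spark session created via PathlingContext:

    from pathling import PathlingContext
    from pathling.spark import Dfs

    pc = PathlingContext.create()
    dfs = Dfs(pc.spark)

    # Reserve a unique path under hadoop.tmp.dir without creating it yet.
    path = dfs.get_temp_dir_path(prefix="example", qualified=True)
    assert not dfs.exists(path)

    dfs.mkdirs(path)                   # create the directory
    dfs.delete(path, recursive=True)   # and clean it up again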
--- pathling-8.0.0.dev1/PKG-INFO
+++ pathling-8.0.0.dev3/pathling.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pathling
-Version: 8.0.0.dev1
+Version: 8.0.0.dev3
 Summary: Python API for Pathling
 Home-page: https://github.com/aehrc/pathling
 Author: Australian e-Health Research Centre, CSIRO
--- pathling-8.0.0.dev1/pathling.egg-info/SOURCES.txt
+++ pathling-8.0.0.dev3/pathling.egg-info/SOURCES.txt
@@ -10,12 +10,9 @@ examples/encode_bundles.py
 examples/encode_resources.py
 examples/fhir_view.py
 examples/member_of.py
-examples/member_of_old.py
 examples/property_of.py
 examples/subsumes.py
-examples/subsumes_old.py
 examples/translate.py
-examples/translate_old.py
 examples/data/bundles/Bennett146_Swaniawski813_704c9750-f6e6-473b-ee83-fbd48e07fe3f.json
 examples/data/bundles/Dino214_Parisian75_40d82b80-b682-cd8b-da6d-396809878641.json
 examples/data/resources/Condition.ndjson
@@ -30,14 +27,17 @@ pathling/datasink.py
 pathling/datasource.py
 pathling/fhir.py
 pathling/functions.py
+pathling/spark.py
 pathling/udfs.py
 pathling.egg-info/PKG-INFO
 pathling.egg-info/SOURCES.txt
 pathling.egg-info/dependency_links.txt
 pathling.egg-info/requires.txt
 pathling.egg-info/top_level.txt
+tests/test_bulk.py
 tests/test_datasource.py
 tests/test_encoders.py
 tests/test_functions.py
+tests/test_spark.py
 tests/test_udfs.py
 tests/test_view.py
--- /dev/null
+++ pathling-8.0.0.dev3/tests/test_bulk.py
@@ -0,0 +1,62 @@
+# Copyright 2023 Commonwealth Scientific and Industrial Research
+# Organisation (CSIRO) ABN 41 687 119 230.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import os
+
+from flask import Response
+
+from pathling.bulk import BulkExportClient
+
+
+def test_bulk_client(pathling_ctx, mock_server, temp_dir):
+    @mock_server.route("/fhir/$export", methods=["GET"])
+    def export():
+        resp = Response(status=202)
+        resp.headers["content-location"] = mock_server.url("/pool")
+        return resp
+
+    @mock_server.route("/pool", methods=["GET"])
+    def pool():
+        return dict(
+            transactionTime="1970-01-01T01:02:03.004Z",
+            output=[
+                dict(type="Patient", url=mock_server.url("/download"), count=1),
+            ],
+        )
+
+    @mock_server.route("/download", methods=["GET"])
+    def download():
+        return '{"id":"123"}'
+
+    output_dir = os.path.join(temp_dir, "export-output")
+
+    with mock_server.run():
+        result = BulkExportClient.for_system(
+            pathling_ctx.spark,
+            fhir_endpoint_url=mock_server.url("/fhir"),
+            output_dir=output_dir
+        ).export()
+
+    assert os.path.isdir(output_dir)
+    assert os.path.exists(os.path.join(output_dir, "_SUCCESS"))
+    assert os.path.exists(os.path.join(output_dir, "Patient.0000.ndjson"))
+    with open(os.path.join(output_dir, "Patient.0000.ndjson")) as f:
+        assert f.read() == '{"id":"123"}'
+    assert result.transaction_time.isoformat() == "1970-01-01T01:02:03.004000+00:00"
+    assert 1 == len(result.results)
+    file_result = result.results[0]
+    assert 12 == file_result.size
+    assert os.path.join(output_dir, "Patient.0000.ndjson") == file_result.destination
+    assert mock_server.url("/download") == file_result.source
--- pathling-8.0.0.dev1/tests/test_datasource.py
+++ pathling-8.0.0.dev3/tests/test_datasource.py
@@ -14,16 +14,27 @@
 # limitations under the License.
 
 import os
-from datetime import datetime, timezone
 from tempfile import TemporaryDirectory
-from unittest.mock import Mock, patch
 
+from flask import Response
 from pyspark.sql import Row, DataFrame
 from pytest import fixture
 
 from pathling.datasource import DataSource
 
 
+@fixture(scope="function", autouse=True)
+def func_temp_dir(temp_dir):
+    """
+    Fixture to create a temporary directory for each test function.
+    :param temp_dir:
+    :return: existing temporary directory for each test function.
+    """
+    temp_ndjson_dir = TemporaryDirectory(dir=temp_dir, prefix="function")
+    yield temp_ndjson_dir.name
+    temp_ndjson_dir.cleanup()
+
+
 @fixture(scope="module")
 def ndjson_test_data_dir(test_data_dir):
     return os.path.join(test_data_dir, "ndjson")
@@ -65,6 +76,32 @@ def temp_delta_dir(temp_dir):
     temp_delta_dir.cleanup()
 
 
+@fixture(scope="function")
+def bulk_server(mock_server, ndjson_test_data_dir):
+    @mock_server.route("/fhir/$export", methods=["GET"])
+    def export():
+        resp = Response(status=202)
+        resp.headers["content-location"] = mock_server.url("/pool")
+        return resp
+
+    @mock_server.route("/pool", methods=["GET"])
+    def pool():
+        return dict(
+            transactionTime="1970-01-01T00:00:00.000Z",
+            output=[
+                dict(type=resource, url=mock_server.url(f"/download/{resource}"), count=1) for
+                resource in ["Patient", "Condition"]
+            ],
+        )
+
+    @mock_server.route("/download/<resource>", methods=["GET"])
+    def download(resource):
+        with open(os.path.join(ndjson_test_data_dir, f"{resource}.ndjson"), "r") as f:
+            return f.read()
+
+    return mock_server
+
+
 ResultRow = Row("count")
 
 
@@ -160,7 +197,7 @@ def test_datasource_delta(delta_test_data_dir, temp_delta_dir, pathling_ctx):
 
 def test_datasource_delta_merge(delta_test_data_dir, temp_delta_dir, pathling_ctx):
     pathling_ctx.read.delta(delta_test_data_dir).write.delta(
-        temp_delta_dir, import_mode="merge"
+        temp_delta_dir, save_mode="merge"
     )
     data_source = pathling_ctx.read.delta(temp_delta_dir)
 
@@ -193,6 +230,34 @@ def test_datasource_tables_schema(ndjson_test_data_dir, pathling_ctx):
     ]
 
 
+def test_datasource_bulk_with_temp_dir(pathling_ctx, bulk_server):
+    # !!! this directory cannot exist for the datasource to work
+    with bulk_server.run():
+        data_source = pathling_ctx.read.bulk(
+            fhir_endpoint_url=bulk_server.url("/fhir")
+        )
+        result = ndjson_query(data_source)
+        assert result.columns == list(ResultRow)
+        assert result.collect() == [
+            ResultRow(71),
+        ]
+
+
+def test_datasource_bulk_with_existing_dir(pathling_ctx, bulk_server, func_temp_dir):
+    assert os.path.exists(func_temp_dir)
+    with bulk_server.run():
+        data_source = pathling_ctx.read.bulk(
+            fhir_endpoint_url=bulk_server.url("/fhir"),
+            output_dir=func_temp_dir,
+            overwrite=True  # default anyway, but explicit for clarity
+        )
+        result = ndjson_query(data_source)
+        assert result.columns == list(ResultRow)
+        assert result.collect() == [
+            ResultRow(71),
+        ]
+
+
 def ndjson_query(data_source: DataSource) -> DataFrame:
     return data_source.view(
         resource='Condition',
@@ -219,7 +284,6 @@ def bundles_query(data_source: DataSource) -> DataFrame:
     ).groupby().count()
 
 
-
 
 def parquet_query(data_source: DataSource) -> DataFrame:
     return ndjson_query(data_source)
--- /dev/null
+++ pathling-8.0.0.dev3/tests/test_spark.py
@@ -0,0 +1,39 @@
+# Copyright 2023 Commonwealth Scientific and Industrial Research
+# Organisation (CSIRO) ABN 41 687 119 230.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import re
+from pathling.spark import Dfs
+
+
+def test_dfs_temp_dir(pathling_ctx):
+    dfs = Dfs(pathling_ctx.spark)
+    temp_path = dfs.get_temp_dir_path(prefix="test", qualified=True)
+    # In local setup the path should be something like:
+    # file:/tmp/hadoop-username/test-8e4756c1-46e4-44a5-b36d-d6afff1b168a
+
+    # Validate the format of the temp path using regex
+    regex_pattern = r'^file:/tmp/hadoop-[^/]+/test-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$'
+    assert re.match(regex_pattern, temp_path), f"Temp path {temp_path} does not match expected format"
+
+
+def test_dfs_operations(pathling_ctx):
+    dfs = Dfs(pathling_ctx.spark)
+    temp_path = dfs.get_temp_dir_path(prefix="test", qualified=True)
+    # Check if the temporary directory exists (it should not exist yet)
+    assert not dfs.exists(temp_path), f"Temporary path {temp_path} should not exist before creation"
+    assert dfs.mkdirs(temp_path), f"Temporary path {temp_path} can be created"
+    assert dfs.exists(temp_path), f"Temporary path {temp_path} should exist after creation"
+    dfs.delete(temp_path, recursive=True)
+    assert not dfs.exists(temp_path), f"Temporary path {temp_path} should not exist after deletion"
--- pathling-8.0.0.dev1/examples/member_of_old.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright 2023 Commonwealth Scientific and Industrial Research
-# Organisation (CSIRO) ABN 41 687 119 230.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-from pathling import PathlingContext
-from pathling.functions import to_coding, to_ecl_value_set
-
-HERE = os.path.abspath(os.path.dirname(__file__))
-
-pc = PathlingContext.create()
-
-csv = pc.spark.read.options(header=True).csv(
-    f'file://{os.path.join(HERE, "data/csv/conditions.csv")}'
-)
-
-result = pc.member_of(
-    csv,
-    to_coding(csv.CODE, "http://snomed.info/sct"),
-    to_ecl_value_set(
-        """
-        << 64572001|Disease| : (
-          << 370135005|Pathological process| = << 441862004|Infectious process|,
-          << 246075003|Causative agent| = << 49872002|Virus|
-        )
-        """
-    ),
-    "VIRAL_INFECTION",
-)
-result.select("CODE", "DESCRIPTION", "VIRAL_INFECTION").show()
--- pathling-8.0.0.dev1/examples/subsumes_old.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# Copyright 2023 Commonwealth Scientific and Industrial Research
-# Organisation (CSIRO) ABN 41 687 119 230.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-from pathling import PathlingContext
-from pathling.coding import Coding
-from pathling.functions import to_coding
-
-HERE = os.path.abspath(os.path.dirname(__file__))
-
-pc = PathlingContext.create()
-
-csv = pc.spark.read.options(header=True).csv(
-    f'file://{os.path.join(HERE, "data/csv/conditions.csv")}'
-)
-first_3 = csv.limit(3)
-cross_join = first_3.selectExpr(
-    "CODE as LEFT", "DESCRIPTION as LEFT_DESCRIPTION"
-).crossJoin(first_3.selectExpr("CODE as RIGHT", "DESCRIPTION as RIGHT_DESCRIPTION"))
-
-result_1 = pc.subsumes(
-    cross_join,
-    "SUBSUMES",
-    left_coding_column=to_coding(cross_join.LEFT, "http://snomed.info/sct"),
-    right_coding_column=to_coding(cross_join.RIGHT, "http://snomed.info/sct"),
-)
-result_2 = pc.subsumes(
-    result_1,
-    "LEFT_IS_ENT",
-    # 232208008 |Ear, nose and throat disorder|
-    left_coding=Coding("http://snomed.info/sct", "232208008"),
-    right_coding_column=to_coding(cross_join.LEFT, "http://snomed.info/sct"),
-)
-result_2.select(
-    "LEFT", "RIGHT", "LEFT_DESCRIPTION", "RIGHT_DESCRIPTION", "SUBSUMES", "LEFT_IS_ENT"
-).show()
--- pathling-8.0.0.dev1/examples/translate_old.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright 2023 Commonwealth Scientific and Industrial Research
-# Organisation (CSIRO) ABN 41 687 119 230.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-from pathling import PathlingContext, to_coding
-
-HERE = os.path.abspath(os.path.dirname(__file__))
-
-pc = PathlingContext.create()
-
-csv = pc.spark.read.options(header=True).csv(
-    f'file://{os.path.join(HERE, "data/csv/conditions.csv")}'
-)
-
-# Translate codings to Read CTV3 using the map that ships with SNOMED CT.
-result = pc.translate(
-    csv,
-    to_coding(csv.CODE, "http://snomed.info/sct"),
-    "http://snomed.info/sct/900000000000207008?fhir_cm=900000000000497000",
-    output_column_name="READ_CODE",
-)
-result = result.withColumn("READ_CODE", result.READ_CODE.code)
-result.select("CODE", "DESCRIPTION", "READ_CODE").show()