pathling 8.0.0.dev1__tar.gz → 8.0.0.dev3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pathling-8.0.0.dev1/pathling.egg-info → pathling-8.0.0.dev3}/PKG-INFO +1 -1
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/fhir_view.py +0 -1
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/_version.py +1 -1
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/bulk.py +106 -30
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/context.py +1 -109
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/datasink.py +12 -19
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/datasource.py +33 -13
- pathling-8.0.0.dev3/pathling/spark.py +89 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3/pathling.egg-info}/PKG-INFO +1 -1
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling.egg-info/SOURCES.txt +3 -3
- pathling-8.0.0.dev3/tests/test_bulk.py +62 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/tests/test_datasource.py +68 -4
- pathling-8.0.0.dev3/tests/test_spark.py +39 -0
- pathling-8.0.0.dev1/examples/member_of_old.py +0 -42
- pathling-8.0.0.dev1/examples/subsumes_old.py +0 -49
- pathling-8.0.0.dev1/examples/translate_old.py +0 -36
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/LICENSE +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/MANIFEST.in +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/README.md +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/bulk.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/data/bundles/Bennett146_Swaniawski813_704c9750-f6e6-473b-ee83-fbd48e07fe3f.json +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/data/bundles/Dino214_Parisian75_40d82b80-b682-cd8b-da6d-396809878641.json +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/data/resources/Condition.ndjson +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/data/resources/Patient.ndjson +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/designation.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/display.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/encode_bundles.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/encode_resources.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/member_of.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/property_of.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/subsumes.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/examples/translate.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/__init__.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/coding.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/core.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/fhir.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/functions.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling/udfs.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling.egg-info/dependency_links.txt +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling.egg-info/requires.txt +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/pathling.egg-info/top_level.txt +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/setup.cfg +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/setup.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/tests/test_encoders.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/tests/test_functions.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/tests/test_udfs.py +0 -0
- {pathling-8.0.0.dev1 → pathling-8.0.0.dev3}/tests/test_view.py +0 -0
pathling/bulk.py

@@ -13,16 +13,80 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from
-from
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from typing import List, Optional, Tuple, Callable
 
-from
+from py4j.java_gateway import JavaObject, JVMView
+from pyspark.sql import SparkSession
+
+
+@dataclass
+class FileResult:
+    """
+    Represents the result of a single file export operation.
+    """
+    source: str
+    """
+    The source URL of the exported file.
+    """
+    destination: str
+    """
+    The destination URL where the file was saved.
+    """
+    size: int
+    """
+    The size of the exported file in bytes.
+    """
+
+
+@dataclass
+class ExportResult:
+    """
+    Represents the result of a bulk export operation.
+    """
+    transaction_time: datetime
+    """
+    The time at which the transaction was processed at the server.
+    Corresponds to `transactionTime` in the bulk export response.
+    """
+    results: List[FileResult]
+    """
+    A list of FileResult objects representing the exported files.
+    """
+
+    @classmethod
+    def from_java(cls, java_result: JavaObject) -> 'ExportResult':
+        """
+        Create an ExportResult from a Java export result object.
+
+        :param java_result: The Java export result object
+        :return: A Python ExportResult object
+        """
+        # Convert transaction time from Java Instant to Python datetime
+        transaction_time = datetime.fromtimestamp(
+            java_result.getTransactionTime().toEpochMilli() / 1000.0, tz=timezone.utc)
+
+        # Convert file results
+        file_results = [
+            FileResult(
+                source=str(java_file_result.getSource()),
+                destination=str(java_file_result.getDestination()),
+                size=java_file_result.getSize())
+            for java_file_result in java_result.getResults()
+        ]
+
+        return cls(
+            transaction_time=transaction_time,
+            results=file_results
+        )
 
 
 class BulkExportClient:
     """
     A client for exporting data from the FHIR Bulk Data Access API.
     """
+
     def __init__(self, java_client):
         """
         Create a new BulkExportClient that wraps a Java BulkExportClient.

@@ -31,26 +95,27 @@ class BulkExportClient:
         """
         self._java_client = java_client
 
-    def export(self):
+    def export(self) -> ExportResult:
         """
         Export data from the FHIR server.
 
-        :return: The result of the export operation
+        :return: The result of the export operation as a Python ExportResult object
         """
-
+        java_result = self._java_client.export()
+        return ExportResult.from_java(java_result)
 
     @classmethod
     def _configure_builder(cls, jvm, builder, fhir_endpoint_url: str, output_dir: str,
-
-
-
-
-
-
-
-
-
-
+                           output_format: str = "application/fhir+ndjson",
+                           since: Optional[datetime] = None,
+                           types: Optional[List[str]] = None,
+                           elements: Optional[List[str]] = None,
+                           include_associated_data: Optional[List[str]] = None,
+                           type_filters: Optional[List[str]] = None,
+                           output_extension: str = "ndjson",
+                           timeout: Optional[int] = None,
+                           max_concurrent_downloads: int = 10,
+                           auth_config: Optional[dict] = None):
         """
         Configure common builder parameters.
 

@@ -119,7 +184,7 @@ class BulkExportClient:
 
         if auth_config is not None:
             auth_builder = jvm.au.csiro.fhir.auth.AuthConfig.builder()
-
+
             # Set defaults to match Java class
             auth_builder.enabled(False)
             auth_builder.useSMART(True)

@@ -150,11 +215,11 @@ class BulkExportClient:
         builder.withAuthConfig(auth_config_obj)
 
     @classmethod
-    def for_system(cls,
+    def for_system(cls, spark, *args, **kwargs) -> 'BulkExportClient':
         """
         Create a builder for a system-level export.
 
-        :param
+        :param spark: The SparkSession instance
         :param fhir_endpoint_url: The URL of the FHIR server to export from
         :param output_dir: The directory to write the output files to
         :param output_format: The format of the output data

@@ -169,18 +234,17 @@ class BulkExportClient:
         :param auth_config: Optional authentication configuration dictionary
         :return: A BulkExportClient configured for system-level export
         """
-
-        builder = client_class.systemBuilder()  # Returns a builder directly
+        builder, jvm = cls._create_builder(spark, lambda bc: bc.systemBuilder())
         cls._configure_builder(jvm, builder, *args, **kwargs)
         return cls(builder.build())
 
-    @classmethod
-    def for_group(cls,
+    @classmethod
+    def for_group(cls, spark, fhir_endpoint_url: str, output_dir: str,
                   group_id: str, *args, **kwargs) -> 'BulkExportClient':
         """
         Create a builder for a group-level export.
 
-        :param
+        :param spark: The SparkSession instance
         :param fhir_endpoint_url: The URL of the FHIR server to export from
         :param output_dir: The directory to write the output files to
         :param group_id: The ID of the group to export

@@ -196,19 +260,18 @@ class BulkExportClient:
         :param auth_config: Optional authentication configuration dictionary
         :return: A BulkExportClient configured for group-level export
         """
-        client_class = jvm.au.csiro.fhir.export.BulkExportClient
         # Pass group_id directly to groupBuilder
-        builder =
+        builder, jvm = cls._create_builder(spark, lambda bc: bc.groupBuilder(group_id))
         cls._configure_builder(jvm, builder, fhir_endpoint_url, output_dir, *args, **kwargs)
         return cls(builder.build())
 
     @classmethod
-    def for_patient(cls,
+    def for_patient(cls, spark, fhir_endpoint_url: str, output_dir: str,
                     patients: Optional[List[str]] = None, *args, **kwargs) -> 'BulkExportClient':
         """
         Create a builder for a patient-level export.
 
-        :param
+        :param spark: The SparkSession instance
         :param fhir_endpoint_url: The URL of the FHIR server to export from
         :param output_dir: The directory to write the output files to
         :param patients: List of patient references to include

@@ -224,11 +287,24 @@ class BulkExportClient:
         :param auth_config: Optional authentication configuration dictionary
         :return: A BulkExportClient configured for patient-level export
         """
-
-        builder = client_class.patientBuilder()  # Returns a builder directly
+        builder, jvm = cls._create_builder(spark, lambda bc: bc.patientBuilder())
         if patients is not None:
             for patient in patients:
                 ref = jvm.au.csiro.fhir.model.Reference.of(patient)
                 builder.withPatient(ref)
         cls._configure_builder(jvm, builder, fhir_endpoint_url, output_dir, *args, **kwargs)
         return cls(builder.build())
+
+    @classmethod
+    def _create_builder(cls,
+                        spark: SparkSession,
+                        factory_f: Callable[[JavaObject], JavaObject]) -> Tuple[
+        JavaObject, JVMView]:
+
+        jvm: JVMView = spark._jvm
+        client_class = jvm.au.csiro.fhir.export.BulkExportClient
+        builder: JavaObject = factory_f(client_class)
+        builder = builder.withFileStoreFactory(
+            jvm.au.csiro.filestore.hdfs.HdfsFileStoreFactory(spark._jsc.sc().hadoopConfiguration())
+        )
+        return (builder, jvm)
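The hunks above add the FileResult and ExportResult dataclasses, thread a SparkSession through the builder factory methods, and route file writing through an HDFS-backed file store factory. A minimal usage sketch of this API, inferred from the signatures above (the endpoint URL and output directory are illustrative placeholders, not values from the diff):

    from pyspark.sql import SparkSession
    from pathling.bulk import BulkExportClient

    spark = SparkSession.builder.getOrCreate()

    # System-level export; keyword arguments are forwarded to _configure_builder.
    result = BulkExportClient.for_system(
        spark,
        fhir_endpoint_url="https://example.org/fhir",  # placeholder
        output_dir="/tmp/bulk-export",                 # placeholder
        types=["Patient", "Condition"],
    ).export()

    # export() now returns a Python ExportResult rather than a Java object.
    print(result.transaction_time)
    for file_result in result.results:
        print(file_result.source, "->", file_result.destination, file_result.size)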
pathling/context.py

@@ -15,9 +15,8 @@
 
 # noinspection PyPackageRequirements
 
-from deprecated import deprecated
 from py4j.java_gateway import JavaObject
-from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql import DataFrame, SparkSession
 from typing import Optional, Sequence, TYPE_CHECKING
 
 from pathling._version import (

@@ -26,7 +25,6 @@ from pathling._version import (
     __delta_version__,
     __hadoop_version__,
 )
-from pathling.coding import Coding
 from pathling.fhir import MimeType
 
 if TYPE_CHECKING:

@@ -34,8 +32,6 @@ if TYPE_CHECKING:
 
 __all__ = ["PathlingContext"]
 
-EQ_EQUIVALENT = "equivalent"
-
 
 class StorageType:
     MEMORY: str = "memory"

@@ -356,110 +352,6 @@ class PathlingContext:
         )
     )
 
-    @deprecated(reason="You should use the 'udfs.member_of' UDF instead")
-    def member_of(
-        self,
-        df: DataFrame,
-        coding_column: Column,
-        value_set_uri: str,
-        output_column_name: str,
-    ):
-        """
-        Takes a dataframe with a Coding column as input. A new column is created which contains a
-        Boolean value, indicating whether the input Coding is a member of the specified FHIR
-        ValueSet.
-
-        :param df: a DataFrame containing the input data
-        :param coding_column: a Column containing a struct representation of a Coding
-        :param value_set_uri: an identifier for a FHIR ValueSet
-        :param output_column_name: the name of the result column
-        :return: A new dataframe with an additional column containing the result of the operation
-        """
-        return self._wrap_df(
-            self._jpc.memberOf(
-                df._jdf, coding_column._jc, value_set_uri, output_column_name
-            )
-        )
-
-    @deprecated(reason="You should use the 'udfs.translate' UDF instead")
-    def translate(
-        self,
-        df: DataFrame,
-        coding_column: Column,
-        concept_map_uri: str,
-        reverse: Optional[bool] = False,
-        equivalence: Optional[str] = EQ_EQUIVALENT,
-        target: Optional[str] = None,
-        output_column_name: Optional[str] = "result",
-    ):
-        """
-        Takes a dataframe with a Coding column as input. A new column is created which contains
-        the array of Codings value with translation targets from the specified FHIR ConceptMap.
-        There may be more than one target concept for each input concept.
-
-        :param df: a DataFrame containing the input data
-        :param coding_column: a Column containing a struct representation of a Coding
-        :param concept_map_uri: an identifier for a FHIR ConceptMap
-        :param reverse: the direction to traverse the map - false results in "source to target"
-            mappings, while true results in "target to source"
-        :param equivalence: a comma-delimited set of values from the ConceptMapEquivalence ValueSet
-        :param target: identifies the value set in which a translation is sought. If there is no
-            target specified, the server should return all known translations.
-        :param output_column_name: the name of the result column
-        :return: A new dataframe with an additional column containing the result of the operation.
-        """
-        return self._wrap_df(
-            self._jpc.translate(
-                df._jdf,
-                coding_column._jc,
-                concept_map_uri,
-                reverse,
-                equivalence,
-                target,
-                output_column_name,
-            )
-        )
-
-    @deprecated(reason="You should use the 'udfs.subsumes' UDF instead")
-    def subsumes(
-        self,
-        df: DataFrame,
-        output_column_name: str,
-        left_coding_column: Optional[Column] = None,
-        right_coding_column: Optional[Column] = None,
-        left_coding: Optional[Coding] = None,
-        right_coding: Optional[Coding] = None,
-    ):
-        """
-        Takes a dataframe with two Coding columns. A new column is created which contains a
-        Boolean value, indicating whether the left Coding subsumes the right Coding.
-
-        :param df: a DataFrame containing the input data
-        :param left_coding_column: a Column containing a struct representation of a Coding,
-            for the left-hand side of the subsumption test
-        :param right_coding_column: a Column containing a struct representation of a Coding,
-            for the right-hand side of the subsumption test
-        :param left_coding: a Coding object for the left-hand side of the subsumption test
-        :param right_coding: a Coding object for the right-hand side of the subsumption test
-        :param output_column_name: the name of the result column
-        :return: A new dataframe with an additional column containing the result of the operation.
-        """
-        if (left_coding_column is None and left_coding is None) or (
-            right_coding_column is None and right_coding is None
-        ):
-            raise ValueError(
-                "Must provide either left_coding_column or left_coding, and either "
-                "right_coding_column or right_coding"
-            )
-        left_column = left_coding.to_literal() if left_coding else left_coding_column
-        right_column = (
-            right_coding.to_literal() if right_coding else right_coding_column
-        )
-        return self._wrap_df(
-            self._jpc.subsumes(
-                df._jdf, left_column._jc, right_column._jc, output_column_name
-            )
-        )
 
     @property
     def read(self) -> "DataSources":
pathling/datasink.py

@@ -21,15 +21,6 @@ if TYPE_CHECKING:
     from pathling.datasource import DataSource
 
 
-class ImportMode:
-    """
-    Constants that represent the different import modes.
-    """
-
-    OVERWRITE: str = "overwrite"
-    MERGE: str = "merge"
-
-
 class SaveMode:
     """
     Constants that represent the different save modes.

@@ -38,12 +29,14 @@ class SaveMode:
     APPEND: Append the new data to the existing data.
     IGNORE: Only save the data if the file does not already exist.
     ERROR: Raise an error if the file already exists.
+    MERGE: Merge the new data with the existing data based on resource ID.
     """
 
     OVERWRITE: str = "overwrite"
     APPEND: str = "append"
     IGNORE: str = "ignore"
     ERROR: str = "error"
+    MERGE: str = "merge"
 
 
 class DataSinks(SparkConversionsMixin):

@@ -82,9 +75,9 @@ class DataSinks(SparkConversionsMixin):
             wrapped_mapper = StringMapper(
                 self.spark._jvm._gateway_client, file_name_mapper
             )
-            self._datasinks.ndjson(path,
+            self._datasinks.saveMode(save_mode).ndjson(path, wrapped_mapper)
         else:
-            self._datasinks.ndjson(path
+            self._datasinks.saveMode(save_mode).ndjson(path)
 
     def parquet(self, path: str, save_mode: Optional[str] = SaveMode.ERROR) -> None:
         """

@@ -97,35 +90,35 @@ class DataSinks(SparkConversionsMixin):
         - "ignore" will only save the data if the file does not already exist.
         - "error" will raise an error if the file already exists.
         """
-        self._datasinks.parquet(path
+        self._datasinks.saveMode(save_mode).parquet(path)
 
     def delta(
-        self, path: str,
+        self, path: str, save_mode: Optional[str] = SaveMode.OVERWRITE
     ) -> None:
         """
         Writes the data to a directory of Delta files.
 
         :param path: The URI of the directory to write the files to.
-        :param
+        :param save_mode: The save mode to use when writing the data - "overwrite" will
         overwrite any existing data, "merge" will merge the new data with the existing data based
         on resource ID.
         """
-        self._datasinks.delta(path
+        self._datasinks.saveMode(save_mode).delta(path)
 
     def tables(
         self,
         schema: Optional[str] = None,
-
+        save_mode: Optional[str] = SaveMode.OVERWRITE,
     ) -> None:
         """
         Writes the data to a set of tables in the Spark catalog.
 
         :param schema: The name of the schema to write the tables to.
-        :param
+        :param save_mode: The save mode to use when writing the data - "overwrite" will
        overwrite any existing data, "merge" will merge the new data with the existing data based
        on resource ID.
        """
        if schema:
-            self._datasinks.tables(
+            self._datasinks.saveMode(save_mode).tables(schema)
        else:
-            self._datasinks.tables(
+            self._datasinks.saveMode(save_mode).tables()
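With ImportMode removed, its "merge" behaviour now lives on SaveMode, and each sink call first applies saveMode(save_mode) on the Java side. A hedged sketch of how the new save_mode parameter might be used, based on the signatures above (paths are placeholders, and the availability of .write on an NDJSON-backed data source is assumed by analogy with the Delta test shown later in this diff):

    from pathling import PathlingContext

    pc = PathlingContext.create()
    data = pc.read.ndjson("/data/ndjson")  # placeholder path

    # delta() and tables() default to SaveMode.OVERWRITE; "merge" merges on resource ID.
    data.write.delta("/data/delta", save_mode="merge")

    # parquet() defaults to SaveMode.ERROR, so an explicit mode is needed to overwrite.
    data.write.parquet("/data/parquet", save_mode="overwrite")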
pathling/datasource.py

@@ -25,6 +25,7 @@ from pyspark.sql import DataFrame
 from pathling import PathlingContext
 from pathling.core import StringToStringSetMapper, SparkConversionsMixin
 from pathling.fhir import MimeType
+from pathling.spark import Dfs
 
 if TYPE_CHECKING:
     from pathling.datasink import DataSinks

@@ -109,6 +110,10 @@ class DataSources(SparkConversionsMixin):
     A factory for creating data sources.
     """
 
+    # Default extension and MIME type for NDJSON files
+    NDJSON_EXTENSION = "ndjson"
+    NDJSON_MIMETYPE = "application/fhir+ndjson"
+
     def __init__(self, pathling: PathlingContext):
         SparkConversionsMixin.__init__(self, pathling.spark)
         self._pc = pathling

@@ -120,7 +125,7 @@ class DataSources(SparkConversionsMixin):
     def ndjson(
         self,
         path,
-        extension: Optional[str] =
+        extension: Optional[str] = None,
         file_name_mapper: Callable[[str], Sequence[str]] = None,
     ) -> DataSource:
         """

@@ -134,6 +139,9 @@ class DataSources(SparkConversionsMixin):
             types that it contains.
         :return: A DataSource object that can be used to run queries against the data.
         """
+
+        extension = extension or DataSources.NDJSON_EXTENSION
+
         if file_name_mapper:
             wrapped_mapper = StringToStringSetMapper(
                 self.spark._jvm._gateway_client, file_name_mapper

@@ -221,34 +229,35 @@ class DataSources(SparkConversionsMixin):
     def bulk(
         self,
         fhir_endpoint_url: str,
-        output_dir: str,
+        output_dir: Optional[str] = None,
+        overwrite: bool = True,
         group_id: Optional[str] = None,
         patients: Optional[List[str]] = None,
-        output_format: str = "application/fhir+ndjson",
         since: Optional[datetime] = None,
         types: Optional[List[str]] = None,
         elements: Optional[List[str]] = None,
         include_associated_data: Optional[List[str]] = None,
         type_filters: Optional[List[str]] = None,
-        output_extension: str = "ndjson",
         timeout: Optional[int] = None,
         max_concurrent_downloads: int = 10,
         auth_config: Optional[Dict] = None
     ) -> DataSource:
         """
-        Creates a data source from a FHIR Bulk Data Access API endpoint.
-
+        Creates a data source from a FHIR Bulk Data Access API endpoint.
+        Currently only supports bulk export in the ndjson format.
+
         :param fhir_endpoint_url: The URL of the FHIR server to export from
-        :param output_dir: The directory to write the output files to
+        :param output_dir: The directory to write the output files to.
+            This should be a valid path in the Spark's filesystem.
+            If set to `None`, a temporary directory will be used instead.
+        :param overwrite: Whether to overwrite the output directory if it already exists. Defaults to True.
         :param group_id: Optional group ID for group-level export
         :param patients: Optional list of patient references for patient-level export
-        :param output_format: The format of the output data
         :param since: Only include resources modified after this timestamp
         :param types: List of FHIR resource types to include
         :param elements: List of FHIR elements to include
         :param include_associated_data: Pre-defined set of FHIR resources to include
         :param type_filters: FHIR search queries to filter resources
-        :param output_extension: File extension for output files. Defaults to "ndjson"
         :param timeout: Optional timeout duration in seconds
         :param max_concurrent_downloads: Maximum number of concurrent downloads. Defaults to 10
         :param auth_config: Optional authentication configuration dictionary with the following possible keys:

@@ -265,10 +274,21 @@ class DataSources(SparkConversionsMixin):
         """
         from pathling.bulk import BulkExportClient
 
+        dfs = Dfs(self._pc.spark)
+
+        # If `output_dir` is not provided, create a temporary directory
+        output_dir = output_dir or dfs.get_temp_dir_path(prefix="tmp-bulk-export", qualified=True)
+        # If `overwrite`, then ensure the output directory does not exist
+        if overwrite and dfs.exists(output_dir):
+            dfs.delete(output_dir, recursive=True)
+
+        output_format = DataSources.NDJSON_MIMETYPE
+        output_extension = DataSources.NDJSON_EXTENSION
+
         # Create appropriate client based on parameters
         if group_id is not None:
             client = BulkExportClient.for_group(
-                self.spark
+                self.spark,
                 fhir_endpoint_url=fhir_endpoint_url,
                 output_dir=output_dir,
                 group_id=group_id,

@@ -285,7 +305,7 @@ class DataSources(SparkConversionsMixin):
             )
         elif patients is not None:
             client = BulkExportClient.for_patient(
-                self.spark
+                self.spark,
                 fhir_endpoint_url=fhir_endpoint_url,
                 output_dir=output_dir,
                 patients=patients,

@@ -302,7 +322,7 @@ class DataSources(SparkConversionsMixin):
             )
         else:
             client = BulkExportClient.for_system(
-                self.spark
+                self.spark,
                 fhir_endpoint_url=fhir_endpoint_url,
                 output_dir=output_dir,
                 output_format=output_format,

@@ -318,7 +338,7 @@ class DataSources(SparkConversionsMixin):
             )
 
         # Perform the export
-
+        client.export()
 
         # Return a DataSource that reads from the exported files
         return self.ndjson(output_dir)
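bulk() now treats output_dir as optional: when it is omitted, a temporary directory is obtained from the new Dfs helper, and the overwrite flag clears an existing directory before the export runs. A sketch of both call styles, with placeholder URLs and paths (not values from the diff):

    from pathling import PathlingContext

    pc = PathlingContext.create()

    # No output_dir: the export lands in a temporary directory in Spark's filesystem.
    temp_source = pc.read.bulk(fhir_endpoint_url="https://example.org/fhir")

    # Explicit output_dir: with overwrite=True (the default) an existing directory
    # is deleted before the export starts.
    persistent_source = pc.read.bulk(
        fhir_endpoint_url="https://example.org/fhir",
        output_dir="/data/bulk-export",
        types=["Patient", "Condition"],
    )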
pathling/spark.py (new file)

@@ -0,0 +1,89 @@
+# Copyright 2025 Commonwealth Scientific and Industrial Research
+# Organisation (CSIRO) ABN 41 687 119 230.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import uuid
+
+from py4j.java_gateway import JavaObject, JVMView
+from pyspark import SparkContext
+from pyspark.sql import SparkSession
+
+
+class Dfs:
+    """A class for interacting with the Hadoop Distributed File System (HDFS) in Spark."""
+
+    def __init__(self, spark: SparkSession):
+        """
+        Initialize the Dfs class with a SparkSession.
+
+        :param spark: SparkSession instance
+        """
+        if not spark:
+            raise ValueError("SparkSession must be provided")
+        sc: SparkContext = spark.sparkContext
+        self._jvm: JVMView = sc._jvm
+        self._hadoop_conf: JavaObject = sc._jsc.hadoopConfiguration()
+        self._fs = self._jvm.org.apache.hadoop.fs.FileSystem.get(self._hadoop_conf)
+
+    def get_temp_dir_path(self, prefix: str = "tmp-app", qualified=True) -> str:
+        """
+        Returns a unique path for a temporary directory in Spark's filesystem.
+
+        The path is constructed by appending a UUID to the base temporary directory,
+        ensuring uniqueness for each call.
+        The directory itself is not created, only the path is returned.
+
+        :param prefix: String to insert between the base directory and the UUID (default: "tmp-app").
+        :param qualified: If True, returns a fully qualified Hadoop path; if False, returns a raw path string.
+        :return: String representing the unique temporary directory path.
+        """
+        base_tmp_dir = self._hadoop_conf.get("hadoop.tmp.dir")
+        if not base_tmp_dir:
+            raise ValueError("`hadoop.tmp.dir` must be set in Hadoop configuration.")
+        uuid_suffix = str(uuid.uuid4())
+        base_tmp_path = self._jvm.org.apache.hadoop.fs.Path(base_tmp_dir)
+        tmp_path = self._jvm.org.apache.hadoop.fs.Path(base_tmp_path, f"{prefix}-{uuid_suffix}")
+        return self._fs.makeQualified(tmp_path).toString() if qualified else tmp_path.toString()
+
+    def exists(self, path: str) -> bool:
+        """
+        Check if a given path exists in the filesystem.
+
+        :param path: Path to check for existence.
+        :return: True if the path exists, False otherwise.
+        """
+        hadoop_path = self._jvm.org.apache.hadoop.fs.Path(path)
+        return self._fs.exists(hadoop_path)
+
+    def delete(self, path: str, recursive: bool = False) -> bool:
+        """
+        Delete a file or directory at the specified path.
+
+        :param path: Path to the file or directory to delete.
+        :param recursive: If True, delete directories and their contents recursively.
+        :return: True if deletion was successful, False otherwise.
+        """
+        hadoop_path = self._jvm.org.apache.hadoop.fs.Path(path)
+        return self._fs.delete(hadoop_path, recursive)
+
+    def mkdirs(self, path: str) -> bool:
+        """
+        Create a directory at the specified path.
+
+        :param path: Path to the directory to create.
+        :return: True if the directory was created successfully, False otherwise.
+        """
+        hadoop_path = self._jvm.org.apache.hadoop.fs.Path(path)
+        return self._fs.mkdirs(hadoop_path)
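The new Dfs class wraps Hadoop FileSystem operations obtained through py4j, and is what bulk() uses for its temporary directories. A small sketch of its use, assuming an existing Spark session:

    from pyspark.sql import SparkSession
    from pathling.spark import Dfs

    spark = SparkSession.builder.getOrCreate()
    dfs = Dfs(spark)

    # A unique, fully qualified path under hadoop.tmp.dir; nothing is created yet.
    path = dfs.get_temp_dir_path(prefix="tmp-bulk-export", qualified=True)
    assert not dfs.exists(path)

    dfs.mkdirs(path)                  # create the directory
    dfs.delete(path, recursive=True)  # remove it again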
pathling.egg-info/SOURCES.txt

@@ -10,12 +10,9 @@ examples/encode_bundles.py
 examples/encode_resources.py
 examples/fhir_view.py
 examples/member_of.py
-examples/member_of_old.py
 examples/property_of.py
 examples/subsumes.py
-examples/subsumes_old.py
 examples/translate.py
-examples/translate_old.py
 examples/data/bundles/Bennett146_Swaniawski813_704c9750-f6e6-473b-ee83-fbd48e07fe3f.json
 examples/data/bundles/Dino214_Parisian75_40d82b80-b682-cd8b-da6d-396809878641.json
 examples/data/resources/Condition.ndjson

@@ -30,14 +27,17 @@ pathling/datasink.py
 pathling/datasource.py
 pathling/fhir.py
 pathling/functions.py
+pathling/spark.py
 pathling/udfs.py
 pathling.egg-info/PKG-INFO
 pathling.egg-info/SOURCES.txt
 pathling.egg-info/dependency_links.txt
 pathling.egg-info/requires.txt
 pathling.egg-info/top_level.txt
+tests/test_bulk.py
 tests/test_datasource.py
 tests/test_encoders.py
 tests/test_functions.py
+tests/test_spark.py
 tests/test_udfs.py
 tests/test_view.py
tests/test_bulk.py (new file)

@@ -0,0 +1,62 @@
+# Copyright 2023 Commonwealth Scientific and Industrial Research
+# Organisation (CSIRO) ABN 41 687 119 230.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import os
+
+from flask import Response
+
+from pathling.bulk import BulkExportClient
+
+
+def test_bulk_client(pathling_ctx, mock_server, temp_dir):
+    @mock_server.route("/fhir/$export", methods=["GET"])
+    def export():
+        resp = Response(status=202)
+        resp.headers["content-location"] = mock_server.url("/pool")
+        return resp
+
+    @mock_server.route("/pool", methods=["GET"])
+    def pool():
+        return dict(
+            transactionTime="1970-01-01T01:02:03.004Z",
+            output=[
+                dict(type="Patient", url=mock_server.url("/download"), count=1),
+            ],
+        )
+
+    @mock_server.route("/download", methods=["GET"])
+    def download():
+        return '{"id":"123"}'
+
+    output_dir = os.path.join(temp_dir, "export-output")
+
+    with mock_server.run():
+        result = BulkExportClient.for_system(
+            pathling_ctx.spark,
+            fhir_endpoint_url=mock_server.url("/fhir"),
+            output_dir=output_dir
+        ).export()
+
+    assert os.path.isdir(output_dir)
+    assert os.path.exists(os.path.join(output_dir, "_SUCCESS"))
+    assert os.path.exists(os.path.join(output_dir, "Patient.0000.ndjson"))
+    with open(os.path.join(output_dir, "Patient.0000.ndjson")) as f:
+        assert f.read() == '{"id":"123"}'
+    assert result.transaction_time.isoformat() == "1970-01-01T01:02:03.004000+00:00"
+    assert 1 == len(result.results)
+    file_result = result.results[0]
+    assert 12 == file_result.size
+    assert os.path.join(output_dir, "Patient.0000.ndjson") == file_result.destination
+    assert mock_server.url("/download") == file_result.source
tests/test_datasource.py

@@ -14,16 +14,27 @@
 # limitations under the License.
 
 import os
-from datetime import datetime, timezone
 from tempfile import TemporaryDirectory
-from unittest.mock import Mock, patch
 
+from flask import Response
 from pyspark.sql import Row, DataFrame
 from pytest import fixture
 
 from pathling.datasource import DataSource
 
 
+@fixture(scope="function", autouse=True)
+def func_temp_dir(temp_dir):
+    """
+    Fixture to create a temporary directory for each test function.
+    :param temp_dir:
+    :return: existing temporary directory for each test function.
+    """
+    temp_ndjson_dir = TemporaryDirectory(dir=temp_dir, prefix="function")
+    yield temp_ndjson_dir.name
+    temp_ndjson_dir.cleanup()
+
+
 @fixture(scope="module")
 def ndjson_test_data_dir(test_data_dir):
     return os.path.join(test_data_dir, "ndjson")

@@ -65,6 +76,32 @@ def temp_delta_dir(temp_dir):
     temp_delta_dir.cleanup()
 
 
+@fixture(scope="function")
+def bulk_server(mock_server, ndjson_test_data_dir):
+    @mock_server.route("/fhir/$export", methods=["GET"])
+    def export():
+        resp = Response(status=202)
+        resp.headers["content-location"] = mock_server.url("/pool")
+        return resp
+
+    @mock_server.route("/pool", methods=["GET"])
+    def pool():
+        return dict(
+            transactionTime="1970-01-01T00:00:00.000Z",
+            output=[
+                dict(type=resource, url=mock_server.url(f"/download/{resource}"), count=1) for
+                resource in ["Patient", "Condition"]
+            ],
+        )
+
+    @mock_server.route("/download/<resource>", methods=["GET"])
+    def download(resource):
+        with open(os.path.join(ndjson_test_data_dir, f"{resource}.ndjson"), "r") as f:
+            return f.read()
+
+    return mock_server
+
+
 ResultRow = Row("count")
 
 

@@ -160,7 +197,7 @@ def test_datasource_delta(delta_test_data_dir, temp_delta_dir, pathling_ctx):
 
 def test_datasource_delta_merge(delta_test_data_dir, temp_delta_dir, pathling_ctx):
     pathling_ctx.read.delta(delta_test_data_dir).write.delta(
-        temp_delta_dir,
+        temp_delta_dir, save_mode="merge"
     )
     data_source = pathling_ctx.read.delta(temp_delta_dir)
 

@@ -193,6 +230,34 @@ def test_datasource_tables_schema(ndjson_test_data_dir, pathling_ctx):
     ]
 
 
+def test_datasource_bulk_with_temp_dir(pathling_ctx, bulk_server):
+    # !!! this directory cannot exist for the datasource to work
+    with bulk_server.run():
+        data_source = pathling_ctx.read.bulk(
+            fhir_endpoint_url=bulk_server.url("/fhir")
+        )
+        result = ndjson_query(data_source)
+        assert result.columns == list(ResultRow)
+        assert result.collect() == [
+            ResultRow(71),
+        ]
+
+
+def test_datasource_bulk_with_existing_dir(pathling_ctx, bulk_server, func_temp_dir):
+    assert os.path.exists(func_temp_dir)
+    with bulk_server.run():
+        data_source = pathling_ctx.read.bulk(
+            fhir_endpoint_url=bulk_server.url("/fhir"),
+            output_dir=func_temp_dir,
+            overwrite=True  # default anyway, but explicit for clarity
+        )
+        result = ndjson_query(data_source)
+        assert result.columns == list(ResultRow)
+        assert result.collect() == [
+            ResultRow(71),
+        ]
+
+
 def ndjson_query(data_source: DataSource) -> DataFrame:
     return data_source.view(
         resource='Condition',

@@ -219,7 +284,6 @@ def bundles_query(data_source: DataSource) -> DataFrame:
     ).groupby().count()
 
 
-
 def parquet_query(data_source: DataSource) -> DataFrame:
     return ndjson_query(data_source)
 
tests/test_spark.py (new file)

@@ -0,0 +1,39 @@
+# Copyright 2023 Commonwealth Scientific and Industrial Research
+# Organisation (CSIRO) ABN 41 687 119 230.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import re
+from pathling.spark import Dfs
+
+
+def test_dfs_temp_dir(pathling_ctx):
+    dfs = Dfs(pathling_ctx.spark)
+    temp_path = dfs.get_temp_dir_path(prefix="test", qualified=True)
+    # In local setup the path should be something like:
+    # file:/tmp/hadoop-username/test-8e4756c1-46e4-44a5-b36d-d6afff1b168a
+
+    # Validate the format of the temp path using regex
+    regex_pattern = r'^file:/tmp/hadoop-[^/]+/test-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$'
+    assert re.match(regex_pattern, temp_path), f"Temp path {temp_path} does not match expected format"
+
+
+def test_dfs_operations(pathling_ctx):
+    dfs = Dfs(pathling_ctx.spark)
+    temp_path = dfs.get_temp_dir_path(prefix="test", qualified=True)
+    # Check if the temporary directory exists (it should not exist yet)
+    assert not dfs.exists(temp_path), f"Temporary path {temp_path} should not exist before creation"
+    assert dfs.mkdirs(temp_path), f"Temporary path {temp_path} can be created"
+    assert dfs.exists(temp_path), f"Temporary path {temp_path} should exist after creation"
+    dfs.delete(temp_path, recursive=True)
+    assert not dfs.exists(temp_path), f"Temporary path {temp_path} should not exist after deletion"
examples/member_of_old.py (deleted)

@@ -1,42 +0,0 @@
-# Copyright 2023 Commonwealth Scientific and Industrial Research
-# Organisation (CSIRO) ABN 41 687 119 230.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-from pathling import PathlingContext
-from pathling.functions import to_coding, to_ecl_value_set
-
-HERE = os.path.abspath(os.path.dirname(__file__))
-
-pc = PathlingContext.create()
-
-csv = pc.spark.read.options(header=True).csv(
-    f'file://{os.path.join(HERE, "data/csv/conditions.csv")}'
-)
-
-result = pc.member_of(
-    csv,
-    to_coding(csv.CODE, "http://snomed.info/sct"),
-    to_ecl_value_set(
-        """
-        << 64572001|Disease| : (
-          << 370135005|Pathological process| = << 441862004|Infectious process|,
-          << 246075003|Causative agent| = << 49872002|Virus|
-        )
-        """
-    ),
-    "VIRAL_INFECTION",
-)
-result.select("CODE", "DESCRIPTION", "VIRAL_INFECTION").show()
examples/subsumes_old.py (deleted)

@@ -1,49 +0,0 @@
-# Copyright 2023 Commonwealth Scientific and Industrial Research
-# Organisation (CSIRO) ABN 41 687 119 230.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-from pathling import PathlingContext
-from pathling.coding import Coding
-from pathling.functions import to_coding
-
-HERE = os.path.abspath(os.path.dirname(__file__))
-
-pc = PathlingContext.create()
-
-csv = pc.spark.read.options(header=True).csv(
-    f'file://{os.path.join(HERE, "data/csv/conditions.csv")}'
-)
-first_3 = csv.limit(3)
-cross_join = first_3.selectExpr(
-    "CODE as LEFT", "DESCRIPTION as LEFT_DESCRIPTION"
-).crossJoin(first_3.selectExpr("CODE as RIGHT", "DESCRIPTION as RIGHT_DESCRIPTION"))
-
-result_1 = pc.subsumes(
-    cross_join,
-    "SUBSUMES",
-    left_coding_column=to_coding(cross_join.LEFT, "http://snomed.info/sct"),
-    right_coding_column=to_coding(cross_join.RIGHT, "http://snomed.info/sct"),
-)
-result_2 = pc.subsumes(
-    result_1,
-    "LEFT_IS_ENT",
-    # 232208008 |Ear, nose and throat disorder|
-    left_coding=Coding("http://snomed.info/sct", "232208008"),
-    right_coding_column=to_coding(cross_join.LEFT, "http://snomed.info/sct"),
-)
-result_2.select(
-    "LEFT", "RIGHT", "LEFT_DESCRIPTION", "RIGHT_DESCRIPTION", "SUBSUMES", "LEFT_IS_ENT"
-).show()
examples/translate_old.py (deleted)

@@ -1,36 +0,0 @@
-# Copyright 2023 Commonwealth Scientific and Industrial Research
-# Organisation (CSIRO) ABN 41 687 119 230.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-from pathling import PathlingContext, to_coding
-
-HERE = os.path.abspath(os.path.dirname(__file__))
-
-pc = PathlingContext.create()
-
-csv = pc.spark.read.options(header=True).csv(
-    f'file://{os.path.join(HERE, "data/csv/conditions.csv")}'
-)
-
-# Translate codings to Read CTV3 using the map that ships with SNOMED CT.
-result = pc.translate(
-    csv,
-    to_coding(csv.CODE, "http://snomed.info/sct"),
-    "http://snomed.info/sct/900000000000207008?fhir_cm=900000000000497000",
-    output_column_name="READ_CODE",
-)
-result = result.withColumn("READ_CODE", result.READ_CODE.code)
-result.select("CODE", "DESCRIPTION", "READ_CODE").show()
The remaining files listed above with +0 -0 are unchanged between the two versions.