neo4j-etl-lib 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
etl_lib/core/Task.py ADDED
@@ -0,0 +1,267 @@
+ import abc
+ import logging
+ import uuid
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from datetime import datetime
+
+
+ class TaskReturn:
+     """
+     Return object for the :py:func:`~Task.execute` function, transporting result information.
+     """
+
+     success: bool
+     """Success or failure of the task."""
+     summery: dict
+     """dict holding statistics about the task performed, such as rows inserted or updated."""
+     error: str
+     """Error message."""
+
+     def __init__(self, success: bool = True, summery: dict = None, error: str = None):
+         self.success = success
+         self.summery = summery if summery else {}
+         self.error = error
+
+     def __repr__(self):
+         return f"TaskReturn({self.success=}, {self.summery=}, {self.error=})"
+
+     def __add__(self, other):
+         """
+         Adds two instances of TaskReturn.
+
+         Args:
+             other: Instance to add.
+
+         Returns:
+             New TaskReturn instance. `success` is the logical AND of both instances.
+             `summery` is the merged dict; values of keys present in both are added.
+         """
+         if not isinstance(other, TaskReturn):
+             return NotImplemented
+
+         # Merge the summery dictionaries by summing their values
+         merged_summery = self.summery.copy()
+         for key, value in other.summery.items():
+             merged_summery[key] = merged_summery.get(key, 0) + value
+
+         # Combine success values and errors
+         combined_success = self.success and other.success
+         combined_error = f"{self.error or ''} | {other.error or ''}".strip(" |")
+
+         return TaskReturn(
+             success=combined_success, summery=merged_summery, error=combined_error
+         )
+
+
+ class Task:
+     """
+     ETL job that can be executed.
+
+     Provides reporting, time tracking and error handling.
+     Implementations must provide the :py:func:`~run_internal` function.
+     """
+
+     def __init__(self, context):
+         """
+         Construct a Task object.
+
+         Args:
+             context: :py:class:`etl_lib.core.ETLContext.ETLContext` instance. Will be available to subclasses.
+         """
+         self.context = context
+         self.logger = logging.getLogger(self.__class__.__name__)
+         self.uuid = str(uuid.uuid4())
+         """Uniquely identifies a Task."""
+         self.start_time: datetime | None = None
+         """Time when :py:func:`~execute` was called; `None` before."""
+         self.end_time: datetime | None = None
+         """Time when :py:func:`~execute` has finished; `None` before."""
+         self.success: bool | None = None
+         """True if the task has finished successfully, False otherwise; `None` before the task has finished."""
+         self.summery: dict | None = None  # TODO: still in use?
+         """Summery statistics about the task performed, such as rows inserted or updated."""
+         self.error: str | None = None  # TODO: still in use?
+         self.depth: int = 0
+         """Level or depth of the task in the hierarchy. The root task is depth 0. Updated by the Reporter."""
+
+     def execute(self, **kwargs) -> TaskReturn:
+         """
+         Executes the task.
+
+         Implementations of this interface should not override this method, but provide the
+         Task functionality inside :py:func:`~run_internal`, which will be called from here.
+         Will use the :py:class:`ProgressReporter` from the :py:attr:`~context` to report status updates.
+
+         Args:
+             kwargs: will be passed to `run_internal`
+         """
+         self.context.reporter.started_task(self)
+
+         try:
+             result = self.run_internal(**kwargs)
+         except Exception as e:
+             result = TaskReturn(success=False, summery={}, error=str(e))
+
+         self.context.reporter.finished_task(
+             task=self,
+             success=result.success,
+             summery=result.summery,
+             error=result.error,
+         )
+
+         return result
+
+     @abc.abstractmethod
+     def run_internal(self, **kwargs) -> TaskReturn:
+         """
+         Place to provide the logic to be performed.
+
+         This base class provides all the housekeeping and reporting, so that implementations
+         do not need to care about them.
+         Exceptions should not be captured by implementations. They are handled by this base class.
+
+         Args:
+             kwargs: arguments passed through from :py:func:`~execute`
+         Returns:
+             An instance of :py:class:`~etl_lib.core.Task.TaskReturn`.
+         """
+         pass
+
+     def abort_on_fail(self) -> bool:
+         """
+         Should the pipeline abort when this task fails?
+
+         Returns:
+             `True` indicates that no other Tasks should be executed if :py:func:`~run_internal` fails.
+         """
+         return True
+
+     def task_name(self) -> str:
+         """
+         Option to override the name of this Task.
+
+         The name is used in reporting only.
+
+         Returns:
+             String describing the task. Defaults to the class name.
+         """
+         return self.__class__.__name__
+
+     def __repr__(self):
+         return f"Task({self.task_name()})"
+
+
+ class TaskGroup(Task):
+     """
+     Base class to wrap Tasks or TaskGroups into a hierarchy of jobs.
+
+     Implementations only need to provide the Tasks to execute as a list.
+     The summery statistics returned from the group's execute method are merged/aggregated over all tasks.
+     """
+
+     def __init__(self, context, tasks: list[Task], name: str):
+         """
+         Construct a TaskGroup object.
+
+         Args:
+             context: :py:class:`etl_lib.core.ETLContext.ETLContext` instance.
+             tasks: a list of :py:class:`etl_lib.core.Task.Task` instances.
+                 These will be executed in the order provided when :py:func:`~run_internal` is called.
+             name: short name of the TaskGroup for reporting.
+         """
+         super().__init__(context)
+         self.tasks = tasks
+         self.name = name
+
+     def sub_tasks(self) -> list[Task]:
+         return self.tasks
+
+     def run_internal(self, **kwargs) -> TaskReturn:
+         ret = TaskReturn()
+         for task in self.tasks:
+             task_ret = task.execute(**kwargs)
+             if not task_ret.success and task.abort_on_fail():
+                 self.logger.warning(
+                     f"Task {task.task_name()} failed. Aborting execution."
+                 )
+                 return task_ret
+             ret = ret + task_ret
+         return ret
+
+     def abort_on_fail(self):
+         for task in self.tasks:
+             if task.abort_on_fail():
+                 return True
+
+     def task_name(self) -> str:
+         return self.name
+
+     def __repr__(self):
+         return f"TaskGroup({self.task_name()})"
+
+
+ class ParallelTaskGroup(TaskGroup):
+     """
+     Task group for parallel execution of jobs.
+
+     This class uses a ThreadPoolExecutor to run the provided tasks' :py:func:`~run_internal` functions in parallel.
+     Care should be taken that the Tasks can operate without blocking or locking each other.
+     """
+
+     def __init__(self, context, tasks: list[Task], name: str):
+         """
+         Construct a ParallelTaskGroup object.
+
+         Args:
+             context: :py:class:`etl_lib.core.ETLContext.ETLContext` instance.
+             tasks: a list of `Task` instances.
+                 These will be executed in parallel when :py:func:`~run_internal` is called.
+                 The entries may themselves be other TaskGroups.
+             name: short name of the TaskGroup.
+         """
+         super().__init__(context, tasks, name)
+
+     def run_internal(self, **kwargs) -> TaskReturn:
+         combined_result = TaskReturn()
+
+         with ThreadPoolExecutor() as executor:
+             future_to_task = {
+                 executor.submit(task.execute, **kwargs): task for task in self.tasks
+             }
+
+             for future in as_completed(future_to_task):
+                 task = future_to_task[future]
+                 try:
+                     result = future.result()
+                     combined_result += result
+
+                     # If a task fails and it has abort_on_fail set, stop further execution
+                     if not result.success and task.abort_on_fail():
+                         self.logger.warning(
+                             f"Task {task.task_name()} failed. Aborting execution of TaskGroup {self.task_name()}."
+                         )
+                         # Cancel any pending tasks
+                         for f in future_to_task:
+                             if not f.done():
+                                 f.cancel()
+                         return combined_result
+
+                 except Exception as e:
+                     self.logger.error(
+                         f"Task {task.task_name()} encountered an error: {str(e)}"
+                     )
+                     error_result = TaskReturn(success=False, summery={}, error=str(e))
+                     combined_result += error_result
+
+                     # Handle abort logic for unexpected exceptions
+                     if task.abort_on_fail():
+                         self.logger.warning(
+                             f"Unexpected failure in {task.task_name()}. Aborting execution of TaskGroup {self.task_name()}."
+                         )
+                         # Cancel any pending tasks
+                         for f in future_to_task:
+                             if not f.done():
+                                 f.cancel()
+                         return combined_result
+
+         return combined_result
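Usage sketch (not part of the package): a minimal Task subclass composed into a TaskGroup. DummyReporter and DummyContext below are hypothetical stand-ins for etl_lib.core.ETLContext.ETLContext and its ProgressReporter, which are not shown in this diff.

from etl_lib.core.Task import Task, TaskGroup, TaskReturn

class DummyReporter:
    # Stand-in reporter: accepts the same calls Task.execute makes.
    def started_task(self, task):
        print(f"started {task.task_name()}")

    def finished_task(self, task, success, summery, error):
        print(f"finished {task.task_name()}: {success=} {summery=}")

class DummyContext:
    reporter = DummyReporter()

class CountRowsTask(Task):
    def run_internal(self, **kwargs) -> TaskReturn:
        return TaskReturn(success=True, summery={"rows_processed": 10})

context = DummyContext()
group = TaskGroup(context, [CountRowsTask(context), CountRowsTask(context)], name="demo-group")
print(group.execute())  # summery values of both tasks are added: {'rows_processed': 20}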
@@ -0,0 +1,74 @@
+ import json
+ from pathlib import Path
+ from typing import Type, Generator
+
+ from pydantic import BaseModel, ValidationError
+
+ from etl_lib.core.BatchProcessor import BatchProcessor, BatchResults
+ from etl_lib.core.ETLContext import ETLContext
+ from etl_lib.core.Task import Task
+ from etl_lib.core.utils import merge_summery
+
+
+ class ValidationBatchProcessor(BatchProcessor):
+     """
+     Batch processor for validation, using Pydantic.
+     """
+
+     def __init__(self, context: ETLContext, task: Task, predecessor, model: Type[BaseModel], error_file: Path):
+         """
+         Constructs a new ValidationBatchProcessor.
+
+         The :py:class:`etl_lib.core.BatchProcessor.BatchResults` returned from the :py:func:`~get_batch` of this
+         implementation will contain the following additional entries:
+
+         - `valid_rows`: Number of valid rows.
+         - `invalid_rows`: Number of invalid rows.
+
+         Args:
+             context: :py:class:`etl_lib.core.ETLContext.ETLContext` instance.
+             task: :py:class:`etl_lib.core.Task.Task` instance owning this BatchProcessor.
+             predecessor: BatchProcessor whose :py:func:`~get_batch` function will be called to receive batches to process.
+             model: Pydantic model class used to validate each row in the batch.
+             error_file: Path to the file that will receive each row that did not pass validation.
+                 Each row in this file will contain the original data together with all validation errors for this row.
+         """
+         super().__init__(context, task, predecessor)
+         self.error_file = error_file
+         self.model = model
+
+     def get_batch(self, max_batch__size: int) -> Generator[BatchResults, None, None]:
+         assert self.predecessor is not None
+
+         for batch in self.predecessor.get_batch(max_batch__size):
+             valid_rows = []
+             invalid_rows = []
+
+             for row in batch.chunk:
+                 try:
+                     # Validate and transform the row
+                     validated_row = self.model(**row).model_dump()
+                     valid_rows.append(validated_row)
+                 except ValidationError as e:
+                     # Collect invalid rows with errors
+                     invalid_rows.append({"row": row, "errors": e.errors()})
+
+             # Write invalid rows to the error file
+             if invalid_rows:
+                 with open(self.error_file, "a") as f:
+                     for invalid in invalid_rows:
+                         # the following is needed as ValueError (contained in 'ctx') is not JSON serializable
+                         serializable = {"row": invalid["row"],
+                                         "errors": [{k: v for k, v in e.items() if k != "ctx"} for e in
+                                                    invalid["errors"]]}
+                         f.write(f"{json.dumps(serializable)}\n")
+
+             # Yield BatchResults with statistics
+             yield BatchResults(
+                 chunk=valid_rows,
+                 statistics=merge_summery(batch.statistics, {
+                     "valid_rows": len(valid_rows),
+                     "invalid_rows": len(invalid_rows)
+                 }),
+                 batch_size=len(batch.chunk)
+             )
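To illustrate the per-row validation step above: the processor calls model(**row).model_dump() for every row and collects pydantic ValidationErrors. A minimal, self-contained sketch with a hypothetical Person model (assuming Pydantic v2, which provides model_dump):

from pydantic import BaseModel, ValidationError

class Person(BaseModel):
    name: str
    age: int

print(Person(name="Ada", age="36").model_dump())  # {'name': 'Ada', 'age': 36}, "36" is coerced to int
try:
    Person(name="Bob", age="not-a-number")
except ValidationError as e:
    print(e.errors()[0]["type"])                   # e.g. 'int_parsing'; the 'ctx' entry is what gets stripped above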
etl_lib/core/utils.py ADDED
@@ -0,0 +1,7 @@
+ def merge_summery(summery_1: dict, summery_2: dict) -> dict:
+     """
+     Helper function to merge dicts, assuming that the values are numbers.
+     If a key exists in both dicts, the result will contain that key with the two values added.
+     """
+     return {i: summery_1.get(i, 0) + summery_2.get(i, 0)
+             for i in set(summery_1).union(summery_2)}
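For example, merging statistics from two batches:

from etl_lib.core.utils import merge_summery

print(merge_summery({"rows": 10, "nodes_created": 3}, {"rows": 5}))
# {'rows': 15, 'nodes_created': 3} (key order may vary)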
@@ -0,0 +1,35 @@
+ from typing import Generator
+
+ from etl_lib.core.ETLContext import ETLContext
+ from etl_lib.core.BatchProcessor import BatchProcessor, BatchResults, append_result
+ from etl_lib.core.Task import Task
+
+
+ class CypherBatchProcessor(BatchProcessor):
+     """
+     BatchProcessor to write batches of data to a Neo4j database.
+     """
+
+     def __init__(self, context: ETLContext, task: Task, predecessor: BatchProcessor, query: str):
+         """
+         Constructs a new CypherBatchProcessor.
+
+         Args:
+             context: :py:class:`etl_lib.core.ETLContext.ETLContext` instance.
+             task: :py:class:`etl_lib.core.Task.Task` instance owning this BatchProcessor.
+             predecessor: BatchProcessor whose :py:func:`~get_batch` function will be called to receive batches to process.
+             query: Cypher query used to write each batch to Neo4j.
+                 Data will be passed as the `batch` parameter.
+                 Therefore, the query should start with `UNWIND $batch AS row`.
+         """
+         super().__init__(context, task, predecessor)
+         self.query = query
+         self.neo4j = context.neo4j
+
+     def get_batch(self, batch_size: int) -> Generator[BatchResults, None, None]:
+         assert self.predecessor is not None
+
+         with self.neo4j.session() as session:
+             for batch_result in self.predecessor.get_batch(batch_size):
+                 result = self.neo4j.query_database(session=session, query=self.query, batch=batch_result.chunk)
+                 yield append_result(batch_result, result.summery)
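A hypothetical query of the expected shape: the batch is passed as the $batch parameter, so the statement unwinds it row by row (the label and properties below are made up for illustration).

query = """
UNWIND $batch AS row
MERGE (p:Person {id: row.id})
SET p.name = row.name
"""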
@@ -0,0 +1,90 @@
+ import csv
+ import gzip
+ from pathlib import Path
+ from typing import Generator
+
+ from etl_lib.core.BatchProcessor import BatchProcessor, BatchResults
+ from etl_lib.core.ETLContext import ETLContext
+ from etl_lib.core.Task import Task
+
+
+ class CSVBatchProcessor(BatchProcessor):
+     """
+     BatchProcessor that reads a CSV file using the `csv` package.
+
+     The file can optionally be gzipped.
+     The returned batch of rows will have an additional `_row` column, containing the source row number of the data,
+     starting with 0.
+     """
+
+     def __init__(self, csv_file: Path, context: ETLContext, task: Task, **kwargs):
+         """
+         Constructs a new CSVBatchProcessor.
+
+         Args:
+             csv_file: Path to the CSV file.
+             context: :py:class:`etl_lib.core.ETLContext.ETLContext` instance.
+             kwargs: Will be passed on to the `csv.DictReader`, providing a way to customise the reading for different
+                 CSV formats.
+         """
+         super().__init__(context, task)
+         self.csv_file = csv_file
+         self.kwargs = kwargs
+
+     def get_batch(self, max_batch__size: int) -> Generator[BatchResults, None, None]:
+         for batch_size, chunks_ in self.read_csv(self.csv_file, batch_size=max_batch__size, **self.kwargs):
+             yield BatchResults(chunk=chunks_, statistics={"csv_lines_read": batch_size}, batch_size=batch_size)
+
+     def read_csv(self, file: Path, batch_size: int, **kwargs):
+         if file.suffix == ".gz":
+             with gzip.open(file, "rt", encoding='utf-8-sig') as f:
+                 yield from self.__parse_csv(batch_size, file=f, **kwargs)
+         else:
+             with open(file, "rt", encoding='utf-8-sig') as f:
+                 yield from self.__parse_csv(batch_size, file=f, **kwargs)
+
+     def __parse_csv(self, batch_size, file, **kwargs):
+         csv_file = csv.DictReader(file, **kwargs)
+         yield from self.__split_to_batches(csv_file, batch_size)
+
+     def __split_to_batches(self, source, batch_size):
+         """
+         Splits the provided source into batches.
+
+         Args:
+             source: Anything that can be looped over; ideally this should be a generator.
+             batch_size: desired batch size
+
+         Returns:
+             Generator object to loop over the batches. Each batch is a list.
+         """
+         cnt = 0
+         batch_ = []
+         for i in source:
+             i["_row"] = cnt
+             cnt += 1
+             batch_.append(self.__clean_dict(i))
+             if len(batch_) == batch_size:
+                 yield len(batch_), batch_
+                 batch_ = []
+         if len(batch_) > 0:
+             yield len(batch_), batch_
+
+     def __clean_dict(self, input_dict):
+         """
+         Needed in Python versions < 3.13.
+         Cleans the dictionary as follows:
+         - Values that are empty (or whitespace-only) strings are replaced with None.
+         - Entries whose key is None are removed.
+
+         Args:
+             input_dict (dict): The dictionary to clean.
+
+         Returns:
+             dict: A cleaned dictionary.
+         """
+         return {
+             k: (None if isinstance(v, str) and v.strip() == "" else v)
+             for k, v in input_dict.items()
+             if k is not None
+         }
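A standalone illustration (not using etl_lib) of the same batching idea: read the rows with csv.DictReader, tag each with its source index as `_row`, and yield lists of at most batch_size rows.

import csv
import io

def split_to_batches(rows, batch_size):
    # Collect rows into fixed-size batches, numbering each row from 0.
    batch = []
    for idx, row in enumerate(rows):
        row["_row"] = idx
        batch.append(row)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch

data = io.StringIO("name,age\nAda,36\nBob,41\nEve,29\n")
for batch in split_to_batches(csv.DictReader(data), batch_size=2):
    print(batch)
# [{'name': 'Ada', 'age': '36', '_row': 0}, {'name': 'Bob', 'age': '41', '_row': 1}]
# [{'name': 'Eve', 'age': '29', '_row': 2}]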
@@ -0,0 +1,29 @@
+ import abc
+
+ from etl_lib.core.ETLContext import ETLContext
+ from etl_lib.core.Task import Task, TaskReturn
+ from etl_lib.core.utils import merge_summery
+
+
+ class ExecuteCypherTask(Task):
+
+     def __init__(self, context: ETLContext):
+         super().__init__(context)
+         self.context = context
+
+     def run_internal(self, **kwargs) -> TaskReturn:
+         with self.context.neo4j.session() as session:
+
+             if isinstance(self._query(), list):
+                 stats = {}
+                 for query in self._query():
+                     result = self.context.neo4j.query_database(session=session, query=query, **kwargs)
+                     stats = merge_summery(stats, result.summery)
+                 return TaskReturn(True, stats)
+             else:
+                 result = self.context.neo4j.query_database(session=session, query=self._query(), **kwargs)
+                 return TaskReturn(True, result.summery)
+
+     @abc.abstractmethod
+     def _query(self) -> str | list[str]:
+         pass
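A hypothetical subclass: _query may return a single Cypher string or a list of statements, which are run in order with their counters merged into one summery dict. The import path is an assumption; adjust it to wherever ExecuteCypherTask lives in the package.

from etl_lib.core.ExecuteCypherTask import ExecuteCypherTask  # import path assumed

class CreateIndexesTask(ExecuteCypherTask):
    def _query(self) -> list[str]:
        # Two statements, executed one after the other in the same session.
        return [
            "CREATE INDEX person_id IF NOT EXISTS FOR (p:Person) ON (p.id)",
            "CREATE INDEX city_name IF NOT EXISTS FOR (c:City) ON (c.name)",
        ]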
@@ -0,0 +1,44 @@
+ from etl_lib.core.Task import Task, TaskReturn
+
+ def transform_dict(input_dict):
+     """
+     Recursively transforms the input dictionary, converting any list values to string representations.
+
+     Helpful to transform a GDS call result into a storable representation.
+     :param input_dict: The input dictionary with values that can be of any type.
+
+     Returns:
+         dict: A new dictionary with transformed values.
+     """
+     def transform_value(value):
+         if isinstance(value, dict):
+             return {k: transform_value(v) for k, v in value.items()}
+         elif isinstance(value, list):
+             return str(value)
+         else:
+             return value
+
+     return {key: transform_value(value) for key, value in input_dict.items()}
+
+
+ class GDSTask(Task):
+
+     def __init__(self, context, func):
+         """
+         Task that runs a function using the GDS client to perform its work. See the following example:
+
+             def gds_fun(etl_context):
+                 with etl_context.neo4j.gds() as gds:
+                     gds.graph.drop("neo4j-offices", failIfMissing=False)
+                     g_office, project_result = gds.graph.project("neo4j-offices", "City", "FLY_TO")
+                     mutate_result = gds.pageRank.mutate(g_office, tolerance=0.5, mutateProperty="rank")
+                     return TaskReturn(success=True, summery=transform_dict(mutate_result.to_dict()))
+
+         :param context: The ETLContext to use. Provides the GDS client to `func` via `etl_context.neo4j.gds()`.
+         :param func: A function that expects a parameter `etl_context` and returns a `TaskReturn` object.
+         """
+         super().__init__(context)
+         self.func = func
+
+     def run_internal(self, **kwargs) -> TaskReturn:
+         return self.func(etl_context=self.context, **kwargs)
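For example, transform_dict keeps scalar values, walks nested dicts recursively, and turns lists into strings, so a GDS result can be stored as summery statistics (the result dict below is made up):

# assuming transform_dict from the module above is in scope
result = {"ranIterations": 20, "configuration": {"tolerance": 0.5, "relationshipTypes": ["FLY_TO"]}}
print(transform_dict(result))
# {'ranIterations': 20, 'configuration': {'tolerance': 0.5, 'relationshipTypes': "['FLY_TO']"}}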
@@ -0,0 +1,41 @@
+ import abc
+ import logging
+ from pathlib import Path
+ from typing import Type
+
+ from pydantic import BaseModel
+
+ from etl_lib.core.ETLContext import ETLContext
+ from etl_lib.core.ClosedLoopBatchProcessor import ClosedLoopBatchProcessor
+ from etl_lib.core.Task import Task, TaskReturn
+ from etl_lib.core.ValidationBatchProcessor import ValidationBatchProcessor
+ from etl_lib.data_sink.CypherBatchProcessor import CypherBatchProcessor
+ from etl_lib.data_source.CSVBatchProcessor import CSVBatchProcessor
+
+
+ class CSVLoad2Neo4jTasks(Task):
+
+     def __init__(self, context: ETLContext, model: Type[BaseModel], file: Path, batch_size: int = 5000):
+         super().__init__(context)
+         self.batch_size = batch_size
+         self.model = model
+         self.logger = logging.getLogger(self.__class__.__name__)
+         self.file = file
+
+     def run_internal(self, **kwargs) -> TaskReturn:
+         error_file = self.file.with_suffix(".error.json")
+
+         csv = CSVBatchProcessor(self.file, self.context, self)
+         validator = ValidationBatchProcessor(self.context, self, csv, self.model, error_file)
+         cypher = CypherBatchProcessor(self.context, self, validator, self._query())
+         end = ClosedLoopBatchProcessor(self.context, self, cypher)
+         result = next(end.get_batch(self.batch_size))
+
+         return TaskReturn(True, result.statistics)
+
+     def __repr__(self):
+         return f"{self.__class__.__name__}({self.file})"
+
+     @abc.abstractmethod
+     def _query(self):
+         pass
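A hypothetical subclass wiring a CSV file through validation into a Cypher MERGE. The Person model, the file name, and the import path of CSVLoad2Neo4jTasks are assumptions for illustration.

from pathlib import Path

from pydantic import BaseModel

from etl_lib.core.CSVLoad2Neo4jTasks import CSVLoad2Neo4jTasks  # import path assumed

class Person(BaseModel):
    id: int
    name: str

class LoadPersonsTask(CSVLoad2Neo4jTasks):
    def __init__(self, context):
        # Rows from persons.csv are validated against Person before being written.
        super().__init__(context, Person, Path("persons.csv"))

    def _query(self):
        # Each validated batch is passed as $batch to this statement.
        return """
        UNWIND $batch AS row
        MERGE (p:Person {id: row.id})
        SET p.name = row.name
        """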