neo4j-etl-lib 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,268 @@
1
+ import logging
2
+ from typing import Any, Callable, Dict, Generator, List, Tuple
3
+
4
+ from etl_lib.core.BatchProcessor import BatchProcessor, BatchResults
5
+ from etl_lib.core.utils import merge_summery
6
+ from tabulate import tabulate
7
+
8
+
9
+ def tuple_id_extractor(table_size: int = 10) -> Callable[[Tuple[str | int, str | int]], Tuple[int, int]]:
10
+ """
11
+ Create an ID extractor function for tuple items, using the last decimal digit of each element.
12
+ The output is a `(row, col)` tuple within a `table_size x table_size` grid (default 10x10).
13
+
14
+ Args:
15
+ table_size: The dimension of the grid (number of rows/cols). Defaults to 10.
16
+
17
+ Returns:
18
+ A callable that maps a tuple `(a, b)` to a tuple `(row, col)` using the last digit of `a` and `b`.
19
+ """
20
+
21
+ def extractor(item: Tuple[Any, Any]) -> Tuple[int, int]:
22
+ a, b = item
23
+ try:
24
+ row = int(str(a)[-1])
25
+ col = int(str(b)[-1])
26
+ except Exception as e:
27
+ raise ValueError(f"Failed to extract ID from item {item}: {e}")
28
+ return row, col
29
+
30
+ extractor.table_size = table_size
31
+ return extractor
32
+
33
+
34
+ def dict_id_extractor(
35
+ table_size: int = 10,
36
+ start_key: str = "start",
37
+ end_key: str = "end",
38
+ ) -> Callable[[Dict[str, Any]], Tuple[int, int]]:
39
+ """
40
+ Build an ID extractor for dict rows. The extractor reads two fields (configurable via
41
+ `start_key` and `end_key`) and returns (row, col) based on the last decimal digit of each.
42
+ Range validation remains the responsibility of the SplittingBatchProcessor.
43
+
44
+ Args:
45
+ table_size: Informational hint carried on the extractor; used by callers to sanity-check.
46
+ start_key: Field name for the start node identifier.
47
+ end_key: Field name for the end node identifier.
48
+
49
+ Returns:
50
+ Callable[[Mapping[str, Any]], tuple[int, int]]: Maps {start_key, end_key} → (row, col).
51
+ """
52
+
53
+ def extractor(item: Dict[str, Any]) -> Tuple[int, int]:
54
+ missing = [k for k in (start_key, end_key) if k not in item]
55
+ if missing:
56
+ raise KeyError(f"Item missing required keys: {', '.join(missing)}")
57
+ try:
58
+ row = int(str(item[start_key])[-1])
59
+ col = int(str(item[end_key])[-1])
60
+ except Exception as e:
61
+ raise ValueError(f"Failed to extract ID from item {item}: {e}")
62
+ return row, col
63
+
64
+ extractor.table_size = table_size
65
+ return extractor
66
+
67
+
68
+ class SplittingBatchProcessor(BatchProcessor):
69
+ """
70
+ BatchProcessor that splits incoming BatchResults into non-overlapping partitions based
71
+ on row/col indices extracted by the id_extractor, and emits full or remaining batches
72
+ using the mix-and-batch algorithm from https://neo4j.com/blog/developer/mix-and-batch-relationship-load/
73
+ Each emitted batch is a list of per-cell lists (array of arrays), so callers
74
+ can process each partition (another name for a cell here) in parallel.
75
+
76
+ A batch for a schedule group is emitted when all cells in that group have at least `batch_size` items.
77
+ In addition, when a cell/partition reaches 3x the configured max_batch_size, the group is emitted to avoid
78
+ overflowing the buffer when the distribution per cell is uneven.
79
+ Leftovers are flushed after source exhaustion.
80
+ Emitted batches never exceed the configured max_batch_size.
81
+ """
82
+
83
+ def __init__(self, context, table_size: int, id_extractor: Callable[[Any], Tuple[int, int]],
84
+ task=None, predecessor=None):
85
+ super().__init__(context, task, predecessor)
86
+
87
+ # If the extractor carries an expected table size, use or validate it
88
+ if hasattr(id_extractor, "table_size"):
89
+ expected_size = id_extractor.table_size
90
+ if table_size is None:
91
+ table_size = expected_size # determine table size from extractor if not provided
92
+ elif table_size != expected_size:
93
+ raise ValueError(f"Mismatch between provided table_size ({table_size}) and "
94
+ f"id_extractor table_size ({expected_size}).")
95
+ elif table_size is None:
96
+ raise ValueError("table_size must be specified if id_extractor has no defined table_size")
97
+ self.table_size = table_size
98
+ self._id_extractor = id_extractor
99
+
100
+ # Initialize 2D buffer for partitions
101
+ self.buffer: Dict[int, Dict[int, List[Any]]] = {
102
+ i: {j: [] for j in range(self.table_size)} for i in range(self.table_size)
103
+ }
104
+ self.logger = logging.getLogger(f"{self.__class__.__module__}.{self.__class__.__name__}")
105
+
106
+ def _generate_batch_schedule(self) -> List[List[Tuple[int, int]]]:
107
+ """
108
+ Create diagonal stripes (row, col) partitions to ensure no overlapping IDs
109
+ across emitted batches.
110
+ Example grid:
111
+ || 0 | 1 | 2
112
+ =====++=====+=====+=====
113
+ 0 || 0 | 1 | 2
114
+ -----++-----+-----+-----
115
+ 1 || 2 | 0 | 1
116
+ -----++-----+-----+-----
117
+ 2 || 1 | 2 | 0
118
+
119
+ would return [[(0, 0), (1, 1), (2, 2)], [(0, 1), (1, 2), (2, 0)], [(0, 2), (1, 0), (2, 1)]]
120
+ """
121
+ schedule: List[List[Tuple[int, int]]] = []
122
+ for shift in range(self.table_size):
123
+ partition: List[Tuple[int, int]] = [
124
+ (i, (i + shift) % self.table_size)
125
+ for i in range(self.table_size)
126
+ ]
127
+ schedule.append(partition)
128
+ return schedule
129
+
130
+ def _flush_group(
131
+ self,
132
+ partitions: List[Tuple[int, int]],
133
+ batch_size: int,
134
+ statistics: Dict[str, Any] | None = None,
135
+ ) -> Generator[BatchResults, None, None]:
136
+ """
137
+ Extract up to `batch_size` items from each cell in `partitions`, remove them from the buffer,
138
+ and yield a BatchResults whose chunks is a list of per-cell lists from these partitions.
139
+
140
+ Args:
141
+ partitions: Cell coordinates forming a diagonal group from the schedule.
142
+ batch_size: Max number of items to take from each cell.
143
+ statistics: Stats dict to attach to this emission (use {} for interim waves).
144
+ The "final" emission will pass the accumulated stats here.
145
+
146
+ Notes:
147
+ - Debug-only: logs a 2D matrix of cell sizes when the logger is enabled for DEBUG.
148
+ - batch_size in the returned BatchResults equals the number of emitted items.
149
+ """
150
+ self._log_buffer_matrix(partition=partitions)
151
+
152
+ cell_chunks: List[List[Any]] = []
153
+ for row, col in partitions:
154
+ q = self.buffer[row][col]
155
+ take = min(batch_size, len(q))
156
+ part = q[:take]
157
+ cell_chunks.append(part)
158
+ # remove flushed items
159
+ self.buffer[row][col] = q[take:]
160
+
161
+ emitted = sum(len(c) for c in cell_chunks)
162
+
163
+ result = BatchResults(
164
+ chunk=cell_chunks,
165
+ statistics=statistics or {},
166
+ batch_size=emitted,
167
+ )
168
+ yield result
169
+
170
+ def get_batch(self, max_batch__size: int) -> Generator[BatchResults, None, None]:
171
+ """
172
+ Consume upstream batches, split data across cells, and emit diagonal partitions:
173
+ - During streaming: emit a full partition when all its cells hold >= max_batch__size items.
174
+ - Also during streaming: if any cell in a partition builds up beyond a 'burst' threshold
175
+ (3 * max_batch__size), emit that partition early, taking up to max_batch__size
176
+ from each cell.
177
+ - After source exhaustion: flush leftovers in waves capped at max_batch__size per cell.
178
+
179
+ Statistics policy:
180
+ * Every emission except the last carries {}.
181
+ * The last emission carries the accumulated upstream stats (unfiltered).
182
+ """
183
+ schedule = self._generate_batch_schedule()
184
+
185
+ accum_stats: Dict[str, Any] = {}
186
+ pending: BatchResults | None = None # hold back the most recent emission so we know what's final
187
+
188
+ burst_threshold = 3 * max_batch__size
189
+
190
+ for upstream in self.predecessor.get_batch(max_batch__size):
191
+ # accumulate upstream stats (unfiltered)
192
+ if upstream.statistics:
193
+ accum_stats = merge_summery(accum_stats, upstream.statistics)
194
+
195
+ # add data to cells
196
+ for item in upstream.chunk:
197
+ r, c = self._id_extractor(item)
198
+ if not (0 <= r < self.table_size and 0 <= c < self.table_size):
199
+ raise ValueError(f"partition id out of range: {(r, c)} for table_size={self.table_size}")
200
+ self.buffer[r][c].append(item)
201
+
202
+ # process partitions
203
+ for partition in schedule:
204
+ # normal full flush when all cells are ready
205
+ if all(len(self.buffer[r][c]) >= max_batch__size for r, c in partition):
206
+ br = next(self._flush_group(partition, max_batch__size, statistics={}))
207
+ if pending is not None:
208
+ yield pending
209
+ pending = br
210
+ continue
211
+
212
+ # burst flush if any cell backlog explodes
213
+ hot_cells = [(r, c, len(self.buffer[r][c])) for r, c in partition if
214
+ len(self.buffer[r][c]) >= burst_threshold]
215
+ if hot_cells:
216
+ top_r, top_c, top_len = max(hot_cells, key=lambda x: x[2])
217
+ self.logger.debug(
218
+ "burst flush: partition=%s threshold=%d top_cell=(%d,%d len=%d)",
219
+ partition, burst_threshold, top_r, top_c, top_len
220
+ )
221
+ br = next(self._flush_group(partition, max_batch__size, statistics={}))
222
+ if pending is not None:
223
+ yield pending
224
+ pending = br
225
+
226
+ # source exhausted: drain leftovers in capped waves (respecting batch size)
227
+ self.logger.debug("start flushing leftovers")
228
+ for partition in (p for p in schedule if any(self.buffer[r][c] for r, c in p)):
229
+ while any(self.buffer[r][c] for r, c in partition):
230
+ br = next(self._flush_group(partition, max_batch__size, statistics={}))
231
+ if pending is not None:
232
+ yield pending
233
+ pending = br
234
+
235
+ # final emission carries accumulated stats once
236
+ if pending is not None:
237
+ yield BatchResults(chunk=pending.chunk, statistics=accum_stats, batch_size=pending.batch_size)
238
+
239
+ def _log_buffer_matrix(self, *, partition: List[Tuple[int, int]]) -> None:
240
+ """
241
+ Dumps a compact 2D matrix of per-cell sizes (len of each buffer) when DEBUG is enabled.
242
+ """
243
+ if not self.logger.isEnabledFor(logging.DEBUG):
244
+ return
245
+
246
+ counts = [
247
+ [len(self.buffer[r][c]) for c in range(self.table_size)]
248
+ for r in range(self.table_size)
249
+ ]
250
+ marks = set(partition)
251
+
252
+ pad = max(2, len(str(self.table_size - 1)))
253
+ col_headers = [f"c{c:0{pad}d}" for c in range(self.table_size)]
254
+
255
+ rows = []
256
+ for r in range(self.table_size):
257
+ row_label = f"r{r:0{pad}d}"
258
+ row_vals = [f"[{v}]" if (r, c) in marks else f"{v}" for c, v in enumerate(counts[r])]
259
+ rows.append([row_label, *row_vals])
260
+
261
+ table = tabulate(
262
+ rows,
263
+ headers=["", *col_headers],
264
+ tablefmt="psql",
265
+ stralign="right",
266
+ disable_numparse=True,
267
+ )
268
+ self.logger.debug("buffer matrix:\n%s", table)
etl_lib/core/Task.py CHANGED
@@ -7,7 +7,7 @@ from datetime import datetime
7
7
 
8
8
  class TaskReturn:
9
9
  """
10
- Return object for the :py:func:`~Task.execute` function, transporting result information.
10
+ Return object for the :func:`~Task.execute` function, transporting result information.
11
11
  """
12
12
 
13
13
  success: bool
@@ -59,7 +59,7 @@ class Task:
59
59
  ETL job that can be executed.
60
60
 
61
61
  Provides reporting, time tracking and error handling.
62
- Implementations must provide the :py:func:`~run_internal` function.
62
+ Implementations must provide the :func:`~run_internal` function.
63
63
  """
64
64
 
65
65
  def __init__(self, context):
@@ -67,16 +67,17 @@ class Task:
67
67
  Construct a Task object.
68
68
 
69
69
  Args:
70
- context: :py:class:`etl_lib.core.ETLContext.ETLContext` instance. Will be available to subclasses.
70
+ context: :class:`~etl_lib.core.ETLContext.ETLContext` instance. Will be available to subclasses.
71
71
  """
72
72
  self.context = context
73
- self.logger = logging.getLogger(self.__class__.__name__)
73
+ """:class:`~etl_lib.core.ETLContext.ETLContext` giving access to various resources."""
74
+ self.logger = logging.getLogger(f"{self.__class__.__module__}.{self.__class__.__name__}")
74
75
  self.uuid = str(uuid.uuid4())
75
76
  """Uniquely identifies a Task."""
76
77
  self.start_time: datetime
77
- """Time when the :py:func:`~execute` was called., `None` before."""
78
+ """Time when the :func:`~execute` was called., `None` before."""
78
79
  self.end_time: datetime
79
- """Time when the :py:func:`~execute` has finished., `None` before."""
80
+ """Time when the :func:`~execute` has finished., `None` before."""
80
81
  self.success: bool
81
82
  """True if the task has finished successful. False otherwise, `None` before the task has finished."""
82
83
  self.depth: int = 0
@@ -87,8 +88,9 @@ class Task:
87
88
  Executes the task.
88
89
 
89
90
  Implementations of this Interface should not overwrite this method, but provide the
90
- Task functionality inside :py:func:`~run_internal` which will be called from here.
91
- Will use the :py:class:`ProgressReporter` from the :py:attr:`~context` to report status updates.
91
+ Task functionality inside :func:`~run_internal` which will be called from here.
92
+ Will use the :class:`~etl_lib.core.ProgressReporter.ProgressReporter` from
93
+ :attr:`~etl_lib.core.Task.Task.context` to report status updates.
92
94
 
93
95
  Args:
94
96
  kwargs: will be passed to `run_internal`
@@ -34,6 +34,8 @@ class ValidationBatchProcessor(BatchProcessor):
34
34
  Each row in this file will contain the original data together with all validation errors for this row.
35
35
  """
36
36
  super().__init__(context, task, predecessor)
37
+ if model is not None and error_file is None:
38
+ raise ValueError('an error_file must be provided when a model is specified')
37
39
  self.error_file = error_file
38
40
  self.model = model
39
41
 
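A short sketch of the Task contract described above (hypothetical subclass and summary key; the `TaskReturn(success=..., summery=...)` call follows the docstring example shown further below, and `execute()` wraps `run_internal()` with timing, logging and reporting):

    from pathlib import Path

    from etl_lib.core.Task import Task, TaskReturn


    class TouchFileTask(Task):
        """Hypothetical example task: creates an empty marker file."""

        def __init__(self, context, path: Path):
            super().__init__(context)
            self.path = path

        def run_internal(self, **kwargs) -> TaskReturn:
            # Only the actual work lives here; execute() adds timing,
            # the module-qualified logger and progress reporting around it.
            self.path.touch()
            return TaskReturn(success=True, summery={"files_created": 1})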
etl_lib/core/utils.py CHANGED
@@ -1,4 +1,6 @@
1
1
  import logging
2
+ import os
3
+ import signal
2
4
 
3
5
 
4
6
  def merge_summery(summery_1: dict, summery_2: dict) -> dict:
@@ -12,17 +14,56 @@ def merge_summery(summery_1: dict, summery_2: dict) -> dict:
12
14
 
13
15
  def setup_logging(log_file=None):
14
16
  """
15
- Set up logging to console and optionally to a log file.
16
-
17
- :param log_file: Path to the log file
18
- :type log_file: str, optional
17
+ Set up logging to the console and, optionally, to `log_file`. The root logger is configured at INFO level.
18
+ The level of the library's own `etl_lib` logger can be set separately via the ETL_LIB_LOG_LEVEL
19
+ environment variable; it also defaults to INFO.
19
20
  """
20
- handlers = [logging.StreamHandler()]
21
+ fmt = '%(asctime)s - %(levelname)s - %(name)s - [%(threadName)s] - %(message)s'
22
+ formatter = logging.Formatter(fmt)
23
+
24
+ root_handlers = [logging.StreamHandler()]
25
+ if log_file:
26
+ root_handlers.append(logging.FileHandler(log_file))
27
+ for h in root_handlers:
28
+ h.setLevel(logging.INFO)
29
+ h.setFormatter(formatter)
30
+ logging.basicConfig(level=logging.INFO, handlers=root_handlers, force=True)
31
+
32
+ raw = os.getenv("ETL_LIB_LOG_LEVEL", "INFO")
33
+ try:
34
+ etl_level = int(raw) if str(raw).isdigit() else getattr(logging, str(raw).upper())
35
+ except Exception:
36
+ etl_level = logging.DEBUG
37
+
38
+ etl_logger = logging.getLogger('etl_lib')
39
+ etl_logger.setLevel(etl_level)
40
+ etl_logger.propagate = False
41
+ etl_logger.handlers.clear()
42
+
43
+ dbg_console = logging.StreamHandler()
44
+ dbg_console.setLevel(logging.NOTSET)
45
+ dbg_console.setFormatter(formatter)
46
+ etl_logger.addHandler(dbg_console)
47
+
21
48
  if log_file:
22
- handlers.append(logging.FileHandler(log_file))
49
+ dbg_file = logging.FileHandler(log_file)
50
+ dbg_file.setLevel(logging.NOTSET)
51
+ dbg_file.setFormatter(formatter)
52
+ etl_logger.addHandler(dbg_file)
53
+
54
+
55
+ def add_sigint_handler(handler_to_add):
56
+ """
57
+ Register handler_to_add(signum, frame) to run on Ctrl-C,
58
+ chaining any previously registered handler afterward.
59
+ """
60
+ old_handler = signal.getsignal(signal.SIGINT)
61
+
62
+ def chained_handler(signum, frame):
63
+ # first, run the new handler
64
+ handler_to_add(signum, frame)
65
+ # then, if there was an old handler, call it
66
+ if callable(old_handler):
67
+ old_handler(signum, frame)
23
68
 
24
- logging.basicConfig(
25
- level=logging.INFO,
26
- format='%(asctime)s - %(levelname)s - %(message)s',
27
- handlers=handlers
28
- )
69
+ signal.signal(signal.SIGINT, chained_handler)
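A sketch of how the reworked `setup_logging` and the new `add_sigint_handler` are meant to be used (the log-file name and the handler body are placeholders):

    import os

    from etl_lib.core.utils import setup_logging, add_sigint_handler

    # Root logging stays at INFO; the etl_lib logger follows ETL_LIB_LOG_LEVEL.
    os.environ["ETL_LIB_LOG_LEVEL"] = "DEBUG"
    setup_logging(log_file="etl_run.log")

    def on_interrupt(signum, frame):
        # runs first on Ctrl-C, then any previously installed SIGINT handler
        print("interrupted, shutting down")

    add_sigint_handler(on_interrupt)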
@@ -30,7 +30,7 @@ class CSVBatchSource(BatchProcessor):
30
30
  self.csv_file = csv_file
31
31
  self.kwargs = kwargs
32
32
 
33
- def get_batch(self, max_batch__size: int) -> Generator[BatchResults]:
33
+ def get_batch(self, max_batch__size: int) -> Generator[BatchResults, None, None]:
34
34
  for batch_size, chunks_ in self.__read_csv(self.csv_file, batch_size=max_batch__size, **self.kwargs):
35
35
  yield BatchResults(chunk=chunks_, statistics={"csv_lines_read": batch_size}, batch_size=batch_size)
36
36
 
@@ -1,5 +1,10 @@
1
- from typing import Generator, Callable, Optional
1
+ import time
2
+ from typing import Generator, Callable, Optional, List, Dict
3
+
4
+ from psycopg2 import OperationalError as PsycopgOperationalError
2
5
  from sqlalchemy import text
6
+ from sqlalchemy.exc import OperationalError as SAOperationalError, DBAPIError
7
+
3
8
  from etl_lib.core.BatchProcessor import BatchResults, BatchProcessor
4
9
  from etl_lib.core.ETLContext import ETLContext
5
10
  from etl_lib.core.Task import Task
@@ -25,36 +30,85 @@ class SQLBatchSource(BatchProcessor):
25
30
  kwargs: Arguments passed as parameters with the query.
26
31
  """
27
32
  super().__init__(context, task)
28
- self.query = query
33
+ self.query = query.strip().rstrip(";")
29
34
  self.record_transformer = record_transformer
30
- self.kwargs = kwargs # Query parameters
35
+ self.kwargs = kwargs
36
+
37
+ def _fetch_page(self, limit: int, offset: int) -> Optional[List[Dict]]:
38
+ """
39
+ Fetch a single batch of rows using LIMIT/OFFSET, with retry/backoff.
40
+
41
+ Each page is executed in its own transaction. On transient
42
+ disconnects or DB errors, it retries with exponential backoff, making up to 5 attempts in total.
31
43
 
32
- def __read_records(self, conn, batch_size: int):
33
- batch_ = []
34
- result = conn.execute(text(self.query), self.kwargs) # Safe execution with bound parameters
44
+ Args:
45
+ limit: maximum number of rows to return.
46
+ offset: number of rows to skip before starting this page.
47
+
48
+ Returns:
49
+ A list of row dicts (after applying record_transformer, if any),
50
+ or None if no rows are returned.
35
51
 
36
- for row in result.mappings(): # Returns row as dict (like Neo4j's `record.data()`)
37
- data = dict(row) # Convert to dictionary
38
- if self.record_transformer:
39
- data = self.record_transformer(data)
40
- batch_.append(data)
52
+ Raises:
53
+ Exception: re-raises the last caught error on final failure.
54
+ """
55
+ paged_sql = f"{self.query} LIMIT :limit OFFSET :offset"
56
+ params = {**self.kwargs, "limit": limit, "offset": offset}
57
+ max_retries = 5
58
+ backoff = 2.0
41
59
 
42
- if len(batch_) == batch_size:
43
- yield batch_
44
- batch_ = [] # Reset batch
60
+ for attempt in range(1, max_retries + 1):
61
+ try:
62
+ with self.context.sql.engine.connect() as conn:
63
+ with conn.begin():
64
+ rows = conn.execute(text(paged_sql), params).mappings().all()
65
+ result = [
66
+ self.record_transformer(dict(r)) if self.record_transformer else dict(r)
67
+ for r in rows
68
+ ]
69
+ return result if result else None
45
70
 
46
- if batch_:
47
- yield batch_
71
+ except (PsycopgOperationalError, SAOperationalError, DBAPIError) as err:
72
+
73
+ if attempt == max_retries:
74
+ self.logger.error(
75
+ f"Page fetch failed after {max_retries} attempts "
76
+ f"(limit={limit}, offset={offset}): {err}"
77
+ )
78
+ raise
79
+
80
+ self.logger.warning(
81
+ f"Transient DB error on page fetch {attempt}/{max_retries}: {err!r}, "
82
+ f"retrying in {backoff:.1f}s"
83
+ )
84
+ time.sleep(backoff)
85
+ backoff *= 2
86
+
87
+ return None
48
88
 
49
89
  def get_batch(self, max_batch_size: int) -> Generator[BatchResults, None, None]:
50
90
  """
51
- Fetches data in batches using an open transaction, similar to Neo4j's approach.
91
+ Yield successive batches until the query is exhausted.
92
+
93
+ Calls _fetch_page() repeatedly, advancing the offset by the
94
+ number of rows returned. Stops when no more rows are returned.
95
+
96
+ Args:
97
+ max_batch_size: upper limit on rows per batch.
98
+
99
+ Yields:
100
+ BatchResults for each non-empty page.
52
101
  """
53
- with self.context.sql.engine.connect() as conn: # Keep transaction open
54
- with conn.begin(): # Ensures rollback on failure
55
- for chunk in self.__read_records(conn, max_batch_size):
56
- yield BatchResults(
57
- chunk=chunk,
58
- statistics={"sql_rows_read": len(chunk)},
59
- batch_size=len(chunk)
60
- )
102
+ offset = 0
103
+ while True:
104
+ chunk = self._fetch_page(max_batch_size, offset)
105
+ if not chunk:
106
+ break
107
+
108
+ yield BatchResults(
109
+ chunk=chunk,
110
+ statistics={"sql_rows_read": len(chunk)},
111
+ batch_size=len(chunk),
112
+ )
113
+
114
+ offset += len(chunk)
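Because `LIMIT :limit OFFSET :offset` is appended to the configured query and each page runs in its own transaction, queries passed to `SQLBatchSource` should carry a stable `ORDER BY` so retried or later pages neither skip nor repeat rows. The retry loop itself follows this generic shape (illustrative sketch; `fetch_once` is a placeholder for the per-page execution):

    import time

    def fetch_page_with_backoff(fetch_once, max_retries=5, backoff=2.0):
        # Mirrors _fetch_page(): try up to max_retries attempts, doubling the
        # sleep after each transient failure, and re-raise on the final attempt.
        for attempt in range(1, max_retries + 1):
            try:
                return fetch_once()
            except Exception:
                if attempt == max_retries:
                    raise
                time.sleep(backoff)
                backoff *= 2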
etl_lib/task/GDSTask.py CHANGED
@@ -28,11 +28,14 @@ class GDSTask(Task):
28
28
  Function that uses the gds client to perform tasks. See the following example:
29
29
 
30
30
  def gds_fun(etl_context):
31
- with etl_context.neo4j.gds() as gds:
32
- gds.graph.drop("neo4j-offices", failIfMissing=False)
33
- g_office, project_result = gds.graph.project("neo4j-offices", "City", "FLY_TO")
34
- mutate_result = gds.pageRank.mutate(g_office, tolerance=0.5, mutateProperty="rank")
35
- return TaskReturn(success=True, summery=transform_dict(mutate_result.to_dict()))
31
+ gds = etl_context.neo4j.gds
32
+ gds.graph.drop("neo4j-offices", failIfMissing=False)
33
+ g_office, project_result = gds.graph.project("neo4j-offices", "City", "FLY_TO")
34
+ mutate_result = gds.pageRank.write(g_office, tolerance=0.5, writeProperty="rank")
35
+ return TaskReturn(success=True, summery=transform_dict(mutate_result.to_dict()))
36
+
37
+ Notes: Do *NOT* use `etl_context.neo4j.gds` with a context manager. The GDS client closes the underlying
38
+ connection when exiting the context.
36
39
 
37
40
  :param context: The ETLContext to use. Provides the gds client to the func via `etl_context.neo4j.gds()`
38
41
  :param func: a function that expects a param `etl_context` and returns a `TaskReturn` object.
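For context, wiring the function from the docstring into a task might look like this (sketch only; the argument order follows the `:param` list above, and `summery` is the field name used by the docstring example):

    task = GDSTask(context, gds_fun)   # gds_fun as defined in the docstring example
    result = task.execute()
    print(result.success, result.summery)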
@@ -67,7 +67,7 @@ class CSVLoad2Neo4jTask(Task):
67
67
  super().__init__(context)
68
68
  self.batch_size = batch_size
69
69
  self.model = model
70
- self.logger = logging.getLogger(self.__class__.__name__)
70
+ self.logger = logging.getLogger(f"{self.__class__.__module__}.{self.__class__.__name__}")
71
71
  self.file = file
72
72
 
73
73
  def run_internal(self, **kwargs) -> TaskReturn:
@@ -0,0 +1,98 @@
1
+ import abc
2
+ from pathlib import Path
3
+ from typing import Type
4
+
5
+
6
+ from etl_lib.core.ClosedLoopBatchProcessor import ClosedLoopBatchProcessor
7
+ from etl_lib.core.ETLContext import ETLContext
8
+ from etl_lib.core.ParallelBatchProcessor import ParallelBatchProcessor
9
+ from etl_lib.core.SplittingBatchProcessor import SplittingBatchProcessor, dict_id_extractor
10
+ from etl_lib.core.Task import Task, TaskReturn
11
+ from etl_lib.core.ValidationBatchProcessor import ValidationBatchProcessor
12
+ from etl_lib.data_sink.CypherBatchSink import CypherBatchSink
13
+ from etl_lib.data_source.CSVBatchSource import CSVBatchSource
14
+ from pydantic import BaseModel
15
+
16
+ class ParallelCSVLoad2Neo4jTask(Task):
17
+ """
18
+ Parallel CSV → Neo4j load using the mix-and-batch strategy.
19
+
20
+ Wires a CSV reader, optional Pydantic validation, a diagonal splitter
21
+ (to avoid overlapping node locks), and a Cypher sink. Rows are
22
+ distributed into (row, col) partitions and processed in non-overlapping groups.
23
+
24
+ Args:
25
+ context: Shared ETL context.
26
+ file: CSV file to load.
27
+ model: Optional Pydantic model for row validation; invalid rows go to `error_file`.
28
+ error_file: Output for invalid rows. Required when `model` is set.
29
+ table_size: Bucketing grid size for the splitter.
30
+ batch_size: Per-cell target batch size from the splitter.
31
+ max_workers: Worker threads per wave.
32
+ prefetch: Number of waves to prefetch from the splitter.
33
+ **csv_reader_kwargs: Forwarded to :py:class:`etl_lib.data_source.CSVBatchSource.CSVBatchSource`.
34
+
35
+ Returns:
36
+ :py:class:`~etl_lib.core.Task.TaskReturn` with merged validation and Neo4j counters.
37
+
38
+ Notes:
39
+ - `_query()` must return Cypher that starts with ``UNWIND $batch AS row``.
40
+ - Override `_id_extractor()` if your CSV schema doesn’t expose ``start``/``end``; the default uses
41
+ :py:func:`etl_lib.core.SplittingBatchProcessor.dict_id_extractor`.
42
+ - See the nyc-taxi example for a working subclass.
43
+ """
44
+ def __init__(self,
45
+ context: ETLContext,
46
+ file: Path,
47
+ model: Type[BaseModel] | None = None,
48
+ error_file: Path | None = None,
49
+ table_size: int = 10,
50
+ batch_size: int = 5000,
51
+ max_workers: int | None = None,
52
+ prefetch: int = 4,
53
+ **csv_reader_kwargs):
54
+ super().__init__(context)
55
+ self.file = file
56
+ self.model = model
57
+ if model is not None and error_file is None:
58
+ raise ValueError('an error_file must be provided when a model is specified')
59
+ self.error_file = error_file
60
+ self.table_size = table_size
61
+ self.batch_size = batch_size
62
+ self.max_workers = max_workers or table_size
63
+ self.prefetch = prefetch
64
+ self.csv_reader_kwargs = csv_reader_kwargs
65
+
66
+ def run_internal(self) -> TaskReturn:
67
+ csv = CSVBatchSource(self.file, self.context, self, **self.csv_reader_kwargs)
68
+ predecessor = csv
69
+ if self.model is not None:
70
+ predecessor = ValidationBatchProcessor(self.context, self, csv, self.model, self.error_file)
71
+
72
+ splitter = SplittingBatchProcessor(
73
+ context=self.context,
74
+ task=self,
75
+ predecessor=predecessor,
76
+ table_size=self.table_size,
77
+ id_extractor=self._id_extractor()
78
+ )
79
+
80
+ parallel = ParallelBatchProcessor(
81
+ context=self.context,
82
+ task=self,
83
+ predecessor=splitter,
84
+ worker_factory=lambda: CypherBatchSink(self.context, self, None, self._query()),
85
+ max_workers=self.max_workers,
86
+ prefetch=self.prefetch
87
+ )
88
+
89
+ closing = ClosedLoopBatchProcessor(self.context, self, parallel)
90
+ result = next(closing.get_batch(self.batch_size))
91
+ return TaskReturn(True, result.statistics)
92
+
93
+ def _id_extractor(self):
94
+ return dict_id_extractor()
95
+
96
+ @abc.abstractmethod
97
+ def _query(self):
98
+ pass
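A hypothetical subclass, loosely in the spirit of the nyc-taxi example the docstring refers to (CSV column names, node label and Cypher are placeholders, not taken from the package):

    from pathlib import Path

    from etl_lib.core.SplittingBatchProcessor import dict_id_extractor


    class LoadTripsTask(ParallelCSVLoad2Neo4jTask):

        def _id_extractor(self):
            # this CSV exposes pickup_zone / dropoff_zone instead of the default start / end keys
            return dict_id_extractor(start_key="pickup_zone", end_key="dropoff_zone")

        def _query(self):
            # must start with "UNWIND $batch AS row", as required by the class docstring
            return (
                "UNWIND $batch AS row "
                "MATCH (s:Zone {id: row.pickup_zone}) "
                "MATCH (e:Zone {id: row.dropoff_zone}) "
                "MERGE (s)-[:TRIP {distance: row.distance}]->(e)"
            )


    # LoadTripsTask(context, file=Path("trips.csv"), table_size=10, batch_size=5000).execute()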