neo4j-etl-lib 0.1.1__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/PKG-INFO +14 -3
  2. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/README.md +7 -1
  3. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/pyproject.toml +5 -3
  4. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/src/etl_lib/__init__.py +1 -1
  5. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/src/etl_lib/cli/run_tools.py +1 -1
  6. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/src/etl_lib/core/BatchProcessor.py +7 -7
  7. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/src/etl_lib/core/ClosedLoopBatchProcessor.py +8 -2
  8. neo4j_etl_lib-0.3.0/src/etl_lib/core/ETLContext.py +226 -0
  9. neo4j_etl_lib-0.3.0/src/etl_lib/core/ParallelBatchProcessor.py +180 -0
  10. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/src/etl_lib/core/ProgressReporter.py +23 -4
  11. neo4j_etl_lib-0.3.0/src/etl_lib/core/SplittingBatchProcessor.py +268 -0
  12. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/src/etl_lib/core/Task.py +10 -8
  13. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/src/etl_lib/core/ValidationBatchProcessor.py +2 -0
  14. neo4j_etl_lib-0.3.0/src/etl_lib/core/utils.py +69 -0
  15. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/src/etl_lib/data_sink/CypherBatchSink.py +4 -3
  16. neo4j_etl_lib-0.3.0/src/etl_lib/data_sink/SQLBatchSink.py +36 -0
  17. neo4j_etl_lib-0.3.0/src/etl_lib/data_source/SQLBatchSource.py +114 -0
  18. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/src/etl_lib/task/CreateReportingConstraintsTask.py +2 -2
  19. neo4j_etl_lib-0.3.0/src/etl_lib/task/data_loading/CSVLoad2Neo4jTask.py +93 -0
  20. neo4j_etl_lib-0.3.0/src/etl_lib/task/data_loading/ParallelCSVLoad2Neo4jTask.py +98 -0
  21. neo4j_etl_lib-0.3.0/src/etl_lib/task/data_loading/ParallelSQLLoad2Neo4jTask.py +122 -0
  22. neo4j_etl_lib-0.3.0/src/etl_lib/task/data_loading/SQLLoad2Neo4jTask.py +90 -0
  23. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/src/etl_lib/test_utils/utils.py +19 -1
  24. neo4j_etl_lib-0.1.1/src/etl_lib/core/ETLContext.py +0 -160
  25. neo4j_etl_lib-0.1.1/src/etl_lib/core/utils.py +0 -28
  26. neo4j_etl_lib-0.1.1/src/etl_lib/task/data_loading/CSVLoad2Neo4jTask.py +0 -57
  27. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/LICENSE +0 -0
  28. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/src/etl_lib/cli/__init__.py +0 -0
  29. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/src/etl_lib/core/__init__.py +0 -0
  30. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/src/etl_lib/data_sink/CSVBatchSink.py +0 -0
  31. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/src/etl_lib/data_sink/__init__.py +0 -0
  32. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/src/etl_lib/data_source/CSVBatchSource.py +0 -0
  33. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/src/etl_lib/data_source/CypherBatchSource.py +0 -0
  34. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/src/etl_lib/data_source/__init__.py +0 -0
  35. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/src/etl_lib/task/ExecuteCypherTask.py +0 -0
  36. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/src/etl_lib/task/GDSTask.py +0 -0
  37. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/src/etl_lib/task/__init__.py +0 -0
  38. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/src/etl_lib/task/data_loading/__init__.py +0 -0
  39. {neo4j_etl_lib-0.1.1 → neo4j_etl_lib-0.3.0}/src/etl_lib/test_utils/__init__.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: neo4j-etl-lib
- Version: 0.1.1
+ Version: 0.3.0
  Summary: Building blocks for ETL pipelines.
  Keywords: etl,graph,database
  Author-email: Bert Radke <bert.radke@pm.me>
@@ -15,10 +15,11 @@ Classifier: Topic :: Database
  Classifier: Development Status :: 4 - Beta
  License-File: LICENSE
  Requires-Dist: pydantic>=2.10.5; python_version >= '3.8'
- Requires-Dist: neo4j>=5.27.0; python_version >= '3.7'
+ Requires-Dist: neo4j-rust-ext>=5.27.0; python_version >= '3.7'
  Requires-Dist: python-dotenv>=1.0.1; python_version >= '3.8'
  Requires-Dist: tabulate>=0.9.0; python_version >= '3.7'
  Requires-Dist: click>=8.1.8; python_version >= '3.7'
+ Requires-Dist: pydantic[email-validator]
  Requires-Dist: pytest>=8.3.0 ; extra == "dev" and ( python_version >= '3.8')
  Requires-Dist: testcontainers[neo4j]==4.9.0 ; extra == "dev" and ( python_version >= '3.9' and python_version < '4.0')
  Requires-Dist: pytest-cov ; extra == "dev"
@@ -31,11 +32,15 @@ Requires-Dist: pydata-sphinx-theme ; extra == "dev"
  Requires-Dist: sphinx-autodoc-typehints ; extra == "dev"
  Requires-Dist: sphinxcontrib-napoleon ; extra == "dev"
  Requires-Dist: sphinx-autoapi ; extra == "dev"
+ Requires-Dist: sqlalchemy ; extra == "dev"
+ Requires-Dist: psycopg2-binary ; extra == "dev"
  Requires-Dist: graphdatascience>=1.13 ; extra == "gds" and ( python_version >= '3.9')
+ Requires-Dist: sqlalchemy ; extra == "sql"
  Project-URL: Documentation, https://neo-technology-field.github.io/python-etl-lib/index.html
  Project-URL: Home, https://github.com/neo-technology-field/python-etl-lib
  Provides-Extra: dev
  Provides-Extra: gds
+ Provides-Extra: sql

  # Neo4j ETL Toolbox

@@ -43,7 +48,13 @@ A Python library of building blocks to assemble etl pipelines.

  Complete documentation can be found on https://neo-technology-field.github.io/python-etl-lib/index.html

- See https://github.com/neo-technology-field/python-etl-lib/tree/main/examples/gtfs for an example project.
+ See https://github.com/neo-technology-field/python-etl-lib/tree/main/examples/gtfs
+
+ or
+
+ https://github.com/neo-technology-field/python-etl-lib/tree/main/examples/musicbrainz
+
+ for example projects.


  The library can be installed via
@@ -4,7 +4,13 @@ A Python library of building blocks to assemble etl pipelines.

  Complete documentation can be found on https://neo-technology-field.github.io/python-etl-lib/index.html

- See https://github.com/neo-technology-field/python-etl-lib/tree/main/examples/gtfs for an example project.
+ See https://github.com/neo-technology-field/python-etl-lib/tree/main/examples/gtfs
+
+ or
+
+ https://github.com/neo-technology-field/python-etl-lib/tree/main/examples/musicbrainz
+
+ for example projects.


  The library can be installed via
@@ -23,10 +23,11 @@ keywords = ["etl", "graph", "database"]

  dependencies = [
      "pydantic>=2.10.5; python_version >= '3.8'",
-     "neo4j>=5.27.0; python_version >= '3.7'",
+     "neo4j-rust-ext>=5.27.0; python_version >= '3.7'",
      "python-dotenv>=1.0.1; python_version >= '3.8'",
      "tabulate>=0.9.0; python_version >= '3.7'",
-     "click>=8.1.8; python_version >= '3.7'"
+     "click>=8.1.8; python_version >= '3.7'",
+     "pydantic[email_validator]"
  ]

  [project.optional-dependencies]
@@ -35,9 +36,10 @@ dev = [
      "testcontainers[neo4j]==4.9.0; python_version >= '3.9' and python_version < '4.0'",
      "pytest-cov", "bumpver", "isort", "pip-tools",
      "sphinx", "sphinx-rtd-theme", "pydata-sphinx-theme", "sphinx-autodoc-typehints",
-     "sphinxcontrib-napoleon", "sphinx-autoapi"
+     "sphinxcontrib-napoleon", "sphinx-autoapi", "sqlalchemy", "psycopg2-binary"
  ]
  gds = ["graphdatascience>=1.13; python_version >= '3.9'"]
+ sql = ["sqlalchemy"]

  [project.urls]
  Home = "https://github.com/neo-technology-field/python-etl-lib"
@@ -1,4 +1,4 @@
  """
  Building blocks for ETL pipelines.
  """
- __version__ = "0.1.1"
+ __version__ = "0.3.0"
@@ -67,7 +67,7 @@ def __driver(ctx):
  @click.pass_context
  def cli(ctx, neo4j_uri, neo4j_user, neo4j_password, log_file, database_name):
      """
-     Command-line tool to process files in INPUT_DIRECTORY.
+     Command-line tool for ETL pipelines.

      Environment variables can be configured via a .env file or overridden via CLI options:

@@ -2,7 +2,7 @@ import abc
  import logging
  import sys
  from dataclasses import dataclass, field
- from typing import Generator
+ from typing import Generator, List, Any

  from etl_lib.core.Task import Task
  from etl_lib.core.utils import merge_summery
@@ -13,7 +13,7 @@ class BatchResults:
      """
      Return object of the :py:func:`~BatchProcessor.get_batch` method, wrapping a batched data together with meta information.
      """
-     chunk: []
+     chunk: List[Any]
      """The batch of data."""
      statistics: dict = field(default_factory=dict)
      """`dict` of statistic information, such as row processed, nodes writen, .."""
@@ -38,11 +38,11 @@ def append_result(org: BatchResults, stats: dict) -> BatchResults:
                           batch_size=org.batch_size)


- class BatchProcessor:
+ class BatchProcessor(abc.ABC):
      """
      Allows assembly of :py:class:`etl_lib.core.Task.Task` out of smaller building blocks.

-     This way, functionally such as reading from a CSV file, writing to a database or validation
+     This way, functionally, such as reading from a CSV file, writing to a database or validation
      can be implemented and tested independently and re-used.

      BatchProcessors form, a linked list, where each processor only knows about its predecessor.
@@ -57,17 +57,17 @@ class BatchProcessor:
          Constructs a new :py:class:`etl_lib.core.BatchProcessor` instance.

          Args:
-             context: :py:class:`etl_lib.core.ETLContext.ETLContext` instance. Will be available to subclasses.
+             context: :py:class:`etl_lib.core.ETLContext.ETLContext` instance. It Will be available to subclasses.
              task: :py:class:`etl_lib.core.Task.Task` this processor is part of.
                  Needed for status reporting only.
              predecessor: Source of batches for this processor.
-                 Can be `None` of no predecessor is needed (such as when this processor is the start of the queue.
+                 Can be `None` if no predecessor is needed (such as when this processor is the start of the queue).
          """
          self.context = context
          """:py:class:`etl_lib.core.ETLContext.ETLContext` instance. Providing access to general facilities."""
          self.predecessor = predecessor
          """Predecessor, used as a source of batches."""
-         self.logger = logging.getLogger(self.__class__.__name__)
+         self.logger = logging.getLogger(f"{self.__class__.__module__}.{self.__class__.__name__}")
          self.task = task
          """The :py:class:`etl_lib.core.Task.Task` owning instance."""
@@ -1,7 +1,7 @@
  from typing import Generator

- from etl_lib.core.ETLContext import ETLContext
  from etl_lib.core.BatchProcessor import BatchProcessor, BatchResults, append_result
+ from etl_lib.core.ETLContext import ETLContext
  from etl_lib.core.Task import Task


@@ -24,7 +24,13 @@ class ClosedLoopBatchProcessor(BatchProcessor):
          for batch in self.predecessor.get_batch(max_batch__size):
              result = append_result(result, batch.statistics)
              batch_cnt += 1
-             self.context.reporter.report_progress(self.task, batch_cnt, self.expected_rows, result.statistics)
+             self.context.reporter.report_progress(self.task, batch_cnt, self._safe_calculate_count(max_batch__size),
+                                                   result.statistics)

          self.logger.debug(result.statistics)
          yield result
+
+     def _safe_calculate_count(self, batch_size: int | None) -> int:
+         if not self.expected_rows or not batch_size:
+             return 0
+         return (self.expected_rows + batch_size - 1) // batch_size  # ceiling division
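With this change the reporter receives the expected number of batches rather than the expected number of rows. A quick sanity check of the ceiling division, with hypothetical numbers (not taken from this diff):

    # hypothetical values, for illustration only
    expected_rows = 1050
    batch_size = 100
    expected_batches = (expected_rows + batch_size - 1) // batch_size
    print(expected_batches)  # 11: ten full batches plus one partial batch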
@@ -0,0 +1,226 @@
+ import logging
+ from typing import Any, Dict, List, NamedTuple
+
+ from neo4j.exceptions import Neo4jError
+
+ try:
+     from graphdatascience import GraphDataScience
+
+     gds_available = True
+ except ImportError:
+     gds_available = False
+     logging.info("Graph Data Science not installed, skipping")
+     GraphDataScience = None
+
+ from neo4j import GraphDatabase, Session, WRITE_ACCESS, SummaryCounters
+
+ try:
+     from sqlalchemy import create_engine
+     from sqlalchemy.engine import Engine
+
+     sqlalchemy_available = True
+ except ImportError:
+     sqlalchemy_available = False
+     logging.info("SQL Alchemy not installed, skipping")
+     create_engine = None  # this and next line needed to prevent PyCharm warning
+     Engine = None
+
+ from etl_lib.core.ProgressReporter import get_reporter
+
+
+ class QueryResult(NamedTuple):
+     """Result of a query against the neo4j database."""
+     data: List[Any]
+     """Data as returned from the query."""
+     summery: Dict[str, int]
+     """Counters as reported by neo4j. Contains entries such as `nodes_created`, `nodes_deleted`, etc."""
+
+
+ def append_results(r1: QueryResult, r2: QueryResult) -> QueryResult:
+     """
+     Appends two QueryResult objects, summing the values for duplicate keys in the summary.
+
+     Args:
+         r1: The first QueryResult object.
+         r2: The second QueryResult object to append.
+
+     Returns:
+         A new QueryResult object with combined data and summed summary counts.
+     """
+     combined_summery = r1.summery.copy()
+
+     for key, value in r2.summery.items():
+         combined_summery[key] = combined_summery.get(key, 0) + value
+
+     return QueryResult(r1.data + r2.data, combined_summery)
+
+
+ class Neo4jContext:
+     """
+     Holds the connection to the neo4j database and provides facilities to execute queries.
+     """
+
+     def __init__(self, env_vars: dict):
+         """
+         Create a new Neo4j context.
+
+         Reads the following env_vars keys:
+         - `NEO4J_URI`,
+         - `NEO4J_USERNAME`,
+         - `NEO4J_PASSWORD`.
+         - `NEO4J_DATABASE`,
+         """
+         self.logger = logging.getLogger(f"{self.__class__.__module__}.{self.__class__.__name__}")
+         self.uri = env_vars["NEO4J_URI"]
+         self.auth = (env_vars["NEO4J_USERNAME"],
+                      env_vars["NEO4J_PASSWORD"])
+         self.database = env_vars["NEO4J_DATABASE"]
+         self.__neo4j_connect()
+
+     def query_database(self, session: Session, query, **kwargs) -> QueryResult:
+         """
+         Executes Cypher and returns (records, counters) with retryable write semantics.
+         Accepts either a single query string or a list of queries.
+         Does not work with CALL {} IN TRANSACTION queries.
+         """
+         if isinstance(query, list):
+             results = None
+             for single in query:
+                 part = self.query_database(session, single, **kwargs)
+                 results = append_results(results, part) if results is not None else part
+             return results
+
+         def _tx(tx, q, params):
+             res = tx.run(q, **params)
+             records = list(res)
+             counters = res.consume().counters
+             return records, counters
+
+         try:
+             records, counters = session.execute_write(_tx, query, kwargs)
+             return QueryResult(records, self.__counters_2_dict(counters))
+         except Neo4jError as e:
+             self.logger.error(e)
+             raise
+
+     @staticmethod
+     def __counters_2_dict(counters: SummaryCounters):
+         return {
+             "constraints_added": counters.constraints_added,
+             "constraints_removed": counters.constraints_removed,
+             "indexes_added": counters.indexes_added,
+             "indexes_removed": counters.indexes_removed,
+             "labels_added": counters.labels_added,
+             "labels_removed": counters.labels_removed,
+             "nodes_created": counters.nodes_created,
+             "nodes_deleted": counters.nodes_deleted,
+             "properties_set": counters.properties_set,
+             "relationships_created": counters.relationships_created,
+             "relationships_deleted": counters.relationships_deleted,
+         }
+
+     def session(self, database=None):
+         """
+         Create a new Neo4j session in write mode, caller is responsible to close the session.
+
+         Args:
+             database: name of the database to use for this session. If not provided, the database name provided during
+                 construction will be used.
+
+         Returns:
+             newly created Neo4j session.
+
+         """
+         if database is None:
+             return self.driver.session(database=self.database, default_access_mode=WRITE_ACCESS)
+         else:
+             return self.driver.session(database=database, default_access_mode=WRITE_ACCESS)
+
+     def __neo4j_connect(self):
+         self.driver = GraphDatabase.driver(uri=self.uri, auth=self.auth,
+                                            notifications_min_severity="OFF")
+         self.driver.verify_connectivity()
+         self.logger.info(
+             f"driver connected to instance at {self.uri} with username {self.auth[0]} and database {self.database}")
+
+
+ def gds(neo4j_context) -> GraphDataScience:
+     """
+     Creates a new GraphDataScience client.
+
+     Args:
+         neo4j_context: Neo4j context containing driver and database name.
+
+     Returns:
+         gds client.
+     """
+     return GraphDataScience.from_neo4j_driver(driver=neo4j_context.driver, database=neo4j_context.database)
+
+
+ if sqlalchemy_available:
+     class SQLContext:
+         def __init__(self, database_url: str, pool_size: int = 10, max_overflow: int = 20):
+             """
+             Initializes the SQL context with an SQLAlchemy engine.
+
+             Args:
+                 database_url (str): SQLAlchemy connection URL.
+                 pool_size (int): Number of connections to maintain in the pool.
+                 max_overflow (int): Additional connections allowed beyond pool_size.
+             """
+             self.engine: Engine = create_engine(
+                 database_url,
+                 pool_pre_ping=True,
+                 pool_size=pool_size,
+                 max_overflow=max_overflow,
+                 pool_recycle=1800,  # recycle connections older than 30m
+                 connect_args={
+                     # turn on TCP keepalives on the client socket:
+                     "keepalives": 1,
+                     "keepalives_idle": 60,  # after 60s of idle
+                     "keepalives_interval": 10,  # probe every 10s
+                     "keepalives_count": 5,  # give up after 5 failed probes
+                 })
+
+
+ class ETLContext:
+     """
+     General context information.
+
+     Will be passed to all :class:`~etl_lib.core.Task.Task` to provide access to environment variables and functionally
+     deemed general enough that all parts of the ETL pipeline would need it.
+     """
+
+     def __init__(self, env_vars: dict):
+         """
+         Create a new ETLContext.
+
+         Args:
+             env_vars: Environment variables. Stored internally and can be accessed via :func:`~env` .
+
+         The context created will contain an :class:`~Neo4jContext` and a :class:`~etl_lib.core.ProgressReporter.ProgressReporter`.
+         See there for keys used from the provided `env_vars` dict.
+         """
+         self.logger = logging.getLogger(f"{self.__class__.__module__}.{self.__class__.__name__}")
+         self.neo4j = Neo4jContext(env_vars)
+         self.__env_vars = env_vars
+         self.reporter = get_reporter(self)
+         sql_uri = self.env("SQLALCHEMY_URI")
+         if sql_uri is not None and sqlalchemy_available:
+             self.sql = SQLContext(sql_uri)
+         if gds_available:
+             self.gds = gds(self.neo4j)
+
+     def env(self, key: str) -> Any:
+         """
+         Returns the value of an entry in the `env_vars` dict.
+
+         Args:
+             key: name of the entry to read.
+
+         Returns:
+             value of the entry, or None if the key is not in the dict.
+         """
+         if key in self.__env_vars:
+             return self.__env_vars[key]
+         return None
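Taken together, the rebuilt ETLContext wires up Neo4j, optional SQLAlchemy, and optional GDS support from a single env dict. A minimal usage sketch; the keys are the ones the constructors read above, while all connection values are placeholders:

    from etl_lib.core.ETLContext import ETLContext

    # placeholder values; in a real pipeline these come from a .env file
    env_vars = {
        "NEO4J_URI": "neo4j://localhost:7687",
        "NEO4J_USERNAME": "neo4j",
        "NEO4J_PASSWORD": "secret",
        "NEO4J_DATABASE": "neo4j",
        "SQLALCHEMY_URI": "postgresql+psycopg2://etl:etl@localhost/src",  # optional; enables context.sql
    }

    context = ETLContext(env_vars)
    with context.neo4j.session() as session:
        result = context.neo4j.query_database(session, "MERGE (n:Example {id: 1}) RETURN n")
        print(result.summery)  # counter dict, e.g. {'nodes_created': 1, ...}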
@@ -0,0 +1,180 @@
+ import queue
+ import threading
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from typing import Any, Callable, Generator, List
+
+ from etl_lib.core.BatchProcessor import BatchProcessor, BatchResults
+ from etl_lib.core.utils import merge_summery
+
+
+ class ParallelBatchResult(BatchResults):
+     """
+     Represents a batch split into parallelizable partitions.
+
+     `chunk` is a list of lists, each sub-list is a partition.
+     """
+     pass
+
+
+ class ParallelBatchProcessor(BatchProcessor):
+     """
+     BatchProcessor that runs worker threads over partitions of batches.
+
+     Receives a special BatchResult (:py:class:`ParallelBatchResult`) from the predecessor.
+     All chunks in a ParallelBatchResult it receives can be processed in parallel.
+     See :py:class:`etl_lib.core.SplittingBatchProcessor` on how to produce them.
+     Prefetches the next ParallelBatchResults from its predecessor.
+     The actual processing of the batches is deferred to the configured worker.
+
+     Note:
+         - The predecessor must emit `ParallelBatchResult` instances.
+
+     Args:
+         context: ETL context.
+         worker_factory: A zero-arg callable that returns a new BatchProcessor
+             each time it's called. This processor is responsible for the processing pf the batches.
+         task: optional Task for reporting.
+         predecessor: upstream BatchProcessor that must emit ParallelBatchResult.
+         max_workers: number of parallel threads for partitions.
+         prefetch: number of ParallelBatchResults to prefetch from the predecessor.
+
+     Behavior:
+         - For every ParallelBatchResult, spins up `max_workers` threads.
+         - Each thread calls its own worker from `worker_factory()`, with its
+           partition wrapped by `SingleBatchWrapper`.
+         - Collects and merges their BatchResults in a fail-fast manner: on first
+           exception, logs the error, cancels remaining threads, and raises an exception.
+     """
+
+     def __init__(
+             self,
+             context,
+             worker_factory: Callable[[], BatchProcessor],
+             task=None,
+             predecessor=None,
+             max_workers: int = 4,
+             prefetch: int = 4,
+     ):
+         super().__init__(context, task, predecessor)
+         self.worker_factory = worker_factory
+         self.max_workers = max_workers
+         self.prefetch = prefetch
+         self._batches_done = 0
+
+     def _process_parallel(self, pbr: ParallelBatchResult) -> BatchResults:
+         """
+         Run one worker per partition in `pbr.chunk`, merge their outputs, and include upstream
+         statistics from `pbr.statistics` so counters (e.g., valid/invalid rows from validation)
+         are preserved through the parallel stage.
+
+         Progress reporting:
+             - After each partition completes, report batch count only
+         """
+         merged_stats = dict(pbr.statistics or {})
+         merged_chunk = []
+         total = 0
+
+         parts_total = len(pbr.chunk)
+         partitions_done = 0
+
+         self.logger.debug(f"Processing pbr of len {parts_total}")
+         with ThreadPoolExecutor(max_workers=self.max_workers, thread_name_prefix='PBP_worker_') as pool:
+             futures = [pool.submit(self._process_partition, part) for part in pbr.chunk]
+             try:
+                 for f in as_completed(futures):
+                     out = f.result()
+
+                     # Merge into this PBR's cumulative result (returned downstream)
+                     merged_stats = merge_summery(merged_stats, out.statistics or {})
+                     total += out.batch_size
+                     merged_chunk.extend(out.chunk if isinstance(out.chunk, list) else [out.chunk])
+
+                     partitions_done += 1
+                     self.context.reporter.report_progress(
+                         task=self.task,
+                         batches=self._batches_done,
+                         expected_batches=None,
+                         stats={},
+                     )
+
+             except Exception as e:
+                 for g in futures:
+                     g.cancel()
+                 pool.shutdown(cancel_futures=True)
+                 raise RuntimeError("partition processing failed") from e
+
+         self.logger.debug(f"Finished processing pbr with {merged_stats}")
+         return BatchResults(chunk=merged_chunk, statistics=merged_stats, batch_size=total)
+
+     def get_batch(self, max_batch_size: int) -> Generator[BatchResults, None, None]:
+         """
+         Pulls ParallelBatchResult batches from the predecessor, prefetching
+         up to `prefetch` ahead, processes each batch's partitions in
+         parallel threads, and yields a flattened BatchResults. The predecessor
+         can run ahead while the current batch is processed.
+         """
+         pbr_queue: queue.Queue[ParallelBatchResult | object] = queue.Queue(self.prefetch)
+         SENTINEL = object()
+         exc: BaseException | None = None
+
+         def producer():
+             nonlocal exc
+             try:
+                 for pbr in self.predecessor.get_batch(max_batch_size):
+                     self.logger.debug(
+                         f"adding pgr {pbr.statistics} / {len(pbr.chunk)} to queue of size {pbr_queue.qsize()}"
+                     )
+                     pbr_queue.put(pbr)
+             except BaseException as e:
+                 exc = e
+             finally:
+                 pbr_queue.put(SENTINEL)
+
+         threading.Thread(target=producer, daemon=True, name='prefetcher').start()
+
+         while True:
+             pbr = pbr_queue.get()
+             if pbr is SENTINEL:
+                 if exc is not None:
+                     self.logger.error("Upstream producer failed", exc_info=True)
+                     raise exc
+                 break
+             result = self._process_parallel(pbr)
+             yield result
+
+     class SingleBatchWrapper(BatchProcessor):
+         """
+         Simple BatchProcessor that returns the batch it receives via init.
+         Will be used as predecessor for the worker
+         """
+
+         def __init__(self, context, batch: List[Any]):
+             super().__init__(context=context, predecessor=None)
+             self._batch = batch
+
+         def get_batch(self, max_batch__size: int) -> Generator[BatchResults, None, None]:
+             # Ignores max_size; yields exactly one BatchResults containing the whole batch
+             yield BatchResults(
+                 chunk=self._batch,
+                 statistics={},
+                 batch_size=len(self._batch)
+             )
+
+     def _process_partition(self, partition):
+         """
+         Processes one partition of items by:
+         1. Wrapping it in SingleBatchWrapper
+         2. Instantiating a fresh worker via worker_factory()
+         3. Setting the worker's predecessor to the wrapper
+         4. Running exactly one batch and returning its BatchResults
+
+         Raises whatever exception the worker raises, allowing _process_parallel
+         to handle fail-fast behavior.
+         """
+         self.logger.debug("Processing partition")
+         wrapper = self.SingleBatchWrapper(self.context, partition)
+         worker = self.worker_factory()
+         worker.predecessor = wrapper
+         result = next(worker.get_batch(len(partition)))
+         self.logger.debug(f"finished processing partition with {result.statistics}")
+         return result
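The worker never needs to know it runs in parallel: `_process_partition` hands it a `SingleBatchWrapper` as predecessor and pulls exactly one batch from it. A wiring sketch under that contract, where `HypotheticalWorker`, `context`, and `splitter` are illustrative stand-ins (a real pipeline would use a sink such as `CypherBatchSink` and a `SplittingBatchProcessor` upstream):

    from etl_lib.core.BatchProcessor import BatchProcessor, append_result
    from etl_lib.core.ParallelBatchProcessor import ParallelBatchProcessor

    class HypotheticalWorker(BatchProcessor):
        # Stand-in worker: consumes one batch from its predecessor and
        # reports a counter, the way a real sink would report writes.
        def get_batch(self, max_batch__size):
            for batch in self.predecessor.get_batch(max_batch__size):
                yield append_result(batch, {"rows_processed": batch.batch_size})

    parallel = ParallelBatchProcessor(
        context=context,                        # an ETLContext (placeholder)
        worker_factory=lambda: HypotheticalWorker(context),
        predecessor=splitter,                   # must emit ParallelBatchResult (placeholder)
        max_workers=4,
        prefetch=2,
    )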
@@ -4,6 +4,7 @@ from datetime import datetime
  from tabulate import tabulate

  from etl_lib.core.Task import Task, TaskGroup, TaskReturn
+ from etl_lib.core.utils import add_sigint_handler


  class ProgressReporter:
@@ -18,7 +19,7 @@ class ProgressReporter:

      def __init__(self, context):
          self.context = context
-         self.logger = logging.getLogger(self.__class__.__name__)
+         self.logger = logging.getLogger(f"{self.__class__.__module__}.{self.__class__.__name__}")

      def register_tasks(self, main: Task):
          """
@@ -64,7 +65,7 @@ class ProgressReporter:
          task.success = result.success
          task.summery = result.summery

-         report = f"{'\t' * task.depth} finished {task.task_name()} in {task.end_time - task.start_time} with success: {result.success}"
+         report = f"finished {task.task_name()} in {task.end_time - task.start_time} with status: {'success' if result.success else 'failed'}"
          if result.error is not None:
              report += f", error: \n{result.error}"
          else:
@@ -85,7 +86,7 @@ class ProgressReporter:
              task: Task reporting updates.
              batches: Number of batches processed so far.
              expected_batches: Number of expected batches. Can be `None` if the overall number of
-                 batches is not know before execution.
+                 batches is not known before execution.
              stats: dict of statistics so far (such as `nodes_created`).
          """
          pass
@@ -119,13 +120,16 @@ class Neo4jProgressReporter(ProgressReporter):
              database: Name of the database to write the status updates to.
          """
          super().__init__(context)
+         self.run_uuid = None
          self.database = database
          self.logger.info(f"progress reporting to database: {self.database}")
          self.__create_constraints()
+         self._register_shutdown_handler()

      def register_tasks(self, root: Task, **kwargs):
          super().register_tasks(root)

+         self.run_uuid = root.uuid
          with self.context.neo4j.session(self.database) as session:
              order = 0
              session.run(
@@ -166,7 +170,7 @@ class Neo4jProgressReporter(ProgressReporter):
                                start_time=task.start_time)
          return task

-     def finished_task(self, task: Task, result: TaskReturn) -> Task:
+     def finished_task(self, task: Task, result: TaskReturn) -> Task:
          super().finished_task(task=task, result=result)
          if result.success:
              status = "success"
@@ -190,6 +194,21 @@ class Neo4jProgressReporter(ProgressReporter):
              session.run("MATCH (t:ETLTask {uuid:$id}) SET t.batches =$batches, t.expected_batches =$expected_batches",
                          id=task.uuid, batches=batches, expected_batches=expected_batches)

+     def _register_shutdown_handler(self):
+         def shutdown_handler(signum, frame):
+             self.logger.warning("SIGINT received, waiting for running tasks to abort.")
+             with self.context.neo4j.session(self.database) as session:
+                 cnt = session.run("""
+                     MATCH path=(r:ETLRun {uuid: $runId})-[*]->()
+                     WITH [task in nodes(path) WHERE task:ETLTask AND task.status IN ['open', 'running'] | task] AS tasks
+                     UNWIND tasks AS task
+                     SET task.status = 'aborted'
+                     RETURN count(task) AS cnt
+                     """, runId=self.run_uuid
+                 ).single()['cnt']
+             self.logger.info(f"marked {cnt} tasks as aborted.")
+         add_sigint_handler(shutdown_handler)
+

  def get_reporter(context) -> ProgressReporter:
      """