neo4j-etl-lib 0.2.0__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/PKG-INFO +8 -6
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/pyproject.toml +10 -5
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/__init__.py +1 -1
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/core/BatchProcessor.py +7 -7
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/core/ETLContext.py +64 -30
- neo4j_etl_lib-0.3.1/src/etl_lib/core/ParallelBatchProcessor.py +180 -0
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/core/ProgressReporter.py +22 -3
- neo4j_etl_lib-0.3.1/src/etl_lib/core/SplittingBatchProcessor.py +268 -0
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/core/Task.py +10 -8
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/core/ValidationBatchProcessor.py +2 -0
- neo4j_etl_lib-0.3.1/src/etl_lib/core/utils.py +69 -0
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/data_source/CSVBatchSource.py +1 -1
- neo4j_etl_lib-0.3.1/src/etl_lib/data_source/SQLBatchSource.py +114 -0
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/task/GDSTask.py +8 -5
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/task/data_loading/CSVLoad2Neo4jTask.py +1 -1
- neo4j_etl_lib-0.3.1/src/etl_lib/task/data_loading/ParallelCSVLoad2Neo4jTask.py +98 -0
- neo4j_etl_lib-0.3.1/src/etl_lib/task/data_loading/ParallelSQLLoad2Neo4jTask.py +122 -0
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/task/data_loading/SQLLoad2Neo4jTask.py +1 -1
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/test_utils/utils.py +9 -5
- neo4j_etl_lib-0.2.0/src/etl_lib/core/utils.py +0 -28
- neo4j_etl_lib-0.2.0/src/etl_lib/data_source/SQLBatchSource.py +0 -60
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/LICENSE +0 -0
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/README.md +0 -0
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/cli/__init__.py +0 -0
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/cli/run_tools.py +0 -0
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/core/ClosedLoopBatchProcessor.py +0 -0
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/core/__init__.py +0 -0
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/data_sink/CSVBatchSink.py +0 -0
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/data_sink/CypherBatchSink.py +0 -0
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/data_sink/SQLBatchSink.py +0 -0
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/data_sink/__init__.py +0 -0
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/data_source/CypherBatchSource.py +0 -0
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/data_source/__init__.py +0 -0
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/task/CreateReportingConstraintsTask.py +0 -0
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/task/ExecuteCypherTask.py +0 -0
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/task/__init__.py +0 -0
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/task/data_loading/__init__.py +0 -0
- {neo4j_etl_lib-0.2.0 → neo4j_etl_lib-0.3.1}/src/etl_lib/test_utils/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: neo4j-etl-lib
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: Building blocks for ETL pipelines.
|
|
5
5
|
Keywords: etl,graph,database
|
|
6
6
|
Author-email: Bert Radke <bert.radke@pm.me>
|
|
@@ -14,11 +14,11 @@ Classifier: Programming Language :: Python :: 3
|
|
|
14
14
|
Classifier: Topic :: Database
|
|
15
15
|
Classifier: Development Status :: 4 - Beta
|
|
16
16
|
License-File: LICENSE
|
|
17
|
-
Requires-Dist: pydantic>=2.10.5; python_version >= '3.
|
|
18
|
-
Requires-Dist: neo4j>=5.27.0; python_version >= '3.
|
|
19
|
-
Requires-Dist: python-dotenv>=1.0.1; python_version >= '3.
|
|
20
|
-
Requires-Dist: tabulate>=0.9.0; python_version >= '3.
|
|
21
|
-
Requires-Dist: click>=8.1.8; python_version >= '3.
|
|
17
|
+
Requires-Dist: pydantic>=2.10.5; python_version >= '3.10'
|
|
18
|
+
Requires-Dist: neo4j-rust-ext>=5.27.0,<6; python_version >= '3.10'
|
|
19
|
+
Requires-Dist: python-dotenv>=1.0.1; python_version >= '3.10'
|
|
20
|
+
Requires-Dist: tabulate>=0.9.0; python_version >= '3.10'
|
|
21
|
+
Requires-Dist: click>=8.1.8; python_version >= '3.10'
|
|
22
22
|
Requires-Dist: pydantic[email-validator]
|
|
23
23
|
Requires-Dist: pytest>=8.3.0 ; extra == "dev" and ( python_version >= '3.8')
|
|
24
24
|
Requires-Dist: testcontainers[neo4j]==4.9.0 ; extra == "dev" and ( python_version >= '3.9' and python_version < '4.0')
|
|
@@ -35,11 +35,13 @@ Requires-Dist: sphinx-autoapi ; extra == "dev"
|
|
|
35
35
|
Requires-Dist: sqlalchemy ; extra == "dev"
|
|
36
36
|
Requires-Dist: psycopg2-binary ; extra == "dev"
|
|
37
37
|
Requires-Dist: graphdatascience>=1.13 ; extra == "gds" and ( python_version >= '3.9')
|
|
38
|
+
Requires-Dist: nox>=2024.0.0 ; extra == "nox"
|
|
38
39
|
Requires-Dist: sqlalchemy ; extra == "sql"
|
|
39
40
|
Project-URL: Documentation, https://neo-technology-field.github.io/python-etl-lib/index.html
|
|
40
41
|
Project-URL: Home, https://github.com/neo-technology-field/python-etl-lib
|
|
41
42
|
Provides-Extra: dev
|
|
42
43
|
Provides-Extra: gds
|
|
44
|
+
Provides-Extra: nox
|
|
43
45
|
Provides-Extra: sql
|
|
44
46
|
|
|
45
47
|
# Neo4j ETL Toolbox
|
|
@@ -22,11 +22,11 @@ dynamic = ["version", "description"]
|
|
|
22
22
|
keywords = ["etl", "graph", "database"]
|
|
23
23
|
|
|
24
24
|
dependencies = [
|
|
25
|
-
"pydantic>=2.10.5; python_version >= '3.
|
|
26
|
-
"neo4j>=5.27.0; python_version >= '3.
|
|
27
|
-
"python-dotenv>=1.0.1; python_version >= '3.
|
|
28
|
-
"tabulate>=0.9.0; python_version >= '3.
|
|
29
|
-
"click>=8.1.8; python_version >= '3.
|
|
25
|
+
"pydantic>=2.10.5; python_version >= '3.10'",
|
|
26
|
+
"neo4j-rust-ext>=5.27.0,<6; python_version >= '3.10'",
|
|
27
|
+
"python-dotenv>=1.0.1; python_version >= '3.10'",
|
|
28
|
+
"tabulate>=0.9.0; python_version >= '3.10'",
|
|
29
|
+
"click>=8.1.8; python_version >= '3.10'",
|
|
30
30
|
"pydantic[email_validator]"
|
|
31
31
|
]
|
|
32
32
|
|
|
@@ -41,6 +41,11 @@ dev = [
|
|
|
41
41
|
gds = ["graphdatascience>=1.13; python_version >= '3.9'"]
|
|
42
42
|
sql = ["sqlalchemy"]
|
|
43
43
|
|
|
44
|
+
# Local-only multi-version testing, install via `pip install ".[dev,nox]"`
|
|
45
|
+
nox = [
|
|
46
|
+
"nox>=2024.0.0"
|
|
47
|
+
]
|
|
48
|
+
|
|
44
49
|
[project.urls]
|
|
45
50
|
Home = "https://github.com/neo-technology-field/python-etl-lib"
|
|
46
51
|
Documentation = "https://neo-technology-field.github.io/python-etl-lib/index.html"
|
|
@@ -2,7 +2,7 @@ import abc
|
|
|
2
2
|
import logging
|
|
3
3
|
import sys
|
|
4
4
|
from dataclasses import dataclass, field
|
|
5
|
-
from typing import Generator
|
|
5
|
+
from typing import Generator, List, Any
|
|
6
6
|
|
|
7
7
|
from etl_lib.core.Task import Task
|
|
8
8
|
from etl_lib.core.utils import merge_summery
|
|
@@ -13,7 +13,7 @@ class BatchResults:
|
|
|
13
13
|
"""
|
|
14
14
|
Return object of the :py:func:`~BatchProcessor.get_batch` method, wrapping a batched data together with meta information.
|
|
15
15
|
"""
|
|
16
|
-
chunk: []
|
|
16
|
+
chunk: List[Any]
|
|
17
17
|
"""The batch of data."""
|
|
18
18
|
statistics: dict = field(default_factory=dict)
|
|
19
19
|
"""`dict` of statistic information, such as row processed, nodes writen, .."""
|
|
@@ -38,11 +38,11 @@ def append_result(org: BatchResults, stats: dict) -> BatchResults:
|
|
|
38
38
|
batch_size=org.batch_size)
|
|
39
39
|
|
|
40
40
|
|
|
41
|
-
class BatchProcessor:
|
|
41
|
+
class BatchProcessor(abc.ABC):
|
|
42
42
|
"""
|
|
43
43
|
Allows assembly of :py:class:`etl_lib.core.Task.Task` out of smaller building blocks.
|
|
44
44
|
|
|
45
|
-
This way, functionally such as reading from a CSV file, writing to a database or validation
|
|
45
|
+
This way, functionally, such as reading from a CSV file, writing to a database or validation
|
|
46
46
|
can be implemented and tested independently and re-used.
|
|
47
47
|
|
|
48
48
|
BatchProcessors form, a linked list, where each processor only knows about its predecessor.
|
|
@@ -57,17 +57,17 @@ class BatchProcessor:
|
|
|
57
57
|
Constructs a new :py:class:`etl_lib.core.BatchProcessor` instance.
|
|
58
58
|
|
|
59
59
|
Args:
|
|
60
|
-
context: :py:class:`etl_lib.core.ETLContext.ETLContext` instance. Will be available to subclasses.
|
|
60
|
+
context: :py:class:`etl_lib.core.ETLContext.ETLContext` instance. It Will be available to subclasses.
|
|
61
61
|
task: :py:class:`etl_lib.core.Task.Task` this processor is part of.
|
|
62
62
|
Needed for status reporting only.
|
|
63
63
|
predecessor: Source of batches for this processor.
|
|
64
|
-
Can be `None`
|
|
64
|
+
Can be `None` if no predecessor is needed (such as when this processor is the start of the queue).
|
|
65
65
|
"""
|
|
66
66
|
self.context = context
|
|
67
67
|
""":py:class:`etl_lib.core.ETLContext.ETLContext` instance. Providing access to general facilities."""
|
|
68
68
|
self.predecessor = predecessor
|
|
69
69
|
"""Predecessor, used as a source of batches."""
|
|
70
|
-
self.logger = logging.getLogger(self.__class__.__name__)
|
|
70
|
+
self.logger = logging.getLogger(f"{self.__class__.__module__}.{self.__class__.__name__}")
|
|
71
71
|
self.task = task
|
|
72
72
|
"""The :py:class:`etl_lib.core.Task.Task` owning instance."""
|
|
73
73
|
|
|
@@ -1,19 +1,23 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import
|
|
2
|
+
from typing import Any, Dict, List, NamedTuple
|
|
3
|
+
|
|
4
|
+
from neo4j.exceptions import Neo4jError
|
|
3
5
|
|
|
4
6
|
try:
|
|
5
7
|
from graphdatascience import GraphDataScience
|
|
8
|
+
|
|
6
9
|
gds_available = False
|
|
7
10
|
except ImportError:
|
|
8
11
|
gds_available = False
|
|
9
12
|
logging.info("Graph Data Science not installed, skipping")
|
|
10
13
|
GraphDataScience = None
|
|
11
14
|
|
|
12
|
-
from neo4j import GraphDatabase, WRITE_ACCESS, SummaryCounters
|
|
15
|
+
from neo4j import GraphDatabase, Session, WRITE_ACCESS, SummaryCounters
|
|
13
16
|
|
|
14
17
|
try:
|
|
15
18
|
from sqlalchemy import create_engine
|
|
16
19
|
from sqlalchemy.engine import Engine
|
|
20
|
+
|
|
17
21
|
sqlalchemy_available = True
|
|
18
22
|
except ImportError:
|
|
19
23
|
sqlalchemy_available = False
|
|
@@ -26,14 +30,29 @@ from etl_lib.core.ProgressReporter import get_reporter
|
|
|
26
30
|
|
|
27
31
|
class QueryResult(NamedTuple):
|
|
28
32
|
"""Result of a query against the neo4j database."""
|
|
29
|
-
data: []
|
|
33
|
+
data: List[Any]
|
|
30
34
|
"""Data as returned from the query."""
|
|
31
|
-
summery:
|
|
35
|
+
summery: Dict[str, int]
|
|
32
36
|
"""Counters as reported by neo4j. Contains entries such as `nodes_created`, `nodes_deleted`, etc."""
|
|
33
37
|
|
|
34
38
|
|
|
35
39
|
def append_results(r1: QueryResult, r2: QueryResult) -> QueryResult:
    """
    Combine two query results into a new one.

    The data lists are concatenated (`r1` first). Counters are merged key by key:
    a key present in both summaries ends up with the sum of both values, a key
    present in only one keeps its value.

    Args:
        r1: The first QueryResult object.
        r2: The second QueryResult object to append.

    Returns:
        A new QueryResult object with combined data and summed summary counts.
    """
    totals = r1.summery.copy()
    # Keys of r2 are unique, so each sum can be computed against r1's counters in one pass.
    totals.update({name: totals.get(name, 0) + amount for name, amount in r2.summery.items()})
    return QueryResult(r1.data + r2.data, totals)
|
|
37
56
|
|
|
38
57
|
|
|
39
58
|
class Neo4jContext:
|
|
@@ -51,37 +70,38 @@ class Neo4jContext:
|
|
|
51
70
|
- `NEO4J_PASSWORD`.
|
|
52
71
|
- `NEO4J_DATABASE`,
|
|
53
72
|
"""
|
|
54
|
-
self.logger = logging.getLogger(self.__class__.__name__)
|
|
73
|
+
self.logger = logging.getLogger(f"{self.__class__.__module__}.{self.__class__.__name__}")
|
|
55
74
|
self.uri = env_vars["NEO4J_URI"]
|
|
56
75
|
self.auth = (env_vars["NEO4J_USERNAME"],
|
|
57
76
|
env_vars["NEO4J_PASSWORD"])
|
|
58
77
|
self.database = env_vars["NEO4J_DATABASE"]
|
|
59
78
|
self.__neo4j_connect()
|
|
60
79
|
|
|
61
|
-
def query_database(self, session, query, **kwargs) -> QueryResult:
|
|
80
|
+
def query_database(self, session: Session, query, **kwargs) -> QueryResult:
    """
    Executes Cypher and returns (records, counters) with retryable write semantics.

    Accepts either a single query string or a list of queries.
    Does not work with CALL {} IN TRANSACTION queries.

    Args:
        session: Neo4j session used to run the query (via `execute_write`).
        query: A single Cypher string, or a list of them. Each entry of a list is
            executed through a separate call to this method, with the same `kwargs`.
        **kwargs: Passed as keyword arguments to `tx.run` for every query.

    Returns:
        A QueryResult with the fetched records and the counters as a dict. For a list
        of queries the individual results are merged via `append_results`; an empty
        list yields an empty QueryResult rather than `None`.

    Raises:
        Neo4jError: logged and re-raised unchanged.
    """
    if isinstance(query, list):
        # Start from an empty result so that an empty list still honours the return type.
        results = QueryResult([], {})
        for single in query:
            results = append_results(results, self.query_database(session, single, **kwargs))
        return results

    def _tx(tx, q, params):
        res = tx.run(q, **params)
        # Materialise the records before consume(), which is called here to obtain the counters.
        records = list(res)
        counters = res.consume().counters
        return records, counters

    try:
        records, counters = session.execute_write(_tx, query, kwargs)
        return QueryResult(records, self.__counters_2_dict(counters))
    except Neo4jError as e:
        self.logger.error(e)
        raise
|
|
85
105
|
|
|
86
106
|
@staticmethod
|
|
87
107
|
def __counters_2_dict(counters: SummaryCounters):
|
|
@@ -123,6 +143,7 @@ class Neo4jContext:
|
|
|
123
143
|
self.logger.info(
|
|
124
144
|
f"driver connected to instance at {self.uri} with username {self.auth[0]} and database {self.database}")
|
|
125
145
|
|
|
146
|
+
|
|
126
147
|
def gds(neo4j_context) -> GraphDataScience:
|
|
127
148
|
"""
|
|
128
149
|
Creates a new GraphDataScience client.
|
|
@@ -147,14 +168,26 @@ if sqlalchemy_available:
|
|
|
147
168
|
pool_size (int): Number of connections to maintain in the pool.
|
|
148
169
|
max_overflow (int): Additional connections allowed beyond pool_size.
|
|
149
170
|
"""
|
|
150
|
-
self.engine: Engine = create_engine(
|
|
171
|
+
self.engine: Engine = create_engine(
|
|
172
|
+
database_url,
|
|
173
|
+
pool_pre_ping=True,
|
|
174
|
+
pool_size=pool_size,
|
|
175
|
+
max_overflow=max_overflow,
|
|
176
|
+
pool_recycle=1800, # recycle connections older than 30m
|
|
177
|
+
connect_args={
|
|
178
|
+
# turn on TCP keepalives on the client socket:
|
|
179
|
+
"keepalives": 1,
|
|
180
|
+
"keepalives_idle": 60, # after 60s of idle
|
|
181
|
+
"keepalives_interval": 10, # probe every 10s
|
|
182
|
+
"keepalives_count": 5, # give up after 5 failed probes
|
|
183
|
+
})
|
|
151
184
|
|
|
152
185
|
|
|
153
186
|
class ETLContext:
|
|
154
187
|
"""
|
|
155
188
|
General context information.
|
|
156
189
|
|
|
157
|
-
Will be passed to all :
|
|
190
|
+
Will be passed to all :class:`~etl_lib.core.Task.Task` to provide access to environment variables and functionally
|
|
158
191
|
deemed general enough that all parts of the ETL pipeline would need it.
|
|
159
192
|
"""
|
|
160
193
|
|
|
@@ -163,12 +196,12 @@ class ETLContext:
|
|
|
163
196
|
Create a new ETLContext.
|
|
164
197
|
|
|
165
198
|
Args:
|
|
166
|
-
env_vars: Environment variables. Stored internally and can be accessed via :
|
|
199
|
+
env_vars: Environment variables. Stored internally and can be accessed via :func:`~env` .
|
|
167
200
|
|
|
168
|
-
The context created will contain an :
|
|
201
|
+
The context created will contain an :class:`~Neo4jContext` and a :class:`~etl_lib.core.ProgressReporter.ProgressReporter`.
|
|
169
202
|
See there for keys used from the provided `env_vars` dict.
|
|
170
203
|
"""
|
|
171
|
-
self.logger = logging.getLogger(self.__class__.__name__)
|
|
204
|
+
self.logger = logging.getLogger(f"{self.__class__.__module__}.{self.__class__.__name__}")
|
|
172
205
|
self.neo4j = Neo4jContext(env_vars)
|
|
173
206
|
self.__env_vars = env_vars
|
|
174
207
|
self.reporter = get_reporter(self)
|
|
@@ -176,7 +209,7 @@ class ETLContext:
|
|
|
176
209
|
if sql_uri is not None and sqlalchemy_available:
|
|
177
210
|
self.sql = SQLContext(sql_uri)
|
|
178
211
|
if gds_available:
|
|
179
|
-
self.gds =gds(self.neo4j)
|
|
212
|
+
self.gds = gds(self.neo4j)
|
|
180
213
|
|
|
181
214
|
def env(self, key: str) -> Any:
|
|
182
215
|
"""
|
|
@@ -190,3 +223,4 @@ class ETLContext:
|
|
|
190
223
|
"""
|
|
191
224
|
if key in self.__env_vars:
|
|
192
225
|
return self.__env_vars[key]
|
|
226
|
+
return None
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
import queue
|
|
2
|
+
import threading
|
|
3
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
4
|
+
from typing import Any, Callable, Generator, List
|
|
5
|
+
|
|
6
|
+
from etl_lib.core.BatchProcessor import BatchProcessor, BatchResults
|
|
7
|
+
from etl_lib.core.utils import merge_summery
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ParallelBatchResult(BatchResults):
    """
    A batch that has been split into partitions which may be processed in parallel.

    Adds no fields or behaviour to `BatchResults`; the type only signals that
    `chunk` is a list of lists, where each sub-list is one partition.
    """
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ParallelBatchProcessor(BatchProcessor):
    """
    BatchProcessor that runs worker threads over partitions of batches.

    Receives a special BatchResult (:py:class:`ParallelBatchResult`) from the predecessor.
    All chunks in a ParallelBatchResult it receives can be processed in parallel.
    See :py:class:`etl_lib.core.SplittingBatchProcessor` on how to produce them.
    Prefetches the next ParallelBatchResults from its predecessor.
    The actual processing of the batches is deferred to the configured worker.

    Note:
        - The predecessor must emit `ParallelBatchResult` instances.

    Args:
        context: ETL context.
        worker_factory: A zero-arg callable that returns a new BatchProcessor
            each time it's called. This processor is responsible for the processing of the batches.
        task: optional Task for reporting.
        predecessor: upstream BatchProcessor that must emit ParallelBatchResult.
        max_workers: number of parallel threads for partitions.
        prefetch: number of ParallelBatchResults to prefetch from the predecessor.

    Behavior:
        - For every ParallelBatchResult, spins up `max_workers` threads.
        - Each thread calls its own worker from `worker_factory()`, with its
          partition wrapped by `SingleBatchWrapper`.
        - Collects and merges their BatchResults in a fail-fast manner: on first
          exception, cancels the not yet started partitions and raises a RuntimeError
          chained to the original exception.
    """

    def __init__(
            self,
            context,
            worker_factory: Callable[[], BatchProcessor],
            task=None,
            predecessor=None,
            max_workers: int = 4,
            prefetch: int = 4,
    ):
        super().__init__(context, task, predecessor)
        self.worker_factory = worker_factory
        self.max_workers = max_workers
        self.prefetch = prefetch
        # Number of partitions completed so far, across all ParallelBatchResults; used for progress reporting.
        self._batches_done = 0

    def _process_parallel(self, pbr: ParallelBatchResult) -> BatchResults:
        """
        Run one worker per partition in `pbr.chunk`, merge their outputs, and include upstream
        statistics from `pbr.statistics` so counters (e.g., valid/invalid rows from validation)
        are preserved through the parallel stage.

        Progress reporting:
            - After each partition completes, the running count of completed partitions
              is reported (without statistics).

        Args:
            pbr: The partitioned batch; every entry of `pbr.chunk` is handed to its own worker.

        Returns:
            One flattened BatchResults with the merged chunk, merged statistics and summed batch size.

        Raises:
            RuntimeError: if processing a partition (or reporting its progress) fails;
                the original exception is attached as `__cause__`.
        """
        merged_stats = dict(pbr.statistics or {})
        merged_chunk = []
        total = 0

        parts_total = len(pbr.chunk)

        self.logger.debug(f"Processing pbr of len {parts_total}")
        with ThreadPoolExecutor(max_workers=self.max_workers, thread_name_prefix='PBP_worker_') as pool:
            futures = [pool.submit(self._process_partition, part) for part in pbr.chunk]
            try:
                for f in as_completed(futures):
                    out = f.result()

                    # Merge into this PBR's cumulative result (returned downstream)
                    merged_stats = merge_summery(merged_stats, out.statistics or {})
                    total += out.batch_size
                    merged_chunk.extend(out.chunk if isinstance(out.chunk, list) else [out.chunk])

                    # Advance the counter before reporting; it was previously never
                    # incremented, so every report carried a batch count of 0.
                    self._batches_done += 1
                    self.context.reporter.report_progress(
                        task=self.task,
                        batches=self._batches_done,
                        expected_batches=None,
                        stats={},
                    )

            except Exception as e:
                # Fail fast: drop everything that has not started yet. Partitions that are
                # already running cannot be interrupted and are awaited when the pool exits.
                for g in futures:
                    g.cancel()
                pool.shutdown(cancel_futures=True)
                raise RuntimeError("partition processing failed") from e

        self.logger.debug(f"Finished processing pbr with {merged_stats}")
        return BatchResults(chunk=merged_chunk, statistics=merged_stats, batch_size=total)

    def get_batch(self, max_batch_size: int) -> Generator[BatchResults, None, None]:
        """
        Pulls ParallelBatchResult batches from the predecessor, prefetching
        up to `prefetch` ahead, processes each batch's partitions in
        parallel threads, and yields a flattened BatchResults. The predecessor
        can run ahead while the current batch is processed.

        The predecessor is drained by a background daemon thread. An exception raised
        there is logged and re-raised in the consuming thread once the queue has been
        emptied. When this generator is closed or fails, the background thread is told
        to stop instead of staying blocked on a full queue.

        Args:
            max_batch_size: Passed on unchanged to the predecessor's `get_batch`.
        """
        pbr_queue: queue.Queue[ParallelBatchResult | object] = queue.Queue(self.prefetch)
        SENTINEL = object()
        exc: BaseException | None = None
        stop = threading.Event()

        def offer(item) -> bool:
            """Put `item` on the queue; gives up and returns False once the consumer has stopped."""
            while not stop.is_set():
                try:
                    pbr_queue.put(item, timeout=0.1)
                    return True
                except queue.Full:
                    continue
            return False

        def producer():
            nonlocal exc
            try:
                for pbr in self.predecessor.get_batch(max_batch_size):
                    self.logger.debug(
                        f"adding pgr {pbr.statistics} / {len(pbr.chunk)} to queue of size {pbr_queue.qsize()}"
                    )
                    if not offer(pbr):
                        # Consumer is gone; nobody will read further batches.
                        return
            except BaseException as e:
                exc = e
            finally:
                offer(SENTINEL)

        threading.Thread(target=producer, daemon=True, name='prefetcher').start()

        try:
            while True:
                pbr = pbr_queue.get()
                if pbr is SENTINEL:
                    if exc is not None:
                        # Pass the exception explicitly: we are not inside an `except` block here,
                        # so `exc_info=True` would not find any traceback to log.
                        self.logger.error("Upstream producer failed", exc_info=exc)
                        raise exc
                    break
                result = self._process_parallel(pbr)
                yield result
        finally:
            stop.set()

    class SingleBatchWrapper(BatchProcessor):
        """
        Simple BatchProcessor that returns the batch it receives via init.
        Will be used as predecessor for the worker
        """

        def __init__(self, context, batch: List[Any]):
            super().__init__(context=context, predecessor=None)
            self._batch = batch

        # NOTE(review): the parameter name `max_batch__size` (double underscore) is kept
        # as published so keyword callers keep working; it looks like a typo.
        def get_batch(self, max_batch__size: int) -> Generator[BatchResults, None, None]:
            # Ignores max_size; yields exactly one BatchResults containing the whole batch
            yield BatchResults(
                chunk=self._batch,
                statistics={},
                batch_size=len(self._batch)
            )

    def _process_partition(self, partition):
        """
        Processes one partition of items by:
          1. Wrapping it in SingleBatchWrapper
          2. Instantiating a fresh worker via worker_factory()
          3. Setting the worker's predecessor to the wrapper
          4. Running exactly one batch and returning its BatchResults

        Raises whatever exception the worker raises, allowing _process_parallel
        to handle fail-fast behavior.
        """
        self.logger.debug("Processing partition")
        wrapper = self.SingleBatchWrapper(self.context, partition)
        worker = self.worker_factory()
        worker.predecessor = wrapper
        # Only the first batch the worker yields is used.
        result = next(worker.get_batch(len(partition)))
        self.logger.debug(f"finished processing partition with {result.statistics}")
        return result
|
|
@@ -4,6 +4,7 @@ from datetime import datetime
|
|
|
4
4
|
from tabulate import tabulate
|
|
5
5
|
|
|
6
6
|
from etl_lib.core.Task import Task, TaskGroup, TaskReturn
|
|
7
|
+
from etl_lib.core.utils import add_sigint_handler
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
class ProgressReporter:
|
|
@@ -18,7 +19,7 @@ class ProgressReporter:
|
|
|
18
19
|
|
|
19
20
|
def __init__(self, context):
|
|
20
21
|
self.context = context
|
|
21
|
-
self.logger = logging.getLogger(self.__class__.__name__)
|
|
22
|
+
self.logger = logging.getLogger(f"{self.__class__.__module__}.{self.__class__.__name__}")
|
|
22
23
|
|
|
23
24
|
def register_tasks(self, main: Task):
|
|
24
25
|
"""
|
|
@@ -44,7 +45,7 @@ class ProgressReporter:
|
|
|
44
45
|
The task that was provided.
|
|
45
46
|
"""
|
|
46
47
|
task.start_time = datetime.now()
|
|
47
|
-
self.logger.info(f"{'
|
|
48
|
+
self.logger.info(f"{' ' * (4 * task.depth)}starting {task.task_name()}")
|
|
48
49
|
return task
|
|
49
50
|
|
|
50
51
|
def finished_task(self, task: Task, result: TaskReturn) -> Task:
|
|
@@ -119,13 +120,16 @@ class Neo4jProgressReporter(ProgressReporter):
|
|
|
119
120
|
database: Name of the database to write the status updates to.
|
|
120
121
|
"""
|
|
121
122
|
super().__init__(context)
|
|
123
|
+
self.run_uuid = None
|
|
122
124
|
self.database = database
|
|
123
125
|
self.logger.info(f"progress reporting to database: {self.database}")
|
|
124
126
|
self.__create_constraints()
|
|
127
|
+
self._register_shutdown_handler()
|
|
125
128
|
|
|
126
129
|
def register_tasks(self, root: Task, **kwargs):
|
|
127
130
|
super().register_tasks(root)
|
|
128
131
|
|
|
132
|
+
self.run_uuid = root.uuid
|
|
129
133
|
with self.context.neo4j.session(self.database) as session:
|
|
130
134
|
order = 0
|
|
131
135
|
session.run(
|
|
@@ -166,7 +170,7 @@ class Neo4jProgressReporter(ProgressReporter):
|
|
|
166
170
|
start_time=task.start_time)
|
|
167
171
|
return task
|
|
168
172
|
|
|
169
|
-
def finished_task(self, task: Task,
|
|
173
|
+
def finished_task(self, task: Task, result: TaskReturn) -> Task:
|
|
170
174
|
super().finished_task(task=task, result=result)
|
|
171
175
|
if result.success:
|
|
172
176
|
status = "success"
|
|
@@ -190,6 +194,21 @@ class Neo4jProgressReporter(ProgressReporter):
|
|
|
190
194
|
session.run("MATCH (t:ETLTask {uuid:$id}) SET t.batches =$batches, t.expected_batches =$expected_batches",
|
|
191
195
|
id=task.uuid, batches=batches, expected_batches=expected_batches)
|
|
192
196
|
|
|
197
|
+
def _register_shutdown_handler(self):
    """
    Register a SIGINT handler (via `add_sigint_handler`) that marks unfinished tasks as aborted.

    When invoked, the handler sets the status of every `ETLTask` node reachable from
    the `ETLRun` identified by `self.run_uuid` whose status is 'open' or 'running'
    to 'aborted', and logs how many tasks were changed.
    """
    def shutdown_handler(signum, frame):
        self.logger.warning("SIGINT received, waiting for running tasks to abort.")
        with self.context.neo4j.session(self.database) as session:
            # A task appears once for every path that passes through it, so count
            # DISTINCT tasks; a plain count(task) reports more tasks than were updated.
            cnt = session.run("""
                MATCH path=(r:ETLRun {uuid: $runId})-[*]->()
                WITH [task in nodes(path) WHERE task:ETLTask AND task.status IN ['open', 'running'] | task] AS tasks
                UNWIND tasks AS task
                SET task.status = 'aborted'
                RETURN count(DISTINCT task) AS cnt
                """, runId=self.run_uuid
            ).single()['cnt']
            self.logger.info(f"marked {cnt} tasks as aborted.")

    add_sigint_handler(shutdown_handler)
|
|
211
|
+
|
|
193
212
|
|
|
194
213
|
def get_reporter(context) -> ProgressReporter:
|
|
195
214
|
"""
|