neo4j-etl-lib 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etl_lib/__init__.py +1 -1
- etl_lib/core/BatchProcessor.py +7 -7
- etl_lib/core/ETLContext.py +64 -30
- etl_lib/core/ParallelBatchProcessor.py +180 -0
- etl_lib/core/ProgressReporter.py +21 -2
- etl_lib/core/SplittingBatchProcessor.py +268 -0
- etl_lib/core/Task.py +10 -8
- etl_lib/core/ValidationBatchProcessor.py +2 -0
- etl_lib/core/utils.py +52 -11
- etl_lib/data_source/SQLBatchSource.py +79 -25
- etl_lib/task/data_loading/CSVLoad2Neo4jTask.py +1 -1
- etl_lib/task/data_loading/ParallelCSVLoad2Neo4jTask.py +98 -0
- etl_lib/task/data_loading/ParallelSQLLoad2Neo4jTask.py +122 -0
- etl_lib/task/data_loading/SQLLoad2Neo4jTask.py +1 -1
- {neo4j_etl_lib-0.2.0.dist-info → neo4j_etl_lib-0.3.0.dist-info}/METADATA +2 -2
- {neo4j_etl_lib-0.2.0.dist-info → neo4j_etl_lib-0.3.0.dist-info}/RECORD +18 -14
- {neo4j_etl_lib-0.2.0.dist-info → neo4j_etl_lib-0.3.0.dist-info}/WHEEL +0 -0
- {neo4j_etl_lib-0.2.0.dist-info → neo4j_etl_lib-0.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Callable, Union
|
|
3
|
+
|
|
4
|
+
from etl_lib.core.ClosedLoopBatchProcessor import ClosedLoopBatchProcessor
|
|
5
|
+
from etl_lib.core.ETLContext import ETLContext
|
|
6
|
+
from etl_lib.core.ParallelBatchProcessor import ParallelBatchProcessor
|
|
7
|
+
from etl_lib.core.SplittingBatchProcessor import SplittingBatchProcessor, dict_id_extractor
|
|
8
|
+
from etl_lib.core.Task import Task, TaskReturn
|
|
9
|
+
from etl_lib.data_sink.CypherBatchSink import CypherBatchSink
|
|
10
|
+
from etl_lib.data_source.SQLBatchSource import SQLBatchSource
|
|
11
|
+
from sqlalchemy import text
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ParallelSQLLoad2Neo4jTask(Task, ABC):
    """
    Parallelized version of SQLLoad2Neo4jTask: reads via SQLBatchSource,
    splits into non-overlapping partitions (grid), processes each partition
    in parallel through a CypherBatchSink, and closes the loop.

    Subclasses must implement:
      - _sql_query()
      - _cypher_query()
      - optionally override _count_query() and _id_extractor().

    Control parameters:
      batch_size: max items per partition batch
      table_size: dimension of the splitting grid
      max_workers: parallel threads per partition group (defaults to table_size)
      prefetch: number of partition-groups to prefetch
    """

    def __init__(
            self,
            context: ETLContext,
            batch_size: int = 5000,
            table_size: int = 10,
            max_workers: Union[int, None] = None,
            prefetch: int = 4
    ):
        """
        Store the pipeline tuning parameters for a later ``run_internal()`` call.

        Args:
            context: ETL context handed to the base ``Task`` and to every
                processor built in ``run_internal()``.
            batch_size: batch size requested from the closing processor.
            table_size: passed to ``SplittingBatchProcessor`` as ``table_size``.
            max_workers: passed to ``ParallelBatchProcessor``; any falsy value
                (``None`` or ``0``) falls back to ``table_size``.
            prefetch: passed to ``ParallelBatchProcessor`` as ``prefetch``.
        """
        super().__init__(context)
        self.context = context
        self.batch_size = batch_size
        self.table_size = table_size
        # default max_workers to table_size for full parallelism
        self.max_workers = max_workers or table_size
        self.prefetch = prefetch

    @abstractmethod
    def _sql_query(self) -> str:
        """
        Return the SQL query to load source rows.
        """
        pass

    @abstractmethod
    def _cypher_query(self) -> str:
        """
        Return the Cypher query to write rows into Neo4j.
        """
        pass

    def _count_query(self) -> Union[str, None]:
        """
        Optional SQL to count source rows for progress reporting.

        The default returns ``None``, in which case no count query is executed
        and ``None`` is passed on as the expected row count.
        """
        return None

    def _id_extractor(self) -> Callable:
        """
        Extractor mapping each row item to a (row,col) partition index.
        Default expects dict rows with 'start' and 'end' keys.
        Override to customize.
        """
        return dict_id_extractor()

    def run_internal(self) -> TaskReturn:
        """
        Build the source -> splitter -> parallel sink -> closing chain and run it.

        Returns:
            A ``TaskReturn`` flagged successful that carries the statistics of
            the first result produced by the closing processor.
        """
        # total count for ClosedLoopBatchProcessor
        total_count = self.__get_source_count()
        # source of raw rows
        source = SQLBatchSource(self.context, self, self._sql_query())

        # splitter: non-overlapping partitions as defined by the id_extractor
        splitter = SplittingBatchProcessor(
            context=self.context,
            task=self,
            predecessor=source,
            table_size=self.table_size,
            id_extractor=self._id_extractor()
        )

        # parallel processor: runs CypherBatchSink on each partition concurrently
        parallel = ParallelBatchProcessor(
            context=self.context,
            task=self,
            # the factory is invoked by the parallel processor; each call
            # builds a fresh sink and re-evaluates _cypher_query()
            worker_factory=lambda: CypherBatchSink(context=self.context, task=self, predecessor=None,
                                                   query=self._cypher_query()),
            predecessor=splitter,
            max_workers=self.max_workers,
            prefetch=self.prefetch
        )

        # close loop: drives the pipeline and reports progress
        closing = ClosedLoopBatchProcessor(
            context=self.context,
            task=self,
            predecessor=parallel,
            expected_rows=total_count
        )

        # run once to completion and return aggregated stats
        result = next(closing.get_batch(self.batch_size))
        return TaskReturn(True, result.statistics)

    def __get_source_count(self):
        """
        Run the optional count query and return its first column.

        Returns:
            ``None`` when ``_count_query()`` returns ``None`` or the query
            yields no row; otherwise the first column of the first row.
        """
        count_query = self._count_query()
        if count_query is None:
            return None
        with self.context.sql.engine.connect() as conn:
            with conn.begin():
                result = conn.execute(text(count_query))
                row = result.fetchone()
                return row[0] if row else None
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
from abc import abstractmethod
|
|
2
2
|
|
|
3
|
+
from etl_lib.core.ETLContext import ETLContext
|
|
3
4
|
from sqlalchemy import text
|
|
4
5
|
|
|
5
|
-
from etl_lib.core import ETLContext
|
|
6
6
|
from etl_lib.core.ClosedLoopBatchProcessor import ClosedLoopBatchProcessor
|
|
7
7
|
from etl_lib.core.Task import Task, TaskReturn
|
|
8
8
|
from etl_lib.data_sink.CypherBatchSink import CypherBatchSink
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: neo4j-etl-lib
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Building blocks for ETL pipelines.
|
|
5
5
|
Keywords: etl,graph,database
|
|
6
6
|
Author-email: Bert Radke <bert.radke@pm.me>
|
|
@@ -15,7 +15,7 @@ Classifier: Topic :: Database
|
|
|
15
15
|
Classifier: Development Status :: 4 - Beta
|
|
16
16
|
License-File: LICENSE
|
|
17
17
|
Requires-Dist: pydantic>=2.10.5; python_version >= '3.8'
|
|
18
|
-
Requires-Dist: neo4j>=5.27.0; python_version >= '3.7'
|
|
18
|
+
Requires-Dist: neo4j-rust-ext>=5.27.0; python_version >= '3.7'
|
|
19
19
|
Requires-Dist: python-dotenv>=1.0.1; python_version >= '3.8'
|
|
20
20
|
Requires-Dist: tabulate>=0.9.0; python_version >= '3.7'
|
|
21
21
|
Requires-Dist: click>=8.1.8; python_version >= '3.7'
|
|
@@ -1,32 +1,36 @@
|
|
|
1
|
-
etl_lib/__init__.py,sha256=
|
|
1
|
+
etl_lib/__init__.py,sha256=FyaTAuElsn3y3j1g15X141PnLTYxPrSUVU_YaVmiyPs,65
|
|
2
2
|
etl_lib/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
3
|
etl_lib/cli/run_tools.py,sha256=KIar-y22P4kKm-yoJjecYsPwqC7U76M71dEgFO5-ZBo,8561
|
|
4
|
-
etl_lib/core/BatchProcessor.py,sha256=
|
|
4
|
+
etl_lib/core/BatchProcessor.py,sha256=mRpdxZ6ZMKI8XsY3TPuy4dVcvRqLKCO-p63KeOhFyKE,3417
|
|
5
5
|
etl_lib/core/ClosedLoopBatchProcessor.py,sha256=WzML1nldhZRbP8fhlD6utuK5SBYRl1cJgEobVDIdBP4,1626
|
|
6
|
-
etl_lib/core/ETLContext.py,sha256=
|
|
7
|
-
etl_lib/core/
|
|
8
|
-
etl_lib/core/
|
|
9
|
-
etl_lib/core/
|
|
6
|
+
etl_lib/core/ETLContext.py,sha256=wmEnbs3n_80B6La9Py_-MHG8BN0FajE9MjGPej0A3To,8045
|
|
7
|
+
etl_lib/core/ParallelBatchProcessor.py,sha256=jNo1Xv1Ts34UZIseoQLDZOhHOVeEr8dUibKUt0FJ4Hw,7318
|
|
8
|
+
etl_lib/core/ProgressReporter.py,sha256=UvWAPCuOrqyUcb5_kosIsCg1dyVQL-tnjgqnzs2cwZA,9372
|
|
9
|
+
etl_lib/core/SplittingBatchProcessor.py,sha256=OIRMUVFpUoZc0w__JJjUr7B9QC3sBlqQp41xghrQzC0,11616
|
|
10
|
+
etl_lib/core/Task.py,sha256=muQFY5qj2n-ZVV8F6vlHqo2lVSvB3wtGdIgkSXVpOFM,9365
|
|
11
|
+
etl_lib/core/ValidationBatchProcessor.py,sha256=U1M2Qp9Ledt8qFiHAg8zMxE9lLRkBrr51NKs_Y8skK8,3400
|
|
10
12
|
etl_lib/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
-
etl_lib/core/utils.py,sha256=
|
|
13
|
+
etl_lib/core/utils.py,sha256=9gezAateAmtWZ5lr6ZWYlTtgrj4-CUKJSGz7g1upnRg,2293
|
|
12
14
|
etl_lib/data_sink/CSVBatchSink.py,sha256=oq4VJwnA4WSyJzdvwstGv73vOEuWmPSfCynhVmxBByU,2204
|
|
13
15
|
etl_lib/data_sink/CypherBatchSink.py,sha256=nBH4bzN1IvdSFcKgiAIrAY5IauB565sdyVrnRc1hg_4,1566
|
|
14
16
|
etl_lib/data_sink/SQLBatchSink.py,sha256=vyGrrxpdmCLUZMI2_W2ORej3FLGbwN9-b2GMYHd-k9g,1451
|
|
15
17
|
etl_lib/data_sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
18
|
etl_lib/data_source/CSVBatchSource.py,sha256=HILkaQAFua1OM7xgSNKY6teXpcJjWUPaS4Aol-GLYL8,2767
|
|
17
19
|
etl_lib/data_source/CypherBatchSource.py,sha256=06WuW11BqYjAXBZqL96Qr9MR8JrcjujDpxXe8cI-SYY,2238
|
|
18
|
-
etl_lib/data_source/SQLBatchSource.py,sha256=
|
|
20
|
+
etl_lib/data_source/SQLBatchSource.py,sha256=O3ZA2GXvo5j_KGwOILzguYZMPY_FJkV5j8FIa3-d9oM,4067
|
|
19
21
|
etl_lib/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
20
22
|
etl_lib/task/CreateReportingConstraintsTask.py,sha256=nTcHLBIgXz_h2OQg-SHjQr68bhH974u0MwrtWPnVwng,762
|
|
21
23
|
etl_lib/task/ExecuteCypherTask.py,sha256=thE8YTZzv1abxNhhDcb4p4ke6qmI6kWR4XQ-GrCBBBU,1284
|
|
22
24
|
etl_lib/task/GDSTask.py,sha256=X1E83wYa-N7AXy43WPEqIy77d__z-2wpBjWNhGNXJzA,1781
|
|
23
25
|
etl_lib/task/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
|
-
etl_lib/task/data_loading/CSVLoad2Neo4jTask.py,sha256=
|
|
25
|
-
etl_lib/task/data_loading/
|
|
26
|
+
etl_lib/task/data_loading/CSVLoad2Neo4jTask.py,sha256=9XiVdJHpABE-Hx1bsvTKLJWtChc8XMwXeO5RicaHDUo,3873
|
|
27
|
+
etl_lib/task/data_loading/ParallelCSVLoad2Neo4jTask.py,sha256=2xN-5bHV9XgoaJLbbTEBuJFoZHV_CYi_hg6M1HQ-ffA,4030
|
|
28
|
+
etl_lib/task/data_loading/ParallelSQLLoad2Neo4jTask.py,sha256=9xpCW5i8yGnUHyg475Z8rjN2P5NDshJGUEgTU6sm2Bs,4286
|
|
29
|
+
etl_lib/task/data_loading/SQLLoad2Neo4jTask.py,sha256=HR3DcjOUkQN4SbCkgQYzljQCYhOcb-x2-DR5dBdapzU,2953
|
|
26
30
|
etl_lib/task/data_loading/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
27
31
|
etl_lib/test_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
28
32
|
etl_lib/test_utils/utils.py,sha256=kxWJqdRf1pg-4ByMfrtW3HDbgXIvyVtLndGDVvMCmoI,5641
|
|
29
|
-
neo4j_etl_lib-0.
|
|
30
|
-
neo4j_etl_lib-0.
|
|
31
|
-
neo4j_etl_lib-0.
|
|
32
|
-
neo4j_etl_lib-0.
|
|
33
|
+
neo4j_etl_lib-0.3.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
34
|
+
neo4j_etl_lib-0.3.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
|
|
35
|
+
neo4j_etl_lib-0.3.0.dist-info/METADATA,sha256=GJcjdPvmzjEUq0pLndSSVzOg3c7CR6bIWz3sB_9tkVY,2506
|
|
36
|
+
neo4j_etl_lib-0.3.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|