neo4j-etl-lib 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,122 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Callable, Union
3
+
4
+ from etl_lib.core.ClosedLoopBatchProcessor import ClosedLoopBatchProcessor
5
+ from etl_lib.core.ETLContext import ETLContext
6
+ from etl_lib.core.ParallelBatchProcessor import ParallelBatchProcessor
7
+ from etl_lib.core.SplittingBatchProcessor import SplittingBatchProcessor, dict_id_extractor
8
+ from etl_lib.core.Task import Task, TaskReturn
9
+ from etl_lib.data_sink.CypherBatchSink import CypherBatchSink
10
+ from etl_lib.data_source.SQLBatchSource import SQLBatchSource
11
+ from sqlalchemy import text
12
+
13
+
14
+ class ParallelSQLLoad2Neo4jTask(Task, ABC):
15
+ """
16
+ Parallelized version of SQLLoad2Neo4jTask: reads via SQLBatchSource,
17
+ splits into non-overlapping partitions (grid), processes each partition
18
+ in parallel through a CypherBatchSink, and closes the loop.
19
+
20
+ Subclasses must implement:
21
+ - _sql_query()
22
+ - _cypher_query()
23
+ - optionally override _count_query() and _id_extractor().
24
+
25
+ Control parameters:
26
+ batch_size: max items per partition batch
27
+ table_size: dimension of the splitting grid
28
+ max_workers: parallel threads per partition group (defaults to table_size)
29
+ prefetch: number of partition-groups to prefetch
30
+ """
31
+
32
+ def __init__(
33
+ self,
34
+ context: ETLContext,
35
+ batch_size: int = 5000,
36
+ table_size: int = 10,
37
+ max_workers: int = None,
38
+ prefetch: int = 4
39
+ ):
40
+ super().__init__(context)
41
+ self.context = context
42
+ self.batch_size = batch_size
43
+ self.table_size = table_size
44
+ # default max_workers to table_size for full parallelism
45
+ self.max_workers = max_workers or table_size
46
+ self.prefetch = prefetch
47
+
48
+ @abstractmethod
49
+ def _sql_query(self) -> str:
50
+ """
51
+ Return the SQL query to load source rows.
52
+ """
53
+ pass
54
+
55
+ @abstractmethod
56
+ def _cypher_query(self) -> str:
57
+ """
58
+ Return the Cypher query to write rows into Neo4j.
59
+ """
60
+ pass
61
+
62
+ def _count_query(self) -> Union[str, None]:
63
+ """
64
+ Optional SQL to count source rows for progress reporting.
65
+ """
66
+ return None
67
+
68
+ def _id_extractor(self) -> Callable:
69
+ """
70
+ Extractor mapping each row item to a (row,col) partition index.
71
+ Default expects dict rows with 'start' and 'end' keys.
72
+ Override to customize.
73
+ """
74
+ return dict_id_extractor()
75
+
76
+ def run_internal(self) -> TaskReturn:
77
+ # total count for ClosedLoopBatchProcessor
78
+ total_count = self.__get_source_count()
79
+ # source of raw rows
80
+ source = SQLBatchSource(self.context, self, self._sql_query())
81
+
82
+ # splitter: non-overlapping partitions as defined by the id_extractor
83
+ splitter = SplittingBatchProcessor(
84
+ context=self.context,
85
+ task=self,
86
+ predecessor=source,
87
+ table_size=self.table_size,
88
+ id_extractor=self._id_extractor()
89
+ )
90
+
91
+ # parallel processor: runs CypherBatchSink on each partition concurrently
92
+ parallel = ParallelBatchProcessor(
93
+ context=self.context,
94
+ task=self,
95
+ worker_factory=lambda: CypherBatchSink(context=self.context, task=self, predecessor=None,
96
+ query=self._cypher_query()),
97
+ predecessor=splitter,
98
+ max_workers=self.max_workers,
99
+ prefetch=self.prefetch
100
+ )
101
+
102
+ # close loop: drives the pipeline and reports progress
103
+ closing = ClosedLoopBatchProcessor(
104
+ context=self.context,
105
+ task=self,
106
+ predecessor=parallel,
107
+ expected_rows=total_count
108
+ )
109
+
110
+ # run once to completion and return aggregated stats
111
+ result = next(closing.get_batch(self.batch_size))
112
+ return TaskReturn(True, result.statistics)
113
+
114
+ def __get_source_count(self):
115
+ count_query = self._count_query()
116
+ if count_query is None:
117
+ return None
118
+ with self.context.sql.engine.connect() as conn:
119
+ with conn.begin():
120
+ result = conn.execute(text(count_query))
121
+ row = result.fetchone()
122
+ return row[0] if row else None
@@ -1,8 +1,8 @@
1
1
  from abc import abstractmethod
2
2
 
3
+ from etl_lib.core.ETLContext import ETLContext
3
4
  from sqlalchemy import text
4
5
 
5
- from etl_lib.core import ETLContext
6
6
  from etl_lib.core.ClosedLoopBatchProcessor import ClosedLoopBatchProcessor
7
7
  from etl_lib.core.Task import Task, TaskReturn
8
8
  from etl_lib.data_sink.CypherBatchSink import CypherBatchSink
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: neo4j-etl-lib
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Building blocks for ETL pipelines.
5
5
  Keywords: etl,graph,database
6
6
  Author-email: Bert Radke <bert.radke@pm.me>
@@ -15,7 +15,7 @@ Classifier: Topic :: Database
15
15
  Classifier: Development Status :: 4 - Beta
16
16
  License-File: LICENSE
17
17
  Requires-Dist: pydantic>=2.10.5; python_version >= '3.8'
18
- Requires-Dist: neo4j>=5.27.0; python_version >= '3.7'
18
+ Requires-Dist: neo4j-rust-ext>=5.27.0; python_version >= '3.7'
19
19
  Requires-Dist: python-dotenv>=1.0.1; python_version >= '3.8'
20
20
  Requires-Dist: tabulate>=0.9.0; python_version >= '3.7'
21
21
  Requires-Dist: click>=8.1.8; python_version >= '3.7'
@@ -1,32 +1,36 @@
1
- etl_lib/__init__.py,sha256=m8CiP9nDQUZCy4l5Crx09vtYrpAtqWkstIOj_Wlp1K8,65
1
+ etl_lib/__init__.py,sha256=FyaTAuElsn3y3j1g15X141PnLTYxPrSUVU_YaVmiyPs,65
2
2
  etl_lib/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  etl_lib/cli/run_tools.py,sha256=KIar-y22P4kKm-yoJjecYsPwqC7U76M71dEgFO5-ZBo,8561
4
- etl_lib/core/BatchProcessor.py,sha256=6quNPE9Dp8hYJDQDTqxQtxbQ3KCmb56Mko34EIsNhyI,3352
4
+ etl_lib/core/BatchProcessor.py,sha256=mRpdxZ6ZMKI8XsY3TPuy4dVcvRqLKCO-p63KeOhFyKE,3417
5
5
  etl_lib/core/ClosedLoopBatchProcessor.py,sha256=WzML1nldhZRbP8fhlD6utuK5SBYRl1cJgEobVDIdBP4,1626
6
- etl_lib/core/ETLContext.py,sha256=CHRNfjTmIIrPUp2F7l4MJlTsRLwfuBsNJdF_SEeShvI,6695
7
- etl_lib/core/ProgressReporter.py,sha256=FNmunA8q1vkAcmDF5z8ExH-mTotQMX0fyf2VAz-zI8o,8414
8
- etl_lib/core/Task.py,sha256=3e8iVXSfXaeBecvgTcs2LiIf2JwpKETRFhH4ig6lock,9202
9
- etl_lib/core/ValidationBatchProcessor.py,sha256=EhO6PFQB-4PZgIOTXP4PwkbAl5HRK0zgTeKMseRU5QU,3261
6
+ etl_lib/core/ETLContext.py,sha256=wmEnbs3n_80B6La9Py_-MHG8BN0FajE9MjGPej0A3To,8045
7
+ etl_lib/core/ParallelBatchProcessor.py,sha256=jNo1Xv1Ts34UZIseoQLDZOhHOVeEr8dUibKUt0FJ4Hw,7318
8
+ etl_lib/core/ProgressReporter.py,sha256=UvWAPCuOrqyUcb5_kosIsCg1dyVQL-tnjgqnzs2cwZA,9372
9
+ etl_lib/core/SplittingBatchProcessor.py,sha256=OIRMUVFpUoZc0w__JJjUr7B9QC3sBlqQp41xghrQzC0,11616
10
+ etl_lib/core/Task.py,sha256=muQFY5qj2n-ZVV8F6vlHqo2lVSvB3wtGdIgkSXVpOFM,9365
11
+ etl_lib/core/ValidationBatchProcessor.py,sha256=U1M2Qp9Ledt8qFiHAg8zMxE9lLRkBrr51NKs_Y8skK8,3400
10
12
  etl_lib/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
- etl_lib/core/utils.py,sha256=wwfyvy78fL6sqHdV0IFqAVyEkp6vo5Yo8gRZua2dulw,816
13
+ etl_lib/core/utils.py,sha256=9gezAateAmtWZ5lr6ZWYlTtgrj4-CUKJSGz7g1upnRg,2293
12
14
  etl_lib/data_sink/CSVBatchSink.py,sha256=oq4VJwnA4WSyJzdvwstGv73vOEuWmPSfCynhVmxBByU,2204
13
15
  etl_lib/data_sink/CypherBatchSink.py,sha256=nBH4bzN1IvdSFcKgiAIrAY5IauB565sdyVrnRc1hg_4,1566
14
16
  etl_lib/data_sink/SQLBatchSink.py,sha256=vyGrrxpdmCLUZMI2_W2ORej3FLGbwN9-b2GMYHd-k9g,1451
15
17
  etl_lib/data_sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
18
  etl_lib/data_source/CSVBatchSource.py,sha256=HILkaQAFua1OM7xgSNKY6teXpcJjWUPaS4Aol-GLYL8,2767
17
19
  etl_lib/data_source/CypherBatchSource.py,sha256=06WuW11BqYjAXBZqL96Qr9MR8JrcjujDpxXe8cI-SYY,2238
18
- etl_lib/data_source/SQLBatchSource.py,sha256=rfIpzMU9IG5xrftZss7W-KpSIINCUdPfJBEXa5QDFts,2323
20
+ etl_lib/data_source/SQLBatchSource.py,sha256=O3ZA2GXvo5j_KGwOILzguYZMPY_FJkV5j8FIa3-d9oM,4067
19
21
  etl_lib/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
22
  etl_lib/task/CreateReportingConstraintsTask.py,sha256=nTcHLBIgXz_h2OQg-SHjQr68bhH974u0MwrtWPnVwng,762
21
23
  etl_lib/task/ExecuteCypherTask.py,sha256=thE8YTZzv1abxNhhDcb4p4ke6qmI6kWR4XQ-GrCBBBU,1284
22
24
  etl_lib/task/GDSTask.py,sha256=X1E83wYa-N7AXy43WPEqIy77d__z-2wpBjWNhGNXJzA,1781
23
25
  etl_lib/task/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
- etl_lib/task/data_loading/CSVLoad2Neo4jTask.py,sha256=bD1dWX3guY0f_lFqQP_pzPa03AYveLhjDQzOqkyJVQc,3840
25
- etl_lib/task/data_loading/SQLLoad2Neo4jTask.py,sha256=N8pQFx5vYKJ3UDn6sw7yBmCJao_b5vBOT3tJ8pWVVLk,2942
26
+ etl_lib/task/data_loading/CSVLoad2Neo4jTask.py,sha256=9XiVdJHpABE-Hx1bsvTKLJWtChc8XMwXeO5RicaHDUo,3873
27
+ etl_lib/task/data_loading/ParallelCSVLoad2Neo4jTask.py,sha256=2xN-5bHV9XgoaJLbbTEBuJFoZHV_CYi_hg6M1HQ-ffA,4030
28
+ etl_lib/task/data_loading/ParallelSQLLoad2Neo4jTask.py,sha256=9xpCW5i8yGnUHyg475Z8rjN2P5NDshJGUEgTU6sm2Bs,4286
29
+ etl_lib/task/data_loading/SQLLoad2Neo4jTask.py,sha256=HR3DcjOUkQN4SbCkgQYzljQCYhOcb-x2-DR5dBdapzU,2953
26
30
  etl_lib/task/data_loading/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
31
  etl_lib/test_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
32
  etl_lib/test_utils/utils.py,sha256=kxWJqdRf1pg-4ByMfrtW3HDbgXIvyVtLndGDVvMCmoI,5641
29
- neo4j_etl_lib-0.2.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
30
- neo4j_etl_lib-0.2.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
31
- neo4j_etl_lib-0.2.0.dist-info/METADATA,sha256=FISyLgs53iHj3Uun7RgGFVtltLus_FiCkU591q9fJ3o,2497
32
- neo4j_etl_lib-0.2.0.dist-info/RECORD,,
33
+ neo4j_etl_lib-0.3.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
34
+ neo4j_etl_lib-0.3.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
35
+ neo4j_etl_lib-0.3.0.dist-info/METADATA,sha256=GJcjdPvmzjEUq0pLndSSVzOg3c7CR6bIWz3sB_9tkVY,2506
36
+ neo4j_etl_lib-0.3.0.dist-info/RECORD,,