neo4j-etl-lib 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,122 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Callable, Union
3
+
4
+ from etl_lib.core.ClosedLoopBatchProcessor import ClosedLoopBatchProcessor
5
+ from etl_lib.core.ETLContext import ETLContext
6
+ from etl_lib.core.ParallelBatchProcessor import ParallelBatchProcessor
7
+ from etl_lib.core.SplittingBatchProcessor import SplittingBatchProcessor, dict_id_extractor
8
+ from etl_lib.core.Task import Task, TaskReturn
9
+ from etl_lib.data_sink.CypherBatchSink import CypherBatchSink
10
+ from etl_lib.data_source.SQLBatchSource import SQLBatchSource
11
+ from sqlalchemy import text
12
+
13
+
14
class ParallelSQLLoad2Neo4jTask(Task, ABC):
    """
    Parallelized version of SQLLoad2Neo4jTask: reads via SQLBatchSource,
    splits into non-overlapping partitions (grid), processes each partition
    in parallel through a CypherBatchSink, and closes the loop.

    Subclasses must implement:
        - _sql_query()
        - _cypher_query()
        - optionally override _count_query() and _id_extractor().

    Control parameters:
        batch_size: max items per partition batch
        table_size: dimension of the splitting grid
        max_workers: parallel threads per partition group (defaults to table_size)
        prefetch: number of partition-groups to prefetch
    """

    def __init__(
            self,
            context: ETLContext,
            batch_size: int = 5000,
            table_size: int = 10,
            max_workers: Union[int, None] = None,
            prefetch: int = 4
    ):
        """
        Store the tuning parameters used to assemble the pipeline.

        Args:
            context: ETL context handed to the base Task and to every
                processor built in run_internal().
            batch_size: batch size requested from the closing processor.
            table_size: forwarded as ``table_size`` to the
                SplittingBatchProcessor.
            max_workers: forwarded to the ParallelBatchProcessor. Any falsy
                value (None or 0) falls back to ``table_size``.
            prefetch: forwarded as ``prefetch`` to the ParallelBatchProcessor.
        """
        super().__init__(context)
        self.context = context
        self.batch_size = batch_size
        self.table_size = table_size
        # default max_workers to table_size for full parallelism
        self.max_workers = max_workers or table_size
        self.prefetch = prefetch

    @abstractmethod
    def _sql_query(self) -> str:
        """
        Return the SQL query to load source rows.
        """
        pass

    @abstractmethod
    def _cypher_query(self) -> str:
        """
        Return the Cypher query to write rows into Neo4j.
        """
        pass

    def _count_query(self) -> Union[str, None]:
        """
        Optional SQL to count source rows for progress reporting.

        Returns:
            None by default, in which case no count is executed and
            ``expected_rows`` is passed as None to the closing processor.
        """
        return None

    def _id_extractor(self) -> Callable:
        """
        Extractor mapping each row item to a (row,col) partition index.
        Default expects dict rows with 'start' and 'end' keys.
        Override to customize.
        """
        return dict_id_extractor()

    def run_internal(self) -> TaskReturn:
        """
        Assemble the source -> splitter -> parallel sink -> closing chain
        and run it.

        Returns:
            A TaskReturn flagged as successful, carrying the statistics of
            the first result produced by the closing processor.
        """
        # total count for ClosedLoopBatchProcessor (None when no count query)
        total_count = self.__get_source_count()

        # source of raw rows
        source = SQLBatchSource(self.context, self, self._sql_query())

        # splitter: non-overlapping partitions as defined by the id_extractor
        splitter = SplittingBatchProcessor(
            context=self.context,
            task=self,
            predecessor=source,
            table_size=self.table_size,
            id_extractor=self._id_extractor()
        )

        # parallel processor: runs CypherBatchSink on each partition concurrently;
        # the factory builds a fresh sink (and re-reads the Cypher query) per call
        parallel = ParallelBatchProcessor(
            context=self.context,
            task=self,
            worker_factory=lambda: CypherBatchSink(context=self.context, task=self, predecessor=None,
                                                   query=self._cypher_query()),
            predecessor=splitter,
            max_workers=self.max_workers,
            prefetch=self.prefetch
        )

        # close loop: drives the pipeline and reports progress
        closing = ClosedLoopBatchProcessor(
            context=self.context,
            task=self,
            predecessor=parallel,
            expected_rows=total_count
        )

        # run once to completion and return aggregated stats
        # NOTE(review): next() raises StopIteration if get_batch() yields
        # nothing - confirm ClosedLoopBatchProcessor always yields one result.
        result = next(closing.get_batch(self.batch_size))
        return TaskReturn(True, result.statistics)

    def __get_source_count(self):
        """
        Execute the optional count query and return its first column.

        Returns:
            None when _count_query() returns None or the query yields no
            row; otherwise the first column of the first result row.
        """
        count_query = self._count_query()
        if count_query is None:
            return None
        with self.context.sql.engine.connect() as conn:
            with conn.begin():
                result = conn.execute(text(count_query))
                row = result.fetchone()
                return row[0] if row else None
@@ -1,8 +1,8 @@
1
1
  from abc import abstractmethod
2
2
 
3
+ from etl_lib.core.ETLContext import ETLContext
3
4
  from sqlalchemy import text
4
5
 
5
- from etl_lib.core import ETLContext
6
6
  from etl_lib.core.ClosedLoopBatchProcessor import ClosedLoopBatchProcessor
7
7
  from etl_lib.core.Task import Task, TaskReturn
8
8
  from etl_lib.data_sink.CypherBatchSink import CypherBatchSink
@@ -7,7 +7,7 @@ from _pytest.tmpdir import tmp_path
7
7
  from neo4j import Driver
8
8
  from neo4j.time import Date
9
9
 
10
- from etl_lib.core.ETLContext import QueryResult, Neo4jContext, ETLContext, SQLContext, gds
10
+ from etl_lib.core.ETLContext import ETLContext, Neo4jContext, QueryResult, SQLContext, gds
11
11
  from etl_lib.core.Task import Task
12
12
 
13
13
 
@@ -96,7 +96,7 @@ def get_database_name():
96
96
  raise Exception("define NEO4J_TEST_DATABASE environment variable")
97
97
 
98
98
 
99
- class TestNeo4jContext(Neo4jContext):
99
+ class MockNeo4jContext(Neo4jContext):
100
100
 
101
101
  def __init__(self, driver: Driver):
102
102
  self.logger = logging.getLogger(self.__class__.__name__)
@@ -105,29 +105,32 @@ class TestNeo4jContext(Neo4jContext):
105
105
  self.gds = gds(self)
106
106
 
107
107
 
108
- class TestETLContext(ETLContext):
108
+ class MockETLContext(ETLContext):
109
109
 
110
110
  def __init__(self, driver: Driver, tmp_path):
111
111
  self.logger = logging.getLogger(self.__class__.__name__)
112
112
  self.__env_vars = {"ETL_ERROR_PATH": tmp_path}
113
- self.neo4j = TestNeo4jContext(driver)
113
+ self.neo4j = MockNeo4jContext(driver)
114
114
  self.reporter = DummyReporter()
115
115
 
116
116
  def env(self, key: str) -> Any:
117
117
  if key in self.__env_vars:
118
118
  return self.__env_vars[key]
119
119
 
120
- class TestSQLETLContext(ETLContext):
120
+
121
+ class MockSQLETLContext(ETLContext):
121
122
 
122
123
  def __init__(self, sql_uri):
123
124
  self.logger = logging.getLogger(self.__class__.__name__)
124
125
  self.reporter = DummyReporter()
126
+ self.__env_vars = {}
125
127
  self.sql = SQLContext(sql_uri)
126
128
 
127
129
  def env(self, key: str) -> Any:
128
130
  if key in self.__env_vars:
129
131
  return self.__env_vars[key]
130
132
 
133
+
131
134
  class DummyReporter:
132
135
 
133
136
  def register_tasks(self, main: Task):
@@ -163,6 +166,7 @@ class DummyContext:
163
166
  def env(self, key: str) -> Any:
164
167
  pass
165
168
 
169
+
166
170
  class DummyPredecessor:
167
171
  def __init__(self, batches):
168
172
  self.batches = batches
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: neo4j-etl-lib
3
- Version: 0.2.0
3
+ Version: 0.3.1
4
4
  Summary: Building blocks for ETL pipelines.
5
5
  Keywords: etl,graph,database
6
6
  Author-email: Bert Radke <bert.radke@pm.me>
@@ -14,11 +14,11 @@ Classifier: Programming Language :: Python :: 3
14
14
  Classifier: Topic :: Database
15
15
  Classifier: Development Status :: 4 - Beta
16
16
  License-File: LICENSE
17
- Requires-Dist: pydantic>=2.10.5; python_version >= '3.8'
18
- Requires-Dist: neo4j>=5.27.0; python_version >= '3.7'
19
- Requires-Dist: python-dotenv>=1.0.1; python_version >= '3.8'
20
- Requires-Dist: tabulate>=0.9.0; python_version >= '3.7'
21
- Requires-Dist: click>=8.1.8; python_version >= '3.7'
17
+ Requires-Dist: pydantic>=2.10.5; python_version >= '3.10'
18
+ Requires-Dist: neo4j-rust-ext>=5.27.0,<6; python_version >= '3.10'
19
+ Requires-Dist: python-dotenv>=1.0.1; python_version >= '3.10'
20
+ Requires-Dist: tabulate>=0.9.0; python_version >= '3.10'
21
+ Requires-Dist: click>=8.1.8; python_version >= '3.10'
22
22
  Requires-Dist: pydantic[email-validator]
23
23
  Requires-Dist: pytest>=8.3.0 ; extra == "dev" and ( python_version >= '3.8')
24
24
  Requires-Dist: testcontainers[neo4j]==4.9.0 ; extra == "dev" and ( python_version >= '3.9' and python_version < '4.0')
@@ -35,11 +35,13 @@ Requires-Dist: sphinx-autoapi ; extra == "dev"
35
35
  Requires-Dist: sqlalchemy ; extra == "dev"
36
36
  Requires-Dist: psycopg2-binary ; extra == "dev"
37
37
  Requires-Dist: graphdatascience>=1.13 ; extra == "gds" and ( python_version >= '3.9')
38
+ Requires-Dist: nox>=2024.0.0 ; extra == "nox"
38
39
  Requires-Dist: sqlalchemy ; extra == "sql"
39
40
  Project-URL: Documentation, https://neo-technology-field.github.io/python-etl-lib/index.html
40
41
  Project-URL: Home, https://github.com/neo-technology-field/python-etl-lib
41
42
  Provides-Extra: dev
42
43
  Provides-Extra: gds
44
+ Provides-Extra: nox
43
45
  Provides-Extra: sql
44
46
 
45
47
  # Neo4j ETL Toolbox
@@ -0,0 +1,36 @@
1
+ etl_lib/__init__.py,sha256=x6coFV38ytJ_wPhR3c0UEzX65oTz2ouKwygkC_tyRLM,65
2
+ etl_lib/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ etl_lib/cli/run_tools.py,sha256=KIar-y22P4kKm-yoJjecYsPwqC7U76M71dEgFO5-ZBo,8561
4
+ etl_lib/core/BatchProcessor.py,sha256=mRpdxZ6ZMKI8XsY3TPuy4dVcvRqLKCO-p63KeOhFyKE,3417
5
+ etl_lib/core/ClosedLoopBatchProcessor.py,sha256=WzML1nldhZRbP8fhlD6utuK5SBYRl1cJgEobVDIdBP4,1626
6
+ etl_lib/core/ETLContext.py,sha256=wmEnbs3n_80B6La9Py_-MHG8BN0FajE9MjGPej0A3To,8045
7
+ etl_lib/core/ParallelBatchProcessor.py,sha256=jNo1Xv1Ts34UZIseoQLDZOhHOVeEr8dUibKUt0FJ4Hw,7318
8
+ etl_lib/core/ProgressReporter.py,sha256=tkE-W6qlR25nU8nUoECcxZDnjnG8AtQH9s9s5WBh_-Q,9377
9
+ etl_lib/core/SplittingBatchProcessor.py,sha256=OIRMUVFpUoZc0w__JJjUr7B9QC3sBlqQp41xghrQzC0,11616
10
+ etl_lib/core/Task.py,sha256=muQFY5qj2n-ZVV8F6vlHqo2lVSvB3wtGdIgkSXVpOFM,9365
11
+ etl_lib/core/ValidationBatchProcessor.py,sha256=U1M2Qp9Ledt8qFiHAg8zMxE9lLRkBrr51NKs_Y8skK8,3400
12
+ etl_lib/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
+ etl_lib/core/utils.py,sha256=9gezAateAmtWZ5lr6ZWYlTtgrj4-CUKJSGz7g1upnRg,2293
14
+ etl_lib/data_sink/CSVBatchSink.py,sha256=oq4VJwnA4WSyJzdvwstGv73vOEuWmPSfCynhVmxBByU,2204
15
+ etl_lib/data_sink/CypherBatchSink.py,sha256=nBH4bzN1IvdSFcKgiAIrAY5IauB565sdyVrnRc1hg_4,1566
16
+ etl_lib/data_sink/SQLBatchSink.py,sha256=vyGrrxpdmCLUZMI2_W2ORej3FLGbwN9-b2GMYHd-k9g,1451
17
+ etl_lib/data_sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
+ etl_lib/data_source/CSVBatchSource.py,sha256=0q1XdPhAIKw1HcTpnp_F4WxRUzk-24Q8Qd-WeIo5OZ0,2779
19
+ etl_lib/data_source/CypherBatchSource.py,sha256=06WuW11BqYjAXBZqL96Qr9MR8JrcjujDpxXe8cI-SYY,2238
20
+ etl_lib/data_source/SQLBatchSource.py,sha256=O3ZA2GXvo5j_KGwOILzguYZMPY_FJkV5j8FIa3-d9oM,4067
21
+ etl_lib/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
+ etl_lib/task/CreateReportingConstraintsTask.py,sha256=nTcHLBIgXz_h2OQg-SHjQr68bhH974u0MwrtWPnVwng,762
23
+ etl_lib/task/ExecuteCypherTask.py,sha256=thE8YTZzv1abxNhhDcb4p4ke6qmI6kWR4XQ-GrCBBBU,1284
24
+ etl_lib/task/GDSTask.py,sha256=UP_NMvdeQ9ueLUFlREfe0q3XhFHTCaMvXArSpvxZNiQ,1918
25
+ etl_lib/task/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
+ etl_lib/task/data_loading/CSVLoad2Neo4jTask.py,sha256=9XiVdJHpABE-Hx1bsvTKLJWtChc8XMwXeO5RicaHDUo,3873
27
+ etl_lib/task/data_loading/ParallelCSVLoad2Neo4jTask.py,sha256=2xN-5bHV9XgoaJLbbTEBuJFoZHV_CYi_hg6M1HQ-ffA,4030
28
+ etl_lib/task/data_loading/ParallelSQLLoad2Neo4jTask.py,sha256=9xpCW5i8yGnUHyg475Z8rjN2P5NDshJGUEgTU6sm2Bs,4286
29
+ etl_lib/task/data_loading/SQLLoad2Neo4jTask.py,sha256=HR3DcjOUkQN4SbCkgQYzljQCYhOcb-x2-DR5dBdapzU,2953
30
+ etl_lib/task/data_loading/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
+ etl_lib/test_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
32
+ etl_lib/test_utils/utils.py,sha256=CgYOCXcUyndOdRAmGyPLoCIuEik0yzy6FLV2k16cpDM,5673
33
+ neo4j_etl_lib-0.3.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
34
+ neo4j_etl_lib-0.3.1.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
35
+ neo4j_etl_lib-0.3.1.dist-info/METADATA,sha256=Pm921qyxL36Ed_Ppp2cW3OFPxUGMv7IyRTmtba3n96o,2580
36
+ neo4j_etl_lib-0.3.1.dist-info/RECORD,,
@@ -1,32 +0,0 @@
1
- etl_lib/__init__.py,sha256=m8CiP9nDQUZCy4l5Crx09vtYrpAtqWkstIOj_Wlp1K8,65
2
- etl_lib/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- etl_lib/cli/run_tools.py,sha256=KIar-y22P4kKm-yoJjecYsPwqC7U76M71dEgFO5-ZBo,8561
4
- etl_lib/core/BatchProcessor.py,sha256=6quNPE9Dp8hYJDQDTqxQtxbQ3KCmb56Mko34EIsNhyI,3352
5
- etl_lib/core/ClosedLoopBatchProcessor.py,sha256=WzML1nldhZRbP8fhlD6utuK5SBYRl1cJgEobVDIdBP4,1626
6
- etl_lib/core/ETLContext.py,sha256=CHRNfjTmIIrPUp2F7l4MJlTsRLwfuBsNJdF_SEeShvI,6695
7
- etl_lib/core/ProgressReporter.py,sha256=FNmunA8q1vkAcmDF5z8ExH-mTotQMX0fyf2VAz-zI8o,8414
8
- etl_lib/core/Task.py,sha256=3e8iVXSfXaeBecvgTcs2LiIf2JwpKETRFhH4ig6lock,9202
9
- etl_lib/core/ValidationBatchProcessor.py,sha256=EhO6PFQB-4PZgIOTXP4PwkbAl5HRK0zgTeKMseRU5QU,3261
10
- etl_lib/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
- etl_lib/core/utils.py,sha256=wwfyvy78fL6sqHdV0IFqAVyEkp6vo5Yo8gRZua2dulw,816
12
- etl_lib/data_sink/CSVBatchSink.py,sha256=oq4VJwnA4WSyJzdvwstGv73vOEuWmPSfCynhVmxBByU,2204
13
- etl_lib/data_sink/CypherBatchSink.py,sha256=nBH4bzN1IvdSFcKgiAIrAY5IauB565sdyVrnRc1hg_4,1566
14
- etl_lib/data_sink/SQLBatchSink.py,sha256=vyGrrxpdmCLUZMI2_W2ORej3FLGbwN9-b2GMYHd-k9g,1451
15
- etl_lib/data_sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
- etl_lib/data_source/CSVBatchSource.py,sha256=HILkaQAFua1OM7xgSNKY6teXpcJjWUPaS4Aol-GLYL8,2767
17
- etl_lib/data_source/CypherBatchSource.py,sha256=06WuW11BqYjAXBZqL96Qr9MR8JrcjujDpxXe8cI-SYY,2238
18
- etl_lib/data_source/SQLBatchSource.py,sha256=rfIpzMU9IG5xrftZss7W-KpSIINCUdPfJBEXa5QDFts,2323
19
- etl_lib/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
- etl_lib/task/CreateReportingConstraintsTask.py,sha256=nTcHLBIgXz_h2OQg-SHjQr68bhH974u0MwrtWPnVwng,762
21
- etl_lib/task/ExecuteCypherTask.py,sha256=thE8YTZzv1abxNhhDcb4p4ke6qmI6kWR4XQ-GrCBBBU,1284
22
- etl_lib/task/GDSTask.py,sha256=X1E83wYa-N7AXy43WPEqIy77d__z-2wpBjWNhGNXJzA,1781
23
- etl_lib/task/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
- etl_lib/task/data_loading/CSVLoad2Neo4jTask.py,sha256=bD1dWX3guY0f_lFqQP_pzPa03AYveLhjDQzOqkyJVQc,3840
25
- etl_lib/task/data_loading/SQLLoad2Neo4jTask.py,sha256=N8pQFx5vYKJ3UDn6sw7yBmCJao_b5vBOT3tJ8pWVVLk,2942
26
- etl_lib/task/data_loading/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
- etl_lib/test_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
- etl_lib/test_utils/utils.py,sha256=kxWJqdRf1pg-4ByMfrtW3HDbgXIvyVtLndGDVvMCmoI,5641
29
- neo4j_etl_lib-0.2.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
30
- neo4j_etl_lib-0.2.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
31
- neo4j_etl_lib-0.2.0.dist-info/METADATA,sha256=FISyLgs53iHj3Uun7RgGFVtltLus_FiCkU591q9fJ3o,2497
32
- neo4j_etl_lib-0.2.0.dist-info/RECORD,,