neo4j-etl-lib 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etl_lib/__init__.py +1 -1
- etl_lib/cli/run_tools.py +1 -1
- etl_lib/core/BatchProcessor.py +7 -7
- etl_lib/core/ClosedLoopBatchProcessor.py +8 -2
- etl_lib/core/ETLContext.py +112 -46
- etl_lib/core/ParallelBatchProcessor.py +180 -0
- etl_lib/core/ProgressReporter.py +23 -4
- etl_lib/core/SplittingBatchProcessor.py +268 -0
- etl_lib/core/Task.py +10 -8
- etl_lib/core/ValidationBatchProcessor.py +2 -0
- etl_lib/core/utils.py +52 -11
- etl_lib/data_sink/CypherBatchSink.py +4 -3
- etl_lib/data_sink/SQLBatchSink.py +36 -0
- etl_lib/data_source/SQLBatchSource.py +114 -0
- etl_lib/task/CreateReportingConstraintsTask.py +2 -2
- etl_lib/task/data_loading/CSVLoad2Neo4jTask.py +42 -6
- etl_lib/task/data_loading/ParallelCSVLoad2Neo4jTask.py +98 -0
- etl_lib/task/data_loading/ParallelSQLLoad2Neo4jTask.py +122 -0
- etl_lib/task/data_loading/SQLLoad2Neo4jTask.py +90 -0
- etl_lib/test_utils/utils.py +19 -1
- {neo4j_etl_lib-0.1.1.dist-info → neo4j_etl_lib-0.3.0.dist-info}/METADATA +14 -3
- neo4j_etl_lib-0.3.0.dist-info/RECORD +36 -0
- {neo4j_etl_lib-0.1.1.dist-info → neo4j_etl_lib-0.3.0.dist-info}/WHEEL +1 -1
- neo4j_etl_lib-0.1.1.dist-info/RECORD +0 -29
- {neo4j_etl_lib-0.1.1.dist-info → neo4j_etl_lib-0.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Type
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from etl_lib.core.ClosedLoopBatchProcessor import ClosedLoopBatchProcessor
|
|
7
|
+
from etl_lib.core.ETLContext import ETLContext
|
|
8
|
+
from etl_lib.core.ParallelBatchProcessor import ParallelBatchProcessor
|
|
9
|
+
from etl_lib.core.SplittingBatchProcessor import SplittingBatchProcessor, dict_id_extractor
|
|
10
|
+
from etl_lib.core.Task import Task, TaskReturn
|
|
11
|
+
from etl_lib.core.ValidationBatchProcessor import ValidationBatchProcessor
|
|
12
|
+
from etl_lib.data_sink.CypherBatchSink import CypherBatchSink
|
|
13
|
+
from etl_lib.data_source.CSVBatchSource import CSVBatchSource
|
|
14
|
+
from pydantic import BaseModel
|
|
15
|
+
|
|
16
|
+
class ParallelCSVLoad2Neo4jTask(Task, abc.ABC):
    """
    Parallel CSV → Neo4j load using the mix-and-batch strategy.

    Wires a CSV reader, optional Pydantic validation, a diagonal splitter
    (to avoid overlapping node locks), and a Cypher sink. Rows are
    distributed into (row, col) partitions and processed in non-overlapping groups.

    Args:
        context: Shared ETL context.
        file: CSV file to load.
        model: Optional Pydantic model for row validation; invalid rows go to `error_file`.
        error_file: Output for invalid rows. Required when `model` is set.
        table_size: Bucketing grid size for the splitter.
        batch_size: Per-cell target batch size from the splitter.
        max_workers: Worker threads per wave. Defaults to `table_size`.
        prefetch: Number of waves to prefetch from the splitter.
        **csv_reader_kwargs: Forwarded to :py:class:`etl_lib.data_source.CSVBatchSource.CSVBatchSource`.

    Returns:
        :py:class:`~etl_lib.core.Task.TaskReturn` with merged validation and Neo4j counters.

    Notes:
        - `_query()` must return Cypher that starts with ``UNWIND $batch AS row``.
        - Override `_id_extractor()` if your CSV schema doesn't expose ``start``/``end``; the default uses
          :py:func:`etl_lib.core.SplittingBatchProcessor.dict_id_extractor`.
        - See the nyc-taxi example for a working subclass.
    """

    def __init__(self,
                 context: ETLContext,
                 file: Path,
                 model: Type[BaseModel] | None = None,
                 error_file: Path | None = None,
                 table_size: int = 10,
                 batch_size: int = 5000,
                 max_workers: int | None = None,
                 prefetch: int = 4,
                 **csv_reader_kwargs):
        super().__init__(context)
        self.file = file
        self.model = model
        if model is not None and error_file is None:
            raise ValueError('you must provide error file if the model is specified')
        self.error_file = error_file
        self.table_size = table_size
        self.batch_size = batch_size
        # Default to one worker per grid dimension for full parallelism.
        self.max_workers = max_workers or table_size
        self.prefetch = prefetch
        self.csv_reader_kwargs = csv_reader_kwargs

    def run_internal(self) -> TaskReturn:
        """Assemble and drive the CSV → (validate) → split → parallel sink pipeline."""
        csv = CSVBatchSource(self.file, self.context, self, **self.csv_reader_kwargs)
        predecessor = csv
        if self.model is not None:
            predecessor = ValidationBatchProcessor(self.context, self, csv, self.model, self.error_file)

        splitter = SplittingBatchProcessor(
            context=self.context,
            task=self,
            predecessor=predecessor,
            table_size=self.table_size,
            id_extractor=self._id_extractor()
        )

        parallel = ParallelBatchProcessor(
            context=self.context,
            task=self,
            predecessor=splitter,
            # Each worker gets its own sink; the sink's predecessor is None because
            # the ParallelBatchProcessor hands batches to the workers directly.
            worker_factory=lambda: CypherBatchSink(self.context, self, None, self._query()),
            max_workers=self.max_workers,
            prefetch=self.prefetch
        )

        closing = ClosedLoopBatchProcessor(self.context, self, parallel)
        result = next(closing.get_batch(self.batch_size))
        return TaskReturn(True, result.statistics)

    def _id_extractor(self):
        """Partition-index extractor; override when rows lack ``start``/``end`` keys."""
        return dict_id_extractor()

    @abc.abstractmethod
    def _query(self):
        """Return the Cypher statement; it must start with ``UNWIND $batch AS row``."""
        pass
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Callable, Union
|
|
3
|
+
|
|
4
|
+
from etl_lib.core.ClosedLoopBatchProcessor import ClosedLoopBatchProcessor
|
|
5
|
+
from etl_lib.core.ETLContext import ETLContext
|
|
6
|
+
from etl_lib.core.ParallelBatchProcessor import ParallelBatchProcessor
|
|
7
|
+
from etl_lib.core.SplittingBatchProcessor import SplittingBatchProcessor, dict_id_extractor
|
|
8
|
+
from etl_lib.core.Task import Task, TaskReturn
|
|
9
|
+
from etl_lib.data_sink.CypherBatchSink import CypherBatchSink
|
|
10
|
+
from etl_lib.data_source.SQLBatchSource import SQLBatchSource
|
|
11
|
+
from sqlalchemy import text
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ParallelSQLLoad2Neo4jTask(Task, ABC):
    """
    Parallelized version of SQLLoad2Neo4jTask: reads via SQLBatchSource,
    splits into non-overlapping partitions (grid), processes each partition
    in parallel through a CypherBatchSink, and closes the loop.

    Subclasses must implement:
        - _sql_query()
        - _cypher_query()
        - optionally override _count_query() and _id_extractor().

    Control parameters:
        batch_size: max items per partition batch
        table_size: dimension of the splitting grid
        max_workers: parallel threads per partition group (defaults to table_size)
        prefetch: number of partition-groups to prefetch
    """

    def __init__(
            self,
            context: ETLContext,
            batch_size: int = 5000,
            table_size: int = 10,
            max_workers: int | None = None,
            prefetch: int = 4
    ):
        super().__init__(context)
        self.context = context
        self.batch_size = batch_size
        self.table_size = table_size
        # default max_workers to table_size for full parallelism
        self.max_workers = max_workers or table_size
        self.prefetch = prefetch

    @abstractmethod
    def _sql_query(self) -> str:
        """
        Return the SQL query to load source rows.
        """
        pass

    @abstractmethod
    def _cypher_query(self) -> str:
        """
        Return the Cypher query to write rows into Neo4j.
        """
        pass

    def _count_query(self) -> Union[str, None]:
        """
        Optional SQL to count source rows for progress reporting.
        """
        return None

    def _id_extractor(self) -> Callable:
        """
        Extractor mapping each row item to a (row,col) partition index.
        Default expects dict rows with 'start' and 'end' keys.
        Override to customize.
        """
        return dict_id_extractor()

    def run_internal(self) -> TaskReturn:
        """Assemble and drive the SQL → split → parallel Cypher sink pipeline."""
        # total count for ClosedLoopBatchProcessor
        total_count = self.__get_source_count()
        # source of raw rows
        source = SQLBatchSource(self.context, self, self._sql_query())

        # splitter: non-overlapping partitions as defined by the id_extractor
        splitter = SplittingBatchProcessor(
            context=self.context,
            task=self,
            predecessor=source,
            table_size=self.table_size,
            id_extractor=self._id_extractor()
        )

        # parallel processor: runs CypherBatchSink on each partition concurrently
        parallel = ParallelBatchProcessor(
            context=self.context,
            task=self,
            worker_factory=lambda: CypherBatchSink(context=self.context, task=self, predecessor=None,
                                                   query=self._cypher_query()),
            predecessor=splitter,
            max_workers=self.max_workers,
            prefetch=self.prefetch
        )

        # close loop: drives the pipeline and reports progress
        closing = ClosedLoopBatchProcessor(
            context=self.context,
            task=self,
            predecessor=parallel,
            expected_rows=total_count
        )

        # run once to completion and return aggregated stats
        result = next(closing.get_batch(self.batch_size))
        return TaskReturn(True, result.statistics)

    def __get_source_count(self):
        """Run `_count_query()` (if any) and return the scalar row count, else None."""
        count_query = self._count_query()
        if count_query is None:
            return None
        with self.context.sql.engine.connect() as conn:
            with conn.begin():
                result = conn.execute(text(count_query))
                row = result.fetchone()
                return row[0] if row else None
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod

from sqlalchemy import text

from etl_lib.core.ClosedLoopBatchProcessor import ClosedLoopBatchProcessor
from etl_lib.core.ETLContext import ETLContext
from etl_lib.core.Task import Task, TaskReturn
from etl_lib.data_sink.CypherBatchSink import CypherBatchSink
from etl_lib.data_source.SQLBatchSource import SQLBatchSource
|
|
11
|
+
|
|
12
|
+
class SQLLoad2Neo4jTask(Task):
|
|
13
|
+
'''
|
|
14
|
+
Load the output of the specified SQL query to Neo4j.
|
|
15
|
+
|
|
16
|
+
Uses BatchProcessors to read and write data.
|
|
17
|
+
Subclasses must implement the methods returning the SQL and Cypher queries.
|
|
18
|
+
|
|
19
|
+
Example usage: (from the MusicBrainz example)
|
|
20
|
+
|
|
21
|
+
.. code-block:: python
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class LoadArtistCreditTask(SQLLoad2Neo4jTask):
|
|
25
|
+
def _sql_query(self) -> str:
|
|
26
|
+
return """
|
|
27
|
+
SELECT ac.id AS artist_credit_id, ac.name AS credit_name
|
|
28
|
+
FROM artist_credit ac;
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
def _cypher_query(self) -> str:
|
|
32
|
+
return """
|
|
33
|
+
UNWIND $batch AS row
|
|
34
|
+
MERGE (ac:ArtistCredit {id: row.artist_credit_id})
|
|
35
|
+
SET ac.name = row.credit_name
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
def _count_query(self) -> str | None:
|
|
39
|
+
return "SELECT COUNT(*) FROM artist_credit;"
|
|
40
|
+
|
|
41
|
+
'''
|
|
42
|
+
|
|
43
|
+
def __init__(self, context: ETLContext, batch_size: int = 5000):
|
|
44
|
+
super().__init__(context)
|
|
45
|
+
self.context = context
|
|
46
|
+
self.batch_size = batch_size
|
|
47
|
+
|
|
48
|
+
@abstractmethod
|
|
49
|
+
def _sql_query(self) -> str:
|
|
50
|
+
"""
|
|
51
|
+
Return the SQL query to load the source data.
|
|
52
|
+
"""
|
|
53
|
+
pass
|
|
54
|
+
|
|
55
|
+
@abstractmethod
|
|
56
|
+
def _cypher_query(self) -> str:
|
|
57
|
+
"""
|
|
58
|
+
Return the Cypher query to write the data in batches to Neo4j.
|
|
59
|
+
"""
|
|
60
|
+
pass
|
|
61
|
+
|
|
62
|
+
def _count_query(self) -> str | None:
|
|
63
|
+
"""
|
|
64
|
+
Return the SQL query to count the number of rows returned from :func:`_sql_query`.
|
|
65
|
+
|
|
66
|
+
Optional. If provided, it will run once at the beginning of the task and
|
|
67
|
+
provide the :class:`etl_lib.core.ClosedLoopBatchProcessor` with the total number of rows.
|
|
68
|
+
"""
|
|
69
|
+
return None
|
|
70
|
+
|
|
71
|
+
def run_internal(self) -> TaskReturn:
|
|
72
|
+
total_count = self.__get_source_count()
|
|
73
|
+
source = SQLBatchSource(self.context, self, self._sql_query())
|
|
74
|
+
sink = CypherBatchSink(self.context, self, source, self._cypher_query())
|
|
75
|
+
|
|
76
|
+
end = ClosedLoopBatchProcessor(self.context, self, sink, total_count)
|
|
77
|
+
|
|
78
|
+
result = next(end.get_batch(self.batch_size))
|
|
79
|
+
return TaskReturn(True, result.statistics)
|
|
80
|
+
|
|
81
|
+
def __get_source_count(self):
|
|
82
|
+
count_query = self._count_query()
|
|
83
|
+
if count_query is None:
|
|
84
|
+
return None
|
|
85
|
+
|
|
86
|
+
with self.context.sql.engine.connect() as conn:
|
|
87
|
+
with conn.begin():
|
|
88
|
+
result = conn.execute(text(count_query))
|
|
89
|
+
row = result.fetchone()
|
|
90
|
+
return row[0] if row else None
|
etl_lib/test_utils/utils.py
CHANGED
|
@@ -7,7 +7,7 @@ from _pytest.tmpdir import tmp_path
|
|
|
7
7
|
from neo4j import Driver
|
|
8
8
|
from neo4j.time import Date
|
|
9
9
|
|
|
10
|
-
from etl_lib.core.ETLContext import QueryResult, Neo4jContext, ETLContext
|
|
10
|
+
from etl_lib.core.ETLContext import QueryResult, Neo4jContext, ETLContext, SQLContext, gds
|
|
11
11
|
from etl_lib.core.Task import Task
|
|
12
12
|
|
|
13
13
|
|
|
@@ -102,6 +102,7 @@ class TestNeo4jContext(Neo4jContext):
|
|
|
102
102
|
self.logger = logging.getLogger(self.__class__.__name__)
|
|
103
103
|
self.driver = driver
|
|
104
104
|
self.database = get_database_name()
|
|
105
|
+
self.gds = gds(self)
|
|
105
106
|
|
|
106
107
|
|
|
107
108
|
class TestETLContext(ETLContext):
|
|
@@ -116,6 +117,16 @@ class TestETLContext(ETLContext):
|
|
|
116
117
|
if key in self.__env_vars:
|
|
117
118
|
return self.__env_vars[key]
|
|
118
119
|
|
|
120
|
+
class TestSQLETLContext(ETLContext):
    """ETL context for tests that only need a SQL connection (no Neo4j)."""

    def __init__(self, sql_uri):
        self.logger = logging.getLogger(self.__class__.__name__)
        self.reporter = DummyReporter()
        self.sql = SQLContext(sql_uri)
        # Fix: env() reads the name-mangled self.__env_vars, which was never
        # initialized here (copy of TestETLContext) and raised AttributeError.
        self.__env_vars = {}

    def env(self, key: str) -> Any:
        if key in self.__env_vars:
            return self.__env_vars[key]
|
|
119
130
|
|
|
120
131
|
class DummyReporter:
|
|
121
132
|
|
|
@@ -151,3 +162,10 @@ class DummyContext:
|
|
|
151
162
|
|
|
152
163
|
def env(self, key: str) -> Any:
|
|
153
164
|
pass
|
|
165
|
+
|
|
166
|
+
class DummyPredecessor:
    """Minimal stand-in for a batch-processor predecessor.

    Replays a fixed sequence of batches, ignoring the requested batch size.
    """

    def __init__(self, batches):
        # Pre-canned batches returned verbatim by get_batch().
        self.batches = batches

    def get_batch(self, batch_size):
        """Yield each stored batch in order; `batch_size` is ignored."""
        for batch in self.batches:
            yield batch
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: neo4j-etl-lib
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Building blocks for ETL pipelines.
|
|
5
5
|
Keywords: etl,graph,database
|
|
6
6
|
Author-email: Bert Radke <bert.radke@pm.me>
|
|
@@ -15,10 +15,11 @@ Classifier: Topic :: Database
|
|
|
15
15
|
Classifier: Development Status :: 4 - Beta
|
|
16
16
|
License-File: LICENSE
|
|
17
17
|
Requires-Dist: pydantic>=2.10.5; python_version >= '3.8'
|
|
18
|
-
Requires-Dist: neo4j>=5.27.0; python_version >= '3.7'
|
|
18
|
+
Requires-Dist: neo4j-rust-ext>=5.27.0; python_version >= '3.7'
|
|
19
19
|
Requires-Dist: python-dotenv>=1.0.1; python_version >= '3.8'
|
|
20
20
|
Requires-Dist: tabulate>=0.9.0; python_version >= '3.7'
|
|
21
21
|
Requires-Dist: click>=8.1.8; python_version >= '3.7'
|
|
22
|
+
Requires-Dist: pydantic[email-validator]
|
|
22
23
|
Requires-Dist: pytest>=8.3.0 ; extra == "dev" and ( python_version >= '3.8')
|
|
23
24
|
Requires-Dist: testcontainers[neo4j]==4.9.0 ; extra == "dev" and ( python_version >= '3.9' and python_version < '4.0')
|
|
24
25
|
Requires-Dist: pytest-cov ; extra == "dev"
|
|
@@ -31,11 +32,15 @@ Requires-Dist: pydata-sphinx-theme ; extra == "dev"
|
|
|
31
32
|
Requires-Dist: sphinx-autodoc-typehints ; extra == "dev"
|
|
32
33
|
Requires-Dist: sphinxcontrib-napoleon ; extra == "dev"
|
|
33
34
|
Requires-Dist: sphinx-autoapi ; extra == "dev"
|
|
35
|
+
Requires-Dist: sqlalchemy ; extra == "dev"
|
|
36
|
+
Requires-Dist: psycopg2-binary ; extra == "dev"
|
|
34
37
|
Requires-Dist: graphdatascience>=1.13 ; extra == "gds" and ( python_version >= '3.9')
|
|
38
|
+
Requires-Dist: sqlalchemy ; extra == "sql"
|
|
35
39
|
Project-URL: Documentation, https://neo-technology-field.github.io/python-etl-lib/index.html
|
|
36
40
|
Project-URL: Home, https://github.com/neo-technology-field/python-etl-lib
|
|
37
41
|
Provides-Extra: dev
|
|
38
42
|
Provides-Extra: gds
|
|
43
|
+
Provides-Extra: sql
|
|
39
44
|
|
|
40
45
|
# Neo4j ETL Toolbox
|
|
41
46
|
|
|
@@ -43,7 +48,13 @@ A Python library of building blocks to assemble etl pipelines.
|
|
|
43
48
|
|
|
44
49
|
Complete documentation can be found on https://neo-technology-field.github.io/python-etl-lib/index.html
|
|
45
50
|
|
|
46
|
-
See https://github.com/neo-technology-field/python-etl-lib/tree/main/examples/gtfs
|
|
51
|
+
See https://github.com/neo-technology-field/python-etl-lib/tree/main/examples/gtfs
|
|
52
|
+
|
|
53
|
+
or
|
|
54
|
+
|
|
55
|
+
https://github.com/neo-technology-field/python-etl-lib/tree/main/examples/musicbrainz
|
|
56
|
+
|
|
57
|
+
for example projects.
|
|
47
58
|
|
|
48
59
|
|
|
49
60
|
The library can be installed via
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
etl_lib/__init__.py,sha256=FyaTAuElsn3y3j1g15X141PnLTYxPrSUVU_YaVmiyPs,65
|
|
2
|
+
etl_lib/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
etl_lib/cli/run_tools.py,sha256=KIar-y22P4kKm-yoJjecYsPwqC7U76M71dEgFO5-ZBo,8561
|
|
4
|
+
etl_lib/core/BatchProcessor.py,sha256=mRpdxZ6ZMKI8XsY3TPuy4dVcvRqLKCO-p63KeOhFyKE,3417
|
|
5
|
+
etl_lib/core/ClosedLoopBatchProcessor.py,sha256=WzML1nldhZRbP8fhlD6utuK5SBYRl1cJgEobVDIdBP4,1626
|
|
6
|
+
etl_lib/core/ETLContext.py,sha256=wmEnbs3n_80B6La9Py_-MHG8BN0FajE9MjGPej0A3To,8045
|
|
7
|
+
etl_lib/core/ParallelBatchProcessor.py,sha256=jNo1Xv1Ts34UZIseoQLDZOhHOVeEr8dUibKUt0FJ4Hw,7318
|
|
8
|
+
etl_lib/core/ProgressReporter.py,sha256=UvWAPCuOrqyUcb5_kosIsCg1dyVQL-tnjgqnzs2cwZA,9372
|
|
9
|
+
etl_lib/core/SplittingBatchProcessor.py,sha256=OIRMUVFpUoZc0w__JJjUr7B9QC3sBlqQp41xghrQzC0,11616
|
|
10
|
+
etl_lib/core/Task.py,sha256=muQFY5qj2n-ZVV8F6vlHqo2lVSvB3wtGdIgkSXVpOFM,9365
|
|
11
|
+
etl_lib/core/ValidationBatchProcessor.py,sha256=U1M2Qp9Ledt8qFiHAg8zMxE9lLRkBrr51NKs_Y8skK8,3400
|
|
12
|
+
etl_lib/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
|
+
etl_lib/core/utils.py,sha256=9gezAateAmtWZ5lr6ZWYlTtgrj4-CUKJSGz7g1upnRg,2293
|
|
14
|
+
etl_lib/data_sink/CSVBatchSink.py,sha256=oq4VJwnA4WSyJzdvwstGv73vOEuWmPSfCynhVmxBByU,2204
|
|
15
|
+
etl_lib/data_sink/CypherBatchSink.py,sha256=nBH4bzN1IvdSFcKgiAIrAY5IauB565sdyVrnRc1hg_4,1566
|
|
16
|
+
etl_lib/data_sink/SQLBatchSink.py,sha256=vyGrrxpdmCLUZMI2_W2ORej3FLGbwN9-b2GMYHd-k9g,1451
|
|
17
|
+
etl_lib/data_sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
|
+
etl_lib/data_source/CSVBatchSource.py,sha256=HILkaQAFua1OM7xgSNKY6teXpcJjWUPaS4Aol-GLYL8,2767
|
|
19
|
+
etl_lib/data_source/CypherBatchSource.py,sha256=06WuW11BqYjAXBZqL96Qr9MR8JrcjujDpxXe8cI-SYY,2238
|
|
20
|
+
etl_lib/data_source/SQLBatchSource.py,sha256=O3ZA2GXvo5j_KGwOILzguYZMPY_FJkV5j8FIa3-d9oM,4067
|
|
21
|
+
etl_lib/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
|
+
etl_lib/task/CreateReportingConstraintsTask.py,sha256=nTcHLBIgXz_h2OQg-SHjQr68bhH974u0MwrtWPnVwng,762
|
|
23
|
+
etl_lib/task/ExecuteCypherTask.py,sha256=thE8YTZzv1abxNhhDcb4p4ke6qmI6kWR4XQ-GrCBBBU,1284
|
|
24
|
+
etl_lib/task/GDSTask.py,sha256=X1E83wYa-N7AXy43WPEqIy77d__z-2wpBjWNhGNXJzA,1781
|
|
25
|
+
etl_lib/task/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
|
+
etl_lib/task/data_loading/CSVLoad2Neo4jTask.py,sha256=9XiVdJHpABE-Hx1bsvTKLJWtChc8XMwXeO5RicaHDUo,3873
|
|
27
|
+
etl_lib/task/data_loading/ParallelCSVLoad2Neo4jTask.py,sha256=2xN-5bHV9XgoaJLbbTEBuJFoZHV_CYi_hg6M1HQ-ffA,4030
|
|
28
|
+
etl_lib/task/data_loading/ParallelSQLLoad2Neo4jTask.py,sha256=9xpCW5i8yGnUHyg475Z8rjN2P5NDshJGUEgTU6sm2Bs,4286
|
|
29
|
+
etl_lib/task/data_loading/SQLLoad2Neo4jTask.py,sha256=HR3DcjOUkQN4SbCkgQYzljQCYhOcb-x2-DR5dBdapzU,2953
|
|
30
|
+
etl_lib/task/data_loading/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
31
|
+
etl_lib/test_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
32
|
+
etl_lib/test_utils/utils.py,sha256=kxWJqdRf1pg-4ByMfrtW3HDbgXIvyVtLndGDVvMCmoI,5641
|
|
33
|
+
neo4j_etl_lib-0.3.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
34
|
+
neo4j_etl_lib-0.3.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
|
|
35
|
+
neo4j_etl_lib-0.3.0.dist-info/METADATA,sha256=GJcjdPvmzjEUq0pLndSSVzOg3c7CR6bIWz3sB_9tkVY,2506
|
|
36
|
+
neo4j_etl_lib-0.3.0.dist-info/RECORD,,
|
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
etl_lib/__init__.py,sha256=sxY6lj4IZU25bZRF4lb6N5nn6yH1W4S1Qqysw-NzcXI,65
|
|
2
|
-
etl_lib/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
-
etl_lib/cli/run_tools.py,sha256=KAm6XRz5audOP_PhMVozEckvdeMJ0HfyleEFc5jAalc,8579
|
|
4
|
-
etl_lib/core/BatchProcessor.py,sha256=6quNPE9Dp8hYJDQDTqxQtxbQ3KCmb56Mko34EIsNhyI,3352
|
|
5
|
-
etl_lib/core/ClosedLoopBatchProcessor.py,sha256=unlx_A339oi2nOOXF0irrVf8j_GFhwcTuk_w5liqbWc,1321
|
|
6
|
-
etl_lib/core/ETLContext.py,sha256=ZTk_IDILpjUji0DphPUzTNx8k_2hZRxy37mqIcEA-kM,5641
|
|
7
|
-
etl_lib/core/ProgressReporter.py,sha256=QR9ZwwyHEEBYa8i3Udc5J68Ir1bsPIM1fFyt0n_lqFU,8407
|
|
8
|
-
etl_lib/core/Task.py,sha256=3e8iVXSfXaeBecvgTcs2LiIf2JwpKETRFhH4ig6lock,9202
|
|
9
|
-
etl_lib/core/ValidationBatchProcessor.py,sha256=EhO6PFQB-4PZgIOTXP4PwkbAl5HRK0zgTeKMseRU5QU,3261
|
|
10
|
-
etl_lib/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
-
etl_lib/core/utils.py,sha256=wwfyvy78fL6sqHdV0IFqAVyEkp6vo5Yo8gRZua2dulw,816
|
|
12
|
-
etl_lib/data_sink/CSVBatchSink.py,sha256=oq4VJwnA4WSyJzdvwstGv73vOEuWmPSfCynhVmxBByU,2204
|
|
13
|
-
etl_lib/data_sink/CypherBatchSink.py,sha256=RMuelUat55ojLQMRYmoiXG0D_fgWH0RLbmUd01UMv_c,1511
|
|
14
|
-
etl_lib/data_sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
-
etl_lib/data_source/CSVBatchSource.py,sha256=HILkaQAFua1OM7xgSNKY6teXpcJjWUPaS4Aol-GLYL8,2767
|
|
16
|
-
etl_lib/data_source/CypherBatchSource.py,sha256=06WuW11BqYjAXBZqL96Qr9MR8JrcjujDpxXe8cI-SYY,2238
|
|
17
|
-
etl_lib/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
|
-
etl_lib/task/CreateReportingConstraintsTask.py,sha256=aV5i1EwjfuG-eEGoNaB-NcaPhyu0NgdVhmZr5MIv8ak,760
|
|
19
|
-
etl_lib/task/ExecuteCypherTask.py,sha256=thE8YTZzv1abxNhhDcb4p4ke6qmI6kWR4XQ-GrCBBBU,1284
|
|
20
|
-
etl_lib/task/GDSTask.py,sha256=X1E83wYa-N7AXy43WPEqIy77d__z-2wpBjWNhGNXJzA,1781
|
|
21
|
-
etl_lib/task/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
|
-
etl_lib/task/data_loading/CSVLoad2Neo4jTask.py,sha256=US9Sa6ytPPEa6BSVUBttlWdKzqyxlF-09If5XCf-LIE,2277
|
|
23
|
-
etl_lib/task/data_loading/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
|
-
etl_lib/test_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
|
-
etl_lib/test_utils/utils.py,sha256=j7RMhT5Q69-5EAVwB1hePPJobq69_uYxuMTfd6gnbbc,5109
|
|
26
|
-
neo4j_etl_lib-0.1.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
27
|
-
neo4j_etl_lib-0.1.1.dist-info/WHEEL,sha256=_2ozNFCLWc93bK4WKHCO-eDUENDlo-dgc9cU3qokYO4,82
|
|
28
|
-
neo4j_etl_lib-0.1.1.dist-info/METADATA,sha256=LG9xc0NIjBUtdRZwLl9O8WpSXjJqCWLIWf0m8j0iZHQ,2210
|
|
29
|
-
neo4j_etl_lib-0.1.1.dist-info/RECORD,,
|
|
File without changes
|