neo4j-etl-lib 0.0.3__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etl_lib/__init__.py +1 -1
- etl_lib/cli/run_tools.py +45 -13
- etl_lib/core/ETLContext.py +0 -2
- etl_lib/core/ProgressReporter.py +13 -15
- etl_lib/core/Task.py +3 -7
- etl_lib/core/ValidationBatchProcessor.py +1 -1
- etl_lib/data_sink/CSVBatchSink.py +57 -0
- etl_lib/data_sink/{CypherBatchProcessor.py → CypherBatchSink.py} +5 -5
- etl_lib/data_source/{CSVBatchProcessor.py → CSVBatchSource.py} +3 -3
- etl_lib/data_source/CypherBatchSource.py +62 -0
- etl_lib/task/ExecuteCypherTask.py +2 -2
- etl_lib/task/data_loading/CSVLoad2Neo4jTask.py +21 -5
- etl_lib/test_utils/__init__.py +0 -0
- etl_lib/test_utils/utils.py +153 -0
- {neo4j_etl_lib-0.0.3.dist-info → neo4j_etl_lib-0.1.1.dist-info}/METADATA +3 -2
- neo4j_etl_lib-0.1.1.dist-info/RECORD +29 -0
- {neo4j_etl_lib-0.0.3.dist-info → neo4j_etl_lib-0.1.1.dist-info}/WHEEL +1 -1
- neo4j_etl_lib-0.0.3.dist-info/RECORD +0 -25
- {neo4j_etl_lib-0.0.3.dist-info → neo4j_etl_lib-0.1.1.dist-info/licenses}/LICENSE +0 -0
etl_lib/__init__.py
CHANGED
etl_lib/cli/run_tools.py
CHANGED
@@ -55,7 +55,7 @@ def __driver(ctx):
     database_name = ctx.obj["database_name"]
     neo4j_password = ctx.obj["neo4j_password"]
     return GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password), database=database_name,
-                                notifications_min_severity="OFF", user_agent="ETL CLI
+                                notifications_min_severity="OFF", user_agent="ETL CLI")


 @click.group()
@@ -98,7 +98,7 @@ def cli(ctx, neo4j_uri, neo4j_user, neo4j_password, log_file, database_name):
 @click.pass_context
 def query(ctx, number_runs):
     """
-    Retrieve the list of the last x
+    Retrieve the list of the last x ETL runs from the database and display them.
     """
     print(f"Listing runs in database '{ctx.obj['database_name']}'")
     with __driver(ctx) as driver:
@@ -165,25 +165,57 @@ def detail(ctx, run_id, details):
         __print_details(driver, run_id)


+# noinspection PyTypeChecker
 @cli.command()
-@click.option('--run-id', required=False, help='Run
-@click.option('--
-@click.option('--older', help='Delete runs older than
+@click.option('--run-id', required=False, type=str, help='Run IDs to delete, works with comma separated list')
+@click.option('--before', type=click.DateTime(formats=["%Y-%m-%d"]), help='Delete runs before a specific date in format YYYY-MM-DD')
+@click.option('--older', help='Delete runs older than x days', type=int)
 @click.pass_context
-def delete(ctx, run_id,
+def delete(ctx, run_id, before, older):
     """
-    Delete runs based on run ID, date, or age. One and only one of --run-id, --
+    Delete runs based on run ID, date, or age. One and only one of --run-id, --before, or --older must be provided.
     """
     # Ensure mutual exclusivity
-    options = [run_id,
+    options = [run_id, before, older]
     if sum(bool(opt) for opt in options) != 1:
-        print("You must specify exactly one of --run-id, --
+        print("You must specify exactly one of --run-id, --before, or --older.")
         return

     if run_id:
-
-
-
+        ids = run_id.split(',')
+        delete_runs(ctx, ids)
+    elif before:
+        print(f"Deleting runs before: {before}")
+        with __driver(ctx) as driver:
+            record= driver.execute_query(
+                """MATCH (r:ETLRun) WHERE date(r.startTime) < date($before)
+                RETURN collect(r.uuid) AS ids
+                """,
+                result_transformer_=neo4j.Result.single,
+                before=before)
+            ids = record[0]
+            delete_runs(ctx, ids)
+
     elif older:
         print(f"Deleting runs older than: {older}")
-
+        with __driver(ctx) as driver:
+            record = driver.execute_query(
+                """MATCH (r:ETLRun) WHERE date(r.startTime) < (date() - duration({days: $days}))
+                RETURN collect(r.uuid) AS ids
+                """,
+                result_transformer_=neo4j.Result.single,
+                days=older)
+            ids = record[0]
+            delete_runs(ctx, ids)
+
+
+def delete_runs(ctx, ids):
+    print(f"Deleting run IDs: {ids}")
+    with __driver(ctx) as driver:
+        records, _, _ = driver.execute_query(
+            """
+            MATCH (r:ETLRun)-[*]->(n) WHERE r.uuid IN $ids
+            DETACH DELETE n
+            DETACH DELETE r
+            """, ids=ids, routing_=neo4j.RoutingControl.WRITE)
+        print(f"Deleted run IDs: {ids} successfully")
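For readers unfamiliar with the driver API used by the new delete paths: `driver.execute_query` with `result_transformer_=neo4j.Result.single` collapses the result stream into a single `Record` (or `None`), which is why the command reads the collected IDs as `record[0]`. A minimal standalone sketch of the same pattern (connection details and the cutoff date are hypothetical):

    import neo4j
    from neo4j import GraphDatabase

    # Hypothetical connection values, for illustration only.
    driver = GraphDatabase.driver("neo4j://localhost:7687", auth=("neo4j", "secret"))

    # The transformer returns one Record; record[0] is its first value, here the
    # collected list of run UUIDs -- the same pattern the delete command uses.
    record = driver.execute_query(
        "MATCH (r:ETLRun) WHERE date(r.startTime) < date($before) RETURN collect(r.uuid) AS ids",
        result_transformer_=neo4j.Result.single,
        before="2025-01-01",
    )
    ids = record[0] if record else []
    driver.close()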
etl_lib/core/ETLContext.py
CHANGED
@@ -130,8 +130,6 @@ class ETLContext:
     Will be passed to all :py:class:`etl_lib.core.Task` to provide access to environment variables and functionally
     deemed general enough that all parts of the ETL pipeline would need it.
     """
-    neo4j: Neo4jContext
-    __env_vars: dict

     def __init__(self, env_vars: dict):
         """
etl_lib/core/ProgressReporter.py
CHANGED
@@ -3,7 +3,7 @@ from datetime import datetime

 from tabulate import tabulate

-from etl_lib.core.Task import Task, TaskGroup
+from etl_lib.core.Task import Task, TaskGroup, TaskReturn


 class ProgressReporter:
@@ -47,7 +47,7 @@ class ProgressReporter:
         self.logger.info(f"{'\t' * task.depth}starting {task.task_name()}")
         return task

-    def finished_task(self, task: Task,
+    def finished_task(self, task: Task, result: TaskReturn) -> Task:
         """
         Marks the task as finished.

@@ -55,23 +55,21 @@ class ProgressReporter:

         Args:
             task: Task to be marked as finished.
-
-            summery: statistics for this task (such as `nodes_created`)
-            error: If an exception occurred, the exception text should be provided here.
+            result: result of the task execution, such as status and summery information.

         Returns:
             Task to be marked as started.
         """
         task.end_time = datetime.now()
-        task.success = success
-        task.summery = summery
+        task.success = result.success
+        task.summery = result.summery

-        report = f"{'\t' * task.depth} finished {task.task_name()} in {task.end_time - task.start_time} with success: {success}"
-        if error is not None:
-            report += f", error: \n{error}"
+        report = f"{'\t' * task.depth} finished {task.task_name()} in {task.end_time - task.start_time} with success: {result.success}"
+        if result.error is not None:
+            report += f", error: \n{result.error}"
         else:
             # for the logger, remove entries with 0, but keep them in the original for reporting
-            cleaned_summery = {key: value for key, value in summery.items() if value != 0}
+            cleaned_summery = {key: value for key, value in result.summery.items() if value != 0}
             if len(cleaned_summery) > 0:
                 report += f"\n{tabulate([cleaned_summery], headers='keys', tablefmt='psql')}"
         self.logger.info(report)
@@ -168,9 +166,9 @@ class Neo4jProgressReporter(ProgressReporter):
                                   start_time=task.start_time)
         return task

-    def finished_task(self, task: Task,
-        super().finished_task(task=task,
-        if success:
+    def finished_task(self, task: Task, result: TaskReturn) -> Task:
+        super().finished_task(task=task, result=result)
+        if result.success:
             status = "success"
         else:
             status = "failure"
@@ -179,7 +177,7 @@ class Neo4jProgressReporter(ProgressReporter):
             MATCH (t:ETLTask {uuid:$id}) SET t.endTime = $end_time, t.status = $status, t.error = $error
             CREATE (s:ETLStats) SET s=$summery
             CREATE (t)-[:HAS_STATS]->(s)
-            """, id=task.uuid, end_time=task.end_time, summery=summery, status=status, error=error)
+            """, id=task.uuid, end_time=task.end_time, summery=result.summery, status=status, error=result.error)
         return task

     def __create_constraints(self):
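A side note on the logging path kept above: the non-zero summery counters are rendered with `tabulate`. A small illustration of what that produces (counter names and values are made up):

    from tabulate import tabulate

    # One psql-style table row built from the dict of counters, matching the
    # report format produced by finished_task.
    print(tabulate([{"nodes_created": 42, "relationships_created": 10}],
                   headers="keys", tablefmt="psql"))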
etl_lib/core/Task.py
CHANGED
@@ -46,7 +46,8 @@ class TaskReturn:

         # Combine success values and errors
         combined_success = self.success and other.success
-        combined_error =
+        combined_error = None if not (self.error or other.error) \
+            else f"{self.error or ''} | {other.error or ''}".strip(" |")

         return TaskReturn(
             success=combined_success, summery=merged_summery, error=combined_error
@@ -99,12 +100,7 @@ class Task:
         except Exception as e:
             result = TaskReturn(success=False, summery={}, error=str(e))

-        self.context.reporter.finished_task(
-            task=self,
-            success=result.success,
-            summery=result.summery,
-            error=result.error,
-        )
+        self.context.reporter.finished_task(task=self,result=result)

         return result

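The new `combined_error` expression is compact; its behaviour is easier to see in isolation. A sketch of the same logic with illustrative values:

    def combine(a: str | None, b: str | None) -> str | None:
        # None when neither side failed; otherwise join the non-empty messages
        # with " | " and strip any dangling separator.
        return None if not (a or b) else f"{a or ''} | {b or ''}".strip(" |")

    assert combine(None, None) is None
    assert combine("boom", None) == "boom"
    assert combine("boom", "bang") == "boom | bang"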
etl_lib/core/ValidationBatchProcessor.py
CHANGED
@@ -47,7 +47,7 @@ class ValidationBatchProcessor(BatchProcessor):
         for row in batch.chunk:
             try:
                 # Validate and transform the row
-                validated_row = self.model(**row).
+                validated_row = json.loads(self.model(**row).model_dump_json())
                 valid_rows.append(validated_row)
             except ValidationError as e:
                 # Collect invalid rows with errors
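The switch to `json.loads(model(...).model_dump_json())` matters because pydantic's `model_dump()` keeps rich Python objects, while the JSON round-trip yields plain primitives that Neo4j parameters and CSV writers both accept. A short pydantic v2 illustration (the model is hypothetical):

    import json
    from datetime import date
    from pydantic import BaseModel

    class Row(BaseModel):
        id: int
        born: date

    row = Row(id=1, born=date(2000, 1, 2))
    print(row.model_dump())                   # {'id': 1, 'born': datetime.date(2000, 1, 2)}
    print(json.loads(row.model_dump_json()))  # {'id': 1, 'born': '2000-01-02'}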
etl_lib/data_sink/CSVBatchSink.py
ADDED
@@ -0,0 +1,57 @@
+import csv
+from pathlib import Path
+from typing import Generator
+
+from etl_lib.core.ETLContext import ETLContext
+from etl_lib.core.BatchProcessor import BatchProcessor, BatchResults, append_result
+from etl_lib.core.Task import Task
+
+
+class CSVBatchSink(BatchProcessor):
+    """
+    BatchProcessor to write batches of data to a CSV file.
+    """
+
+    def __init__(self, context: ETLContext, task: Task, predecessor: BatchProcessor, file_path: Path, **kwargs):
+        """
+        Constructs a new CSVBatchSink.
+
+        Args:
+            context: :class:`etl_lib.core.ETLContext.ETLContext` instance.
+            task: :class:`etl_lib.core.Task.Task` instance owning this batchProcessor.
+            predecessor: BatchProcessor which :func:`~get_batch` function will be called to receive batches to process.
+            file_path: Path to the CSV file where data will be written. If the file exists, data will be appended.
+            **kwargs: Additional arguments passed to `csv.DictWriter` to allow tuning the csv creation.
+        """
+        super().__init__(context, task, predecessor)
+        self.file_path = file_path
+        self.file_initialized = False
+        self.csv_kwargs = kwargs
+
+    def get_batch(self, batch_size: int) -> Generator[BatchResults, None, None]:
+        assert self.predecessor is not None
+
+        for batch_result in self.predecessor.get_batch(batch_size):
+            self._write_to_csv(batch_result.chunk)
+            yield append_result(batch_result, {"rows_written": len(batch_result.chunk)})
+
+    def _write_to_csv(self, data: list[dict]):
+        """
+        Writes a batch of data to the CSV file.
+
+        Args:
+            data: A list of dictionaries representing rows of data.
+        """
+        if not data:
+            return
+
+        fieldnames = data[0].keys()
+        write_header = not self.file_initialized or not self.file_path.exists()
+
+        with self.file_path.open(mode="a", newline="", encoding="utf-8") as csvfile:
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, **self.csv_kwargs)
+            if write_header:
+                writer.writeheader()
+            writer.writerows(data)
+
+        self.file_initialized = True
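The core of the new sink is the append-plus-header dance: the header row is written only for the first batch, or when the target file does not exist yet. The same logic stripped of the BatchProcessor plumbing (function and file names are hypothetical):

    import csv
    from pathlib import Path

    def write_batch(file_path: Path, rows: list[dict], first_batch: bool) -> None:
        if not rows:
            return
        # Header only once: on the first batch, or when the file is new.
        write_header = first_batch or not file_path.exists()
        with file_path.open(mode="a", newline="", encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=rows[0].keys())
            if write_header:
                writer.writeheader()
            writer.writerows(rows)

    write_batch(Path("out.csv"), [{"id": 1, "name": "a"}], first_batch=True)
    write_batch(Path("out.csv"), [{"id": 2, "name": "b"}], first_batch=False)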
etl_lib/data_sink/{CypherBatchProcessor.py → CypherBatchSink.py}
CHANGED
@@ -5,19 +5,19 @@ from etl_lib.core.BatchProcessor import BatchProcessor, BatchResults, append_result
 from etl_lib.core.Task import Task


-class
+class CypherBatchSink(BatchProcessor):
     """
     BatchProcessor to write batches of data to a Neo4j database.
     """

     def __init__(self, context: ETLContext, task: Task, predecessor: BatchProcessor, query: str):
         """
-        Constructs a new
+        Constructs a new CypherBatchSink.

         Args:
-            context: :
-            task: :
-            predecessor: BatchProcessor which :
+            context: :class:`etl_lib.core.ETLContext.ETLContext` instance.
+            task: :class:`etl_lib.core.Task.Task` instance owning this batchProcessor.
+            predecessor: BatchProcessor which :func:`~get_batch` function will be called to receive batches to process.
             query: Cypher to write the query to Neo4j.
                 Data will be passed as `batch` parameter.
                 Therefor, the query should start with a `UNWIND $batch AS row`.
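The docstring spells out the sink's contract: each batch arrives in the `$batch` parameter, so the statement should unwind it into rows. A query of the expected shape (label and properties are hypothetical):

    query = """
    UNWIND $batch AS row
    MERGE (p:Person {id: row.id})
    SET p.name = row.name
    """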
etl_lib/data_source/{CSVBatchProcessor.py → CSVBatchSource.py}
CHANGED
@@ -7,7 +7,7 @@ from etl_lib.core.BatchProcessor import BatchProcessor, BatchResults
 from etl_lib.core.Task import Task


-class
+class CSVBatchSource(BatchProcessor):
     """
     BatchProcessor that reads a CSV file using the `csv` package.

@@ -18,11 +18,11 @@ class CSVBatchProcessor(BatchProcessor):

     def __init__(self, csv_file: Path, context, task: Task = None, **kwargs):
         """
-        Constructs a new
+        Constructs a new CSVBatchSource.

         Args:
             csv_file: Path to the CSV file.
-            context: :
+            context: :class:`etl_lib.core.ETLContext.ETLContext` instance.
             kwargs: Will be passed on to the `csv.DictReader` providing a way to customise the reading to different
                 csv formats.
         """
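Since the constructor forwards `**kwargs` to `csv.DictReader`, non-default formats are handled by the reader itself. What that pass-through amounts to, shown directly on `csv.DictReader` (file name and delimiter are illustrative):

    import csv
    from pathlib import Path

    # The same keyword arguments handed to CSVBatchSource would reach this call.
    with Path("data.csv").open(newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f, delimiter=";"):
            print(row)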
etl_lib/data_source/CypherBatchSource.py
ADDED
@@ -0,0 +1,62 @@
+from typing import Generator, Callable, Optional
+
+from neo4j import Record
+
+from etl_lib.core.BatchProcessor import BatchResults, BatchProcessor
+from etl_lib.core.ETLContext import ETLContext
+from etl_lib.core.Task import Task
+
+
+class CypherBatchSource(BatchProcessor):
+
+    def __init__(
+            self,
+            context: ETLContext,
+            task: Task,
+            query: str,
+            record_transformer: Optional[Callable[[Record], dict]] = None,
+            **kwargs
+    ):
+        """
+        Constructs a new CypherBatchSource.
+
+        Args:
+            context: :class:`etl_lib.core.ETLContext.ETLContext` instance.
+            task: :class:`etl_lib.core.Task.Task` instance owning this batchProcessor.
+            query: Cypher query to execute.
+            record_transformer: Optional function to transform each record. See Neo4j API documentation on `result_transformer_`
+            kwargs: Arguments passed as parameters with the query.
+        """
+        super().__init__(context, task)
+        self.query = query
+        self.record_transformer = record_transformer
+        self.kwargs = kwargs
+
+    def __read_records(self, tx, batch_size):
+        batch_ = []
+        result = tx.run(self.query, **self.kwargs)
+
+        for record in result:
+            data = record.data()
+            if self.record_transformer:
+                data = self.record_transformer(data)
+            batch_.append(data)
+
+            if len(batch_) == batch_size:
+                yield batch_
+                batch_ = []
+
+        if batch_:
+            yield batch_
+
+    def get_batch(self, max_batch_size: int) -> Generator[BatchResults, None, None]:
+        # not using managed tx on purpose. First of, we want to keep the tx open while delivering batches
+        # automatic retry logic would help, as we do not want to start the query again
+        with self.context.neo4j.session() as session:
+            with session.begin_transaction() as tx:
+                for chunk in self.__read_records(tx, max_batch_size):
+                    yield BatchResults(
+                        chunk=chunk,
+                        statistics={"cypher_rows_read": len(chunk)},
+                        batch_size=len(chunk)
+                    )
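Note that the transformer is applied to each record's `.data()` dict before the row joins the batch. A hypothetical transformer that flattens a returned node into the flat dict shape a downstream sink expects:

    # Hypothetical: assumes the query returns nodes under the alias `n`.
    def flatten(record: dict) -> dict:
        node = record["n"]
        return {"id": node.get("id"), "name": node.get("name")}

    # source = CypherBatchSource(context, task, "MATCH (n:Person) RETURN n",
    #                            record_transformer=flatten)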
etl_lib/task/ExecuteCypherTask.py
CHANGED
@@ -24,10 +24,10 @@ class ExecuteCypherTask(Task):
             for query in self._query():
                 result = self.context.neo4j.query_database(session=session, query=query, **kwargs)
                 stats = merge_summery(stats, result.summery)
-            return TaskReturn(True, stats)
+            return TaskReturn(success=True, summery=stats)
         else:
             result = self.context.neo4j.query_database(session=session, query=self._query(), **kwargs)
-            return TaskReturn(True, result.summery)
+            return TaskReturn(success=True, summery=result.summery)

     @abc.abstractmethod
     def _query(self) -> str | list[str]:
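The `str | list[str]` return type of `_query` is what drives the branch above. A hypothetical subclass returning several statements:

    class CreateIndexesTask(ExecuteCypherTask):
        # Returning a list makes run_internal execute the statements one by one
        # and merge their summery stats; a single string takes the else branch.
        def _query(self) -> str | list[str]:
            return [
                "CREATE INDEX person_id IF NOT EXISTS FOR (p:Person) ON (p.id)",
                "CREATE INDEX person_name IF NOT EXISTS FOR (p:Person) ON (p.name)",
            ]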
etl_lib/task/data_loading/CSVLoad2Neo4jTask.py
CHANGED
@@ -9,12 +9,24 @@ from etl_lib.core.ETLContext import ETLContext
 from etl_lib.core.ClosedLoopBatchProcessor import ClosedLoopBatchProcessor
 from etl_lib.core.Task import Task, TaskReturn
 from etl_lib.core.ValidationBatchProcessor import ValidationBatchProcessor
-from etl_lib.data_sink.
-from etl_lib.data_source.
+from etl_lib.data_sink.CypherBatchSink import CypherBatchSink
+from etl_lib.data_source.CSVBatchSource import CSVBatchSource


 class CSVLoad2Neo4jTask(Task):
+    """
+    Loads the specified CSV file to Neo4j.

+    Uses BatchProcessors to read, validate and write to Neo4j.
+    The validation step is using pydantic, hence a Pydantic model needs to be provided.
+    Rows that fail the validation, will be written to en error file. The location of the error file is determined as
+    follows:
+
+    If the context env vars hold an entry `ETL_ERROR_PATH` the file will be place there, with the name set to name
+    of the provided filename appended with `.error.json`
+
+    If `ETL_ERROR_PATH` is not set, the file will be placed in the same directory as the CSV file.
+    """
     def __init__(self, context: ETLContext, model: Type[BaseModel], file: Path, batch_size: int = 5000):
         super().__init__(context)
         self.batch_size = batch_size
@@ -23,11 +35,15 @@ class CSVLoad2Neo4jTask(Task):
         self.file = file

     def run_internal(self, **kwargs) -> TaskReturn:
-
+        error_path = self.context.env("ETL_ERROR_PATH")
+        if error_path is None:
+            error_file = self.file.with_suffix(".error.json")
+        else:
+            error_file = error_path / self.file.with_name(self.file.stem + ".error.json").name

-        csv =
+        csv = CSVBatchSource(self.file, self.context, self)
         validator = ValidationBatchProcessor(self.context, self, csv, self.model, error_file)
-        cypher =
+        cypher = CypherBatchSink(self.context, self, validator, self._query())
         end = ClosedLoopBatchProcessor(self.context, self, cypher)
         result = next(end.get_batch(self.batch_size))

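To make the error-file rule concrete: with `ETL_ERROR_PATH` unset, an input of `/data/people.csv` yields `/data/people.error.json` next to it (`Path.with_suffix` swaps `.csv` for `.error.json`). A sketch of wiring the task up (model, labels and file name are hypothetical; `_query` supplies the write statement used by the Cypher sink):

    from pathlib import Path
    from pydantic import BaseModel

    class PersonRow(BaseModel):
        id: int
        name: str

    class LoadPeopleTask(CSVLoad2Neo4jTask):
        def _query(self) -> str:
            return "UNWIND $batch AS row MERGE (p:Person {id: row.id}) SET p.name = row.name"

    # task = LoadPeopleTask(context, PersonRow, Path("/data/people.csv"))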
etl_lib/test_utils/__init__.py
File without changes
etl_lib/test_utils/utils.py
ADDED
@@ -0,0 +1,153 @@
+import logging
+import os
+from pathlib import Path
+from typing import Any
+
+from _pytest.tmpdir import tmp_path
+from neo4j import Driver
+from neo4j.time import Date
+
+from etl_lib.core.ETLContext import QueryResult, Neo4jContext, ETLContext
+from etl_lib.core.Task import Task
+
+
+def run_query(driver, query, data):
+    with driver.session(database=get_database_name()) as session:
+        result = session.run(query, data=data)
+        return result.data()
+
+
+def get_node_count(driver, label: str) -> int:
+    """Get the count of nodes with the specified label"""
+    query = f"MATCH (n:{label}) RETURN COUNT(n) AS count"
+    with driver.session(database=get_database_name()) as session:
+        result = session.run(query)
+        return result.single()["count"]
+
+
+def get_relationship_count(driver, rel_type: str) -> int:
+    """Get the count of relationships with the specified type"""
+    query = f"MATCH ()-[r:{rel_type}]->() RETURN COUNT(r) AS count"
+    with driver.session(database=get_database_name()) as session:
+        result = session.run(query)
+        return result.single()["count"]
+
+
+def check_property_exists(driver, label: str, property_name: str) -> bool:
+    """
+    Checks if all nodes with the given label have the given property.
+    :return:
+    """
+    with driver.session(database=get_database_name()) as session:
+        result = session.run(f"MATCH (n:{label}) WHERE n.{property_name} IS NULL"
+                             f" RETURN COUNT(n) = 0 AS exists")
+        return result.single()["exists"]
+
+
+def get_graph(driver):
+    """
+    Return a grap representation of all data in the database.
+    The returned structure is an array of dicts. Each dict has the following keys:
+    `start`, `end`, and `rel` representing each relationship found in the graph.
+    Use the following query to generate this structure from a known good graph:
+
+    MATCH (s)-[r]->(e)
+    WITH {labels:labels(s), props:properties(s)} AS start, {type:type(r), props:properties(r)} AS rel, {labels:labels(e), props:properties(e)} AS end
+    RETURN {start:start, rel:rel, end:end}
+    """
+    with driver.session(database=get_database_name()) as session:
+        records = session.run(
+            """
+            MATCH (s)-[r]->(e)
+            WITH {labels:labels(s), props:properties(s)} AS start,
+                {type:type(r), props:properties(r)} AS rel,
+                {labels:labels(e), props:properties(e)} AS end
+            RETURN {start:start, rel:rel, end:end} AS graph
+            """
+        )
+        data = [record.data()["graph"] for record in records]
+        return convert_neo4j_date_to_string(data, "%Y-%m-%d")
+
+
+def convert_neo4j_date_to_string(data, date_format):
+    """
+    Recursively converts all neo4j.time.Date instances in a dictionary into strings using the provided format.
+
+    :param data: The input dictionary or list to process.
+    :param date_format: A format string compatible with Python's strftime.
+    :return: The processed dictionary or list with dates converted to strings.
+    """
+    if isinstance(data, dict):
+        return {key: convert_neo4j_date_to_string(value, date_format) for key, value in data.items()}
+    elif isinstance(data, list):
+        return [convert_neo4j_date_to_string(item, date_format) for item in data]
+    elif isinstance(data, Date):
+        return data.to_native().strftime(date_format)
+    else:
+        return data
+
+
+def get_database_name():
+    if os.getenv("NEO4J_TEST_CONTAINER") is None:
+        # not running with test containers. expect test db to be set
+        if os.getenv("NEO4J_TEST_DATABASE") is not None:
+            return os.getenv("NEO4J_TEST_DATABASE")
+        else:
+            raise Exception("define NEO4J_TEST_DATABASE environment variable")
+
+
+class TestNeo4jContext(Neo4jContext):
+
+    def __init__(self, driver: Driver):
+        self.logger = logging.getLogger(self.__class__.__name__)
+        self.driver = driver
+        self.database = get_database_name()
+
+
+class TestETLContext(ETLContext):
+
+    def __init__(self, driver: Driver, tmp_path):
+        self.logger = logging.getLogger(self.__class__.__name__)
+        self.__env_vars = {"ETL_ERROR_PATH": tmp_path}
+        self.neo4j = TestNeo4jContext(driver)
+        self.reporter = DummyReporter()
+
+    def env(self, key: str) -> Any:
+        if key in self.__env_vars:
+            return self.__env_vars[key]
+
+
+class DummyReporter:
+
+    def register_tasks(self, main: Task):
+        pass
+
+    def started_task(self, task: Task) -> Task:
+        pass
+
+    def finished_task(self, task, result) -> Task:
+        pass
+
+    def report_progress(self, task, batches: int, expected_batches: int, stats: dict) -> None:
+        pass
+
+
+class DummyNeo4jContext:
+
+    def query_database(self, session, query, **kwargs) -> QueryResult:
+        return QueryResult([], {})
+
+    def session(self, database=None):
+        return None
+
+
+class DummyContext:
+    neo4j: DummyNeo4jContext
+    __env_vars: dict
+    path_error: Path
+    path_import: Path
+    path_processed: Path
+    reporter = DummyReporter()
+
+    def env(self, key: str) -> Any:
+        pass
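A hypothetical pytest showing how these helpers are meant to compose (a `driver` fixture yielding a neo4j `Driver` and the `NEO4J_TEST_DATABASE` variable are assumed, per `get_database_name`; the graph contents are made up):

    from etl_lib.test_utils.utils import get_graph, get_node_count

    def test_people_loaded(driver):
        assert get_node_count(driver, "Person") == 2
        # Structure produced by get_graph, dates already stringified.
        expected = [
            {"start": {"labels": ["Person"], "props": {"id": 1}},
             "rel": {"type": "KNOWS", "props": {}},
             "end": {"labels": ["Person"], "props": {"id": 2}}},
        ]
        assert get_graph(driver) == expected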
{neo4j_etl_lib-0.0.3.dist-info → neo4j_etl_lib-0.1.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: neo4j-etl-lib
-Version: 0.0.3
+Version: 0.1.1
 Summary: Building blocks for ETL pipelines.
 Keywords: etl,graph,database
 Author-email: Bert Radke <bert.radke@pm.me>
@@ -13,6 +13,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Programming Language :: Python :: 3
 Classifier: Topic :: Database
 Classifier: Development Status :: 4 - Beta
+License-File: LICENSE
 Requires-Dist: pydantic>=2.10.5; python_version >= '3.8'
 Requires-Dist: neo4j>=5.27.0; python_version >= '3.7'
 Requires-Dist: python-dotenv>=1.0.1; python_version >= '3.8'
neo4j_etl_lib-0.1.1.dist-info/RECORD
ADDED
@@ -0,0 +1,29 @@
+etl_lib/__init__.py,sha256=sxY6lj4IZU25bZRF4lb6N5nn6yH1W4S1Qqysw-NzcXI,65
+etl_lib/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+etl_lib/cli/run_tools.py,sha256=KAm6XRz5audOP_PhMVozEckvdeMJ0HfyleEFc5jAalc,8579
+etl_lib/core/BatchProcessor.py,sha256=6quNPE9Dp8hYJDQDTqxQtxbQ3KCmb56Mko34EIsNhyI,3352
+etl_lib/core/ClosedLoopBatchProcessor.py,sha256=unlx_A339oi2nOOXF0irrVf8j_GFhwcTuk_w5liqbWc,1321
+etl_lib/core/ETLContext.py,sha256=ZTk_IDILpjUji0DphPUzTNx8k_2hZRxy37mqIcEA-kM,5641
+etl_lib/core/ProgressReporter.py,sha256=QR9ZwwyHEEBYa8i3Udc5J68Ir1bsPIM1fFyt0n_lqFU,8407
+etl_lib/core/Task.py,sha256=3e8iVXSfXaeBecvgTcs2LiIf2JwpKETRFhH4ig6lock,9202
+etl_lib/core/ValidationBatchProcessor.py,sha256=EhO6PFQB-4PZgIOTXP4PwkbAl5HRK0zgTeKMseRU5QU,3261
+etl_lib/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+etl_lib/core/utils.py,sha256=wwfyvy78fL6sqHdV0IFqAVyEkp6vo5Yo8gRZua2dulw,816
+etl_lib/data_sink/CSVBatchSink.py,sha256=oq4VJwnA4WSyJzdvwstGv73vOEuWmPSfCynhVmxBByU,2204
+etl_lib/data_sink/CypherBatchSink.py,sha256=RMuelUat55ojLQMRYmoiXG0D_fgWH0RLbmUd01UMv_c,1511
+etl_lib/data_sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+etl_lib/data_source/CSVBatchSource.py,sha256=HILkaQAFua1OM7xgSNKY6teXpcJjWUPaS4Aol-GLYL8,2767
+etl_lib/data_source/CypherBatchSource.py,sha256=06WuW11BqYjAXBZqL96Qr9MR8JrcjujDpxXe8cI-SYY,2238
+etl_lib/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+etl_lib/task/CreateReportingConstraintsTask.py,sha256=aV5i1EwjfuG-eEGoNaB-NcaPhyu0NgdVhmZr5MIv8ak,760
+etl_lib/task/ExecuteCypherTask.py,sha256=thE8YTZzv1abxNhhDcb4p4ke6qmI6kWR4XQ-GrCBBBU,1284
+etl_lib/task/GDSTask.py,sha256=X1E83wYa-N7AXy43WPEqIy77d__z-2wpBjWNhGNXJzA,1781
+etl_lib/task/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+etl_lib/task/data_loading/CSVLoad2Neo4jTask.py,sha256=US9Sa6ytPPEa6BSVUBttlWdKzqyxlF-09If5XCf-LIE,2277
+etl_lib/task/data_loading/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+etl_lib/test_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+etl_lib/test_utils/utils.py,sha256=j7RMhT5Q69-5EAVwB1hePPJobq69_uYxuMTfd6gnbbc,5109
+neo4j_etl_lib-0.1.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+neo4j_etl_lib-0.1.1.dist-info/WHEEL,sha256=_2ozNFCLWc93bK4WKHCO-eDUENDlo-dgc9cU3qokYO4,82
+neo4j_etl_lib-0.1.1.dist-info/METADATA,sha256=LG9xc0NIjBUtdRZwLl9O8WpSXjJqCWLIWf0m8j0iZHQ,2210
+neo4j_etl_lib-0.1.1.dist-info/RECORD,,
neo4j_etl_lib-0.0.3.dist-info/RECORD
REMOVED
@@ -1,25 +0,0 @@
-etl_lib/__init__.py,sha256=uowSzShO11zBjnhp9t67CQUAe0Z9PFA3kpVEM-T4gfM,65
-etl_lib/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-etl_lib/cli/run_tools.py,sha256=ZXqXcCv9IBjMVED9KwwEMm9ti7xZ9MM-4P_-RcLY_gg,7292
-etl_lib/core/BatchProcessor.py,sha256=6quNPE9Dp8hYJDQDTqxQtxbQ3KCmb56Mko34EIsNhyI,3352
-etl_lib/core/ClosedLoopBatchProcessor.py,sha256=unlx_A339oi2nOOXF0irrVf8j_GFhwcTuk_w5liqbWc,1321
-etl_lib/core/ETLContext.py,sha256=RdJr7GlNtoGz246XNdbq8YvTzgw4KWOQ6i3RjTH2b88,5686
-etl_lib/core/ProgressReporter.py,sha256=z5aVBjDJZSNGr6zmY8DsMC6dzEcnhAV7RboHWJdl49g,8557
-etl_lib/core/Task.py,sha256=qhCRYEJciYdaYzMurUTTzGQgm7UeKe0Ik37Fp-qAgr8,9256
-etl_lib/core/ValidationBatchProcessor.py,sha256=23uYtIVAtR755h34vkrN_QepFJnYroFKVO4xuHF922Y,3244
-etl_lib/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-etl_lib/core/utils.py,sha256=wwfyvy78fL6sqHdV0IFqAVyEkp6vo5Yo8gRZua2dulw,816
-etl_lib/data_sink/CypherBatchProcessor.py,sha256=VrnTOoMJMEjy0CeMPm-Q5p0mMlc4JnwrLDK-QrvEj9U,1530
-etl_lib/data_sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-etl_lib/data_source/CSVBatchProcessor.py,sha256=FaerMVPXqDmxAfxIO0Mot0IEuYyQ2lXxnm7hzcE0Oug,2776
-etl_lib/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-etl_lib/task/CreateReportingConstraintsTask.py,sha256=aV5i1EwjfuG-eEGoNaB-NcaPhyu0NgdVhmZr5MIv8ak,760
-etl_lib/task/ExecuteCypherTask.py,sha256=wpPF-bbawRiNS1cCXLhIwuXROAcXsv3OfdKc6DH5q2o,1252
-etl_lib/task/GDSTask.py,sha256=X1E83wYa-N7AXy43WPEqIy77d__z-2wpBjWNhGNXJzA,1781
-etl_lib/task/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-etl_lib/task/data_loading/CSVLoad2Neo4jTask.py,sha256=GxMfRRcrdFtGNkJ_-_bxRybKf9AoRCTg5_iAkVVjSyg,1487
-etl_lib/task/data_loading/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-neo4j_etl_lib-0.0.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-neo4j_etl_lib-0.0.3.dist-info/WHEEL,sha256=CpUCUxeHQbRN5UGRQHYRJorO5Af-Qy_fHMctcQ8DSGI,82
-neo4j_etl_lib-0.0.3.dist-info/METADATA,sha256=YIjIIJkmZ_WDPmQA78_IdzH7Q1XcS0w_GpBoPKZHvoQ,2188
-neo4j_etl_lib-0.0.3.dist-info/RECORD,,
{neo4j_etl_lib-0.0.3.dist-info → neo4j_etl_lib-0.1.1.dist-info/licenses}/LICENSE
File without changes