neo4j-etl-lib 0.1.0__tar.gz → 0.1.1__tar.gz
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/PKG-INFO +1 -1
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/__init__.py +1 -1
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/cli/run_tools.py +44 -12
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/core/ProgressReporter.py +13 -15
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/core/Task.py +3 -7
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/data_source/CypherBatchSource.py +18 -3
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/task/ExecuteCypherTask.py +2 -2
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/test_utils/utils.py +1 -1
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/LICENSE +0 -0
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/README.md +0 -0
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/pyproject.toml +0 -0
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/cli/__init__.py +0 -0
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/core/BatchProcessor.py +0 -0
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/core/ClosedLoopBatchProcessor.py +0 -0
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/core/ETLContext.py +0 -0
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/core/ValidationBatchProcessor.py +0 -0
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/core/__init__.py +0 -0
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/core/utils.py +0 -0
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/data_sink/CSVBatchSink.py +0 -0
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/data_sink/CypherBatchSink.py +0 -0
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/data_sink/__init__.py +0 -0
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/data_source/CSVBatchSource.py +0 -0
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/data_source/__init__.py +0 -0
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/task/CreateReportingConstraintsTask.py +0 -0
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/task/GDSTask.py +0 -0
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/task/__init__.py +0 -0
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/task/data_loading/CSVLoad2Neo4jTask.py +0 -0
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/task/data_loading/__init__.py +0 -0
- {neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/test_utils/__init__.py +0 -0
|
@@ -55,7 +55,7 @@ def __driver(ctx):
|
|
|
55
55
|
database_name = ctx.obj["database_name"]
|
|
56
56
|
neo4j_password = ctx.obj["neo4j_password"]
|
|
57
57
|
return GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password), database=database_name,
|
|
58
|
-
notifications_min_severity="OFF", user_agent="ETL CLI
|
|
58
|
+
notifications_min_severity="OFF", user_agent="ETL CLI")
|
|
59
59
|
|
|
60
60
|
|
|
61
61
|
@click.group()
|
|
@@ -165,25 +165,57 @@ def detail(ctx, run_id, details):
|
|
|
165
165
|
__print_details(driver, run_id)
|
|
166
166
|
|
|
167
167
|
|
|
168
|
+
# noinspection PyTypeChecker
|
|
168
169
|
@cli.command()
|
|
169
|
-
@click.option('--run-id', required=False, help='Run
|
|
170
|
-
@click.option('--
|
|
171
|
-
@click.option('--older', help='Delete runs older than
|
|
170
|
+
@click.option('--run-id', required=False, type=str, help='Run IDs to delete, works with comma separated list')
|
|
171
|
+
@click.option('--before', type=click.DateTime(formats=["%Y-%m-%d"]), help='Delete runs before a specific date in format YYYY-MM-DD')
|
|
172
|
+
@click.option('--older', help='Delete runs older than x days', type=int)
|
|
172
173
|
@click.pass_context
|
|
173
|
-
def delete(ctx, run_id,
|
|
174
|
+
def delete(ctx, run_id, before, older):
|
|
174
175
|
"""
|
|
175
|
-
Delete runs based on run ID, date, or age. One and only one of --run-id, --
|
|
176
|
+
Delete runs based on run ID, date, or age. One and only one of --run-id, --before, or --older must be provided.
|
|
176
177
|
"""
|
|
177
178
|
# Ensure mutual exclusivity
|
|
178
|
-
options = [run_id,
|
|
179
|
+
options = [run_id, before, older]
|
|
179
180
|
if sum(bool(opt) for opt in options) != 1:
|
|
180
|
-
print("You must specify exactly one of --run-id, --
|
|
181
|
+
print("You must specify exactly one of --run-id, --before, or --older.")
|
|
181
182
|
return
|
|
182
183
|
|
|
183
184
|
if run_id:
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
185
|
+
ids = run_id.split(',')
|
|
186
|
+
delete_runs(ctx, ids)
|
|
187
|
+
elif before:
|
|
188
|
+
print(f"Deleting runs before: {before}")
|
|
189
|
+
with __driver(ctx) as driver:
|
|
190
|
+
record= driver.execute_query(
|
|
191
|
+
"""MATCH (r:ETLRun) WHERE date(r.startTime) < date($before)
|
|
192
|
+
RETURN collect(r.uuid) AS ids
|
|
193
|
+
""",
|
|
194
|
+
result_transformer_=neo4j.Result.single,
|
|
195
|
+
before=before)
|
|
196
|
+
ids = record[0]
|
|
197
|
+
delete_runs(ctx, ids)
|
|
198
|
+
|
|
187
199
|
elif older:
|
|
188
200
|
print(f"Deleting runs older than: {older}")
|
|
189
|
-
|
|
201
|
+
with __driver(ctx) as driver:
|
|
202
|
+
record = driver.execute_query(
|
|
203
|
+
"""MATCH (r:ETLRun) WHERE date(r.startTime) < (date() - duration({days: $days}))
|
|
204
|
+
RETURN collect(r.uuid) AS ids
|
|
205
|
+
""",
|
|
206
|
+
result_transformer_=neo4j.Result.single,
|
|
207
|
+
days=older)
|
|
208
|
+
ids = record[0]
|
|
209
|
+
delete_runs(ctx, ids)
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def delete_runs(ctx, ids):
|
|
213
|
+
print(f"Deleting run IDs: {ids}")
|
|
214
|
+
with __driver(ctx) as driver:
|
|
215
|
+
records, _, _ = driver.execute_query(
|
|
216
|
+
"""
|
|
217
|
+
MATCH (r:ETLRun)-[*]->(n) WHERE r.uuid IN $ids
|
|
218
|
+
DETACH DELETE n
|
|
219
|
+
DETACH DELETE r
|
|
220
|
+
""", ids=ids, routing_=neo4j.RoutingControl.WRITE)
|
|
221
|
+
print(f"Deleted run IDs: {ids} successfully")
|
|
@@ -3,7 +3,7 @@ from datetime import datetime
|
|
|
3
3
|
|
|
4
4
|
from tabulate import tabulate
|
|
5
5
|
|
|
6
|
-
from etl_lib.core.Task import Task, TaskGroup
|
|
6
|
+
from etl_lib.core.Task import Task, TaskGroup, TaskReturn
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
class ProgressReporter:
|
|
@@ -47,7 +47,7 @@ class ProgressReporter:
|
|
|
47
47
|
self.logger.info(f"{'\t' * task.depth}starting {task.task_name()}")
|
|
48
48
|
return task
|
|
49
49
|
|
|
50
|
-
def finished_task(self, task: Task,
|
|
50
|
+
def finished_task(self, task: Task, result: TaskReturn) -> Task:
|
|
51
51
|
"""
|
|
52
52
|
Marks the task as finished.
|
|
53
53
|
|
|
@@ -55,23 +55,21 @@ class ProgressReporter:
|
|
|
55
55
|
|
|
56
56
|
Args:
|
|
57
57
|
task: Task to be marked as finished.
|
|
58
|
-
|
|
59
|
-
summery: statistics for this task (such as `nodes_created`)
|
|
60
|
-
error: If an exception occurred, the exception text should be provided here.
|
|
58
|
+
result: result of the task execution, such as status and summery information.
|
|
61
59
|
|
|
62
60
|
Returns:
|
|
63
61
|
Task to be marked as started.
|
|
64
62
|
"""
|
|
65
63
|
task.end_time = datetime.now()
|
|
66
|
-
task.success = success
|
|
67
|
-
task.summery = summery
|
|
64
|
+
task.success = result.success
|
|
65
|
+
task.summery = result.summery
|
|
68
66
|
|
|
69
|
-
report = f"{'\t' * task.depth} finished {task.task_name()} in {task.end_time - task.start_time} with success: {success}"
|
|
70
|
-
if error is not None:
|
|
71
|
-
report += f", error: \n{error}"
|
|
67
|
+
report = f"{'\t' * task.depth} finished {task.task_name()} in {task.end_time - task.start_time} with success: {result.success}"
|
|
68
|
+
if result.error is not None:
|
|
69
|
+
report += f", error: \n{result.error}"
|
|
72
70
|
else:
|
|
73
71
|
# for the logger, remove entries with 0, but keep them in the original for reporting
|
|
74
|
-
cleaned_summery = {key: value for key, value in summery.items() if value != 0}
|
|
72
|
+
cleaned_summery = {key: value for key, value in result.summery.items() if value != 0}
|
|
75
73
|
if len(cleaned_summery) > 0:
|
|
76
74
|
report += f"\n{tabulate([cleaned_summery], headers='keys', tablefmt='psql')}"
|
|
77
75
|
self.logger.info(report)
|
|
@@ -168,9 +166,9 @@ class Neo4jProgressReporter(ProgressReporter):
|
|
|
168
166
|
start_time=task.start_time)
|
|
169
167
|
return task
|
|
170
168
|
|
|
171
|
-
def finished_task(self, task: Task,
|
|
172
|
-
super().finished_task(task=task,
|
|
173
|
-
if success:
|
|
169
|
+
def finished_task(self, task: Task, result: TaskReturn) -> Task:
|
|
170
|
+
super().finished_task(task=task, result=result)
|
|
171
|
+
if result.success:
|
|
174
172
|
status = "success"
|
|
175
173
|
else:
|
|
176
174
|
status = "failure"
|
|
@@ -179,7 +177,7 @@ class Neo4jProgressReporter(ProgressReporter):
|
|
|
179
177
|
MATCH (t:ETLTask {uuid:$id}) SET t.endTime = $end_time, t.status = $status, t.error = $error
|
|
180
178
|
CREATE (s:ETLStats) SET s=$summery
|
|
181
179
|
CREATE (t)-[:HAS_STATS]->(s)
|
|
182
|
-
""", id=task.uuid, end_time=task.end_time, summery=summery, status=status, error=error)
|
|
180
|
+
""", id=task.uuid, end_time=task.end_time, summery=result.summery, status=status, error=result.error)
|
|
183
181
|
return task
|
|
184
182
|
|
|
185
183
|
def __create_constraints(self):
|
|
@@ -46,7 +46,8 @@ class TaskReturn:
|
|
|
46
46
|
|
|
47
47
|
# Combine success values and errors
|
|
48
48
|
combined_success = self.success and other.success
|
|
49
|
-
combined_error =
|
|
49
|
+
combined_error = None if not (self.error or other.error) \
|
|
50
|
+
else f"{self.error or ''} | {other.error or ''}".strip(" |")
|
|
50
51
|
|
|
51
52
|
return TaskReturn(
|
|
52
53
|
success=combined_success, summery=merged_summery, error=combined_error
|
|
@@ -99,12 +100,7 @@ class Task:
|
|
|
99
100
|
except Exception as e:
|
|
100
101
|
result = TaskReturn(success=False, summery={}, error=str(e))
|
|
101
102
|
|
|
102
|
-
self.context.reporter.finished_task(
|
|
103
|
-
task=self,
|
|
104
|
-
success=result.success,
|
|
105
|
-
summery=result.summery,
|
|
106
|
-
error=result.error,
|
|
107
|
-
)
|
|
103
|
+
self.context.reporter.finished_task(task=self,result=result)
|
|
108
104
|
|
|
109
105
|
return result
|
|
110
106
|
|
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
from typing import Generator
|
|
1
|
+
from typing import Generator, Callable, Optional
|
|
2
|
+
|
|
3
|
+
from neo4j import Record
|
|
2
4
|
|
|
3
5
|
from etl_lib.core.BatchProcessor import BatchResults, BatchProcessor
|
|
4
6
|
from etl_lib.core.ETLContext import ETLContext
|
|
@@ -7,7 +9,14 @@ from etl_lib.core.Task import Task
|
|
|
7
9
|
|
|
8
10
|
class CypherBatchSource(BatchProcessor):
|
|
9
11
|
|
|
10
|
-
def __init__(
|
|
12
|
+
def __init__(
|
|
13
|
+
self,
|
|
14
|
+
context: ETLContext,
|
|
15
|
+
task: Task,
|
|
16
|
+
query: str,
|
|
17
|
+
record_transformer: Optional[Callable[[Record], dict]] = None,
|
|
18
|
+
**kwargs
|
|
19
|
+
):
|
|
11
20
|
"""
|
|
12
21
|
Constructs a new CypherBatchSource.
|
|
13
22
|
|
|
@@ -15,10 +24,12 @@ class CypherBatchSource(BatchProcessor):
|
|
|
15
24
|
context: :class:`etl_lib.core.ETLContext.ETLContext` instance.
|
|
16
25
|
task: :class:`etl_lib.core.Task.Task` instance owning this batchProcessor.
|
|
17
26
|
query: Cypher query to execute.
|
|
27
|
+
record_transformer: Optional function to transform each record. See Neo4j API documentation on `result_transformer_`
|
|
18
28
|
kwargs: Arguments passed as parameters with the query.
|
|
19
29
|
"""
|
|
20
30
|
super().__init__(context, task)
|
|
21
31
|
self.query = query
|
|
32
|
+
self.record_transformer = record_transformer
|
|
22
33
|
self.kwargs = kwargs
|
|
23
34
|
|
|
24
35
|
def __read_records(self, tx, batch_size):
|
|
@@ -26,7 +37,11 @@ class CypherBatchSource(BatchProcessor):
|
|
|
26
37
|
result = tx.run(self.query, **self.kwargs)
|
|
27
38
|
|
|
28
39
|
for record in result:
|
|
29
|
-
|
|
40
|
+
data = record.data()
|
|
41
|
+
if self.record_transformer:
|
|
42
|
+
data = self.record_transformer(data)
|
|
43
|
+
batch_.append(data)
|
|
44
|
+
|
|
30
45
|
if len(batch_) == batch_size:
|
|
31
46
|
yield batch_
|
|
32
47
|
batch_ = []
|
|
@@ -24,10 +24,10 @@ class ExecuteCypherTask(Task):
|
|
|
24
24
|
for query in self._query():
|
|
25
25
|
result = self.context.neo4j.query_database(session=session, query=query, **kwargs)
|
|
26
26
|
stats = merge_summery(stats, result.summery)
|
|
27
|
-
return TaskReturn(True, stats)
|
|
27
|
+
return TaskReturn(success=True, summery=stats)
|
|
28
28
|
else:
|
|
29
29
|
result = self.context.neo4j.query_database(session=session, query=self._query(), **kwargs)
|
|
30
|
-
return TaskReturn(True, result.summery)
|
|
30
|
+
return TaskReturn(success=True, summery=result.summery)
|
|
31
31
|
|
|
32
32
|
@abc.abstractmethod
|
|
33
33
|
def _query(self) -> str | list[str]:
|
|
@@ -125,7 +125,7 @@ class DummyReporter:
|
|
|
125
125
|
def started_task(self, task: Task) -> Task:
|
|
126
126
|
pass
|
|
127
127
|
|
|
128
|
-
def finished_task(self, task,
|
|
128
|
+
def finished_task(self, task, result) -> Task:
|
|
129
129
|
pass
|
|
130
130
|
|
|
131
131
|
def report_progress(self, task, batches: int, expected_batches: int, stats: dict) -> None:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/task/CreateReportingConstraintsTask.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{neo4j_etl_lib-0.1.0 → neo4j_etl_lib-0.1.1}/src/etl_lib/task/data_loading/CSVLoad2Neo4jTask.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|