neo4j-etl-lib 0.0.2__tar.gz → 0.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neo4j_etl_lib-0.0.3/PKG-INFO +53 -0
- neo4j_etl_lib-0.0.3/README.md +14 -0
- {neo4j_etl_lib-0.0.2 → neo4j_etl_lib-0.0.3}/pyproject.toml +2 -1
- {neo4j_etl_lib-0.0.2 → neo4j_etl_lib-0.0.3}/src/etl_lib/__init__.py +1 -1
- {neo4j_etl_lib-0.0.2 → neo4j_etl_lib-0.0.3}/src/etl_lib/cli/run_tools.py +1 -1
- {neo4j_etl_lib-0.0.2 → neo4j_etl_lib-0.0.3}/src/etl_lib/core/BatchProcessor.py +1 -2
- {neo4j_etl_lib-0.0.2 → neo4j_etl_lib-0.0.3}/src/etl_lib/core/ETLContext.py +31 -5
- {neo4j_etl_lib-0.0.2 → neo4j_etl_lib-0.0.3}/src/etl_lib/core/ProgressReporter.py +4 -4
- {neo4j_etl_lib-0.0.2 → neo4j_etl_lib-0.0.3}/src/etl_lib/core/Task.py +0 -3
- neo4j_etl_lib-0.0.3/src/etl_lib/core/utils.py +28 -0
- {neo4j_etl_lib-0.0.2 → neo4j_etl_lib-0.0.3}/src/etl_lib/data_source/CSVBatchProcessor.py +13 -21
- neo4j_etl_lib-0.0.3/src/etl_lib/task/CreateReportingConstraintsTask.py +17 -0
- {neo4j_etl_lib-0.0.2 → neo4j_etl_lib-0.0.3}/src/etl_lib/task/ExecuteCypherTask.py +5 -0
- {neo4j_etl_lib-0.0.2 → neo4j_etl_lib-0.0.3}/src/etl_lib/task/data_loading/CSVLoad2Neo4jTask.py +1 -1
- neo4j_etl_lib-0.0.2/.env.sample +0 -7
- neo4j_etl_lib-0.0.2/.gitignore +0 -9
- neo4j_etl_lib-0.0.2/PKG-INFO +0 -126
- neo4j_etl_lib-0.0.2/README.md +0 -88
- neo4j_etl_lib-0.0.2/dashboard.json +0 -190
- neo4j_etl_lib-0.0.2/docs/Makefile +0 -23
- neo4j_etl_lib-0.0.2/docs/README.md +0 -16
- neo4j_etl_lib-0.0.2/docs/_static/images/schema.json +0 -510
- neo4j_etl_lib-0.0.2/docs/_static/images/schema.png +0 -0
- neo4j_etl_lib-0.0.2/docs/_static/pydata-custom.css +0 -10
- neo4j_etl_lib-0.0.2/docs/_static/readthedocs-custom.css +0 -30
- neo4j_etl_lib-0.0.2/docs/_templates/custom-class-template.rst +0 -34
- neo4j_etl_lib-0.0.2/docs/_templates/custom-module-template.rst +0 -66
- neo4j_etl_lib-0.0.2/docs/api.rst +0 -12
- neo4j_etl_lib-0.0.2/docs/conf.py +0 -95
- neo4j_etl_lib-0.0.2/docs/index.rst +0 -24
- neo4j_etl_lib-0.0.2/pytest.ini +0 -10
- neo4j_etl_lib-0.0.2/src/etl_lib/core/utils.py +0 -7
- {neo4j_etl_lib-0.0.2 → neo4j_etl_lib-0.0.3}/LICENSE +0 -0
- {neo4j_etl_lib-0.0.2 → neo4j_etl_lib-0.0.3}/src/etl_lib/cli/__init__.py +0 -0
- {neo4j_etl_lib-0.0.2 → neo4j_etl_lib-0.0.3}/src/etl_lib/core/ClosedLoopBatchProcessor.py +0 -0
- {neo4j_etl_lib-0.0.2 → neo4j_etl_lib-0.0.3}/src/etl_lib/core/ValidationBatchProcessor.py +0 -0
- {neo4j_etl_lib-0.0.2 → neo4j_etl_lib-0.0.3}/src/etl_lib/core/__init__.py +0 -0
- {neo4j_etl_lib-0.0.2 → neo4j_etl_lib-0.0.3}/src/etl_lib/data_sink/CypherBatchProcessor.py +0 -0
- {neo4j_etl_lib-0.0.2 → neo4j_etl_lib-0.0.3}/src/etl_lib/data_sink/__init__.py +0 -0
- {neo4j_etl_lib-0.0.2 → neo4j_etl_lib-0.0.3}/src/etl_lib/data_source/__init__.py +0 -0
- {neo4j_etl_lib-0.0.2 → neo4j_etl_lib-0.0.3}/src/etl_lib/task/GDSTask.py +0 -0
- {neo4j_etl_lib-0.0.2 → neo4j_etl_lib-0.0.3}/src/etl_lib/task/__init__.py +0 -0
- {neo4j_etl_lib-0.0.2 → neo4j_etl_lib-0.0.3}/src/etl_lib/task/data_loading/__init__.py +0 -0
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: neo4j-etl-lib
|
|
3
|
+
Version: 0.0.3
|
|
4
|
+
Summary: Building blocks for ETL pipelines.
|
|
5
|
+
Keywords: etl,graph,database
|
|
6
|
+
Author-email: Bert Radke <bert.radke@pm.me>
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Programming Language :: Python
|
|
12
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Topic :: Database
|
|
15
|
+
Classifier: Development Status :: 4 - Beta
|
|
16
|
+
Requires-Dist: pydantic>=2.10.5; python_version >= '3.8'
|
|
17
|
+
Requires-Dist: neo4j>=5.27.0; python_version >= '3.7'
|
|
18
|
+
Requires-Dist: python-dotenv>=1.0.1; python_version >= '3.8'
|
|
19
|
+
Requires-Dist: tabulate>=0.9.0; python_version >= '3.7'
|
|
20
|
+
Requires-Dist: click>=8.1.8; python_version >= '3.7'
|
|
21
|
+
Requires-Dist: pytest>=8.3.0 ; extra == "dev" and ( python_version >= '3.8')
|
|
22
|
+
Requires-Dist: testcontainers[neo4j]==4.9.0 ; extra == "dev" and ( python_version >= '3.9' and python_version < '4.0')
|
|
23
|
+
Requires-Dist: pytest-cov ; extra == "dev"
|
|
24
|
+
Requires-Dist: bumpver ; extra == "dev"
|
|
25
|
+
Requires-Dist: isort ; extra == "dev"
|
|
26
|
+
Requires-Dist: pip-tools ; extra == "dev"
|
|
27
|
+
Requires-Dist: sphinx ; extra == "dev"
|
|
28
|
+
Requires-Dist: sphinx-rtd-theme ; extra == "dev"
|
|
29
|
+
Requires-Dist: pydata-sphinx-theme ; extra == "dev"
|
|
30
|
+
Requires-Dist: sphinx-autodoc-typehints ; extra == "dev"
|
|
31
|
+
Requires-Dist: sphinxcontrib-napoleon ; extra == "dev"
|
|
32
|
+
Requires-Dist: sphinx-autoapi ; extra == "dev"
|
|
33
|
+
Requires-Dist: graphdatascience>=1.13 ; extra == "gds" and ( python_version >= '3.9')
|
|
34
|
+
Project-URL: Documentation, https://neo-technology-field.github.io/python-etl-lib/index.html
|
|
35
|
+
Project-URL: Home, https://github.com/neo-technology-field/python-etl-lib
|
|
36
|
+
Provides-Extra: dev
|
|
37
|
+
Provides-Extra: gds
|
|
38
|
+
|
|
39
|
+
# Neo4j ETL Toolbox
|
|
40
|
+
|
|
41
|
+
A Python library of building blocks to assemble etl pipelines.
|
|
42
|
+
|
|
43
|
+
Complete documentation can be found on https://neo-technology-field.github.io/python-etl-lib/index.html
|
|
44
|
+
|
|
45
|
+
See https://github.com/neo-technology-field/python-etl-lib/tree/main/examples/gtfs for an example project.
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
The library can be installed via
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install neo4j-etl-lib
|
|
52
|
+
```
|
|
53
|
+
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# Neo4j ETL Toolbox
|
|
2
|
+
|
|
3
|
+
A Python library of building blocks to assemble etl pipelines.
|
|
4
|
+
|
|
5
|
+
Complete documentation can be found on https://neo-technology-field.github.io/python-etl-lib/index.html
|
|
6
|
+
|
|
7
|
+
See https://github.com/neo-technology-field/python-etl-lib/tree/main/examples/gtfs for an example project.
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
The library can be installed via
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install neo4j-etl-lib
|
|
14
|
+
```
|
|
@@ -34,7 +34,8 @@ dev = [
|
|
|
34
34
|
"pytest>=8.3.0; python_version >= '3.8'",
|
|
35
35
|
"testcontainers[neo4j]==4.9.0; python_version >= '3.9' and python_version < '4.0'",
|
|
36
36
|
"pytest-cov", "bumpver", "isort", "pip-tools",
|
|
37
|
-
"sphinx", "sphinx-rtd-theme", "pydata-sphinx-theme", "sphinx-autodoc-typehints",
|
|
37
|
+
"sphinx", "sphinx-rtd-theme", "pydata-sphinx-theme", "sphinx-autodoc-typehints",
|
|
38
|
+
"sphinxcontrib-napoleon", "sphinx-autoapi"
|
|
38
39
|
]
|
|
39
40
|
gds = ["graphdatascience>=1.13; python_version >= '3.9'"]
|
|
40
41
|
|
|
@@ -155,7 +155,7 @@ def detail(ctx, run_id, details):
|
|
|
155
155
|
"status": record["status"],
|
|
156
156
|
"batches": record["batches"],
|
|
157
157
|
"duration": __duration_from_start_end(record["startTime"], record["endTime"]),
|
|
158
|
-
"changes":
|
|
158
|
+
"changes": record.get("changes", 0)
|
|
159
159
|
}
|
|
160
160
|
for record in records
|
|
161
161
|
]
|
|
@@ -4,7 +4,6 @@ import sys
|
|
|
4
4
|
from dataclasses import dataclass, field
|
|
5
5
|
from typing import Generator
|
|
6
6
|
|
|
7
|
-
from etl_lib.core.ETLContext import ETLContext
|
|
8
7
|
from etl_lib.core.Task import Task
|
|
9
8
|
from etl_lib.core.utils import merge_summery
|
|
10
9
|
|
|
@@ -53,7 +52,7 @@ class BatchProcessor:
|
|
|
53
52
|
and returned in batches to the caller. Usage of `Generators` ensure that not all data must be loaded at once.
|
|
54
53
|
"""
|
|
55
54
|
|
|
56
|
-
def __init__(self, context
|
|
55
|
+
def __init__(self, context, task: Task = None, predecessor=None):
|
|
57
56
|
"""
|
|
58
57
|
Constructs a new :py:class:`etl_lib.core.BatchProcessor` instance.
|
|
59
58
|
|
|
@@ -2,7 +2,7 @@ import logging
|
|
|
2
2
|
from typing import NamedTuple, Any
|
|
3
3
|
|
|
4
4
|
from graphdatascience import GraphDataScience
|
|
5
|
-
from neo4j import
|
|
5
|
+
from neo4j import GraphDatabase, WRITE_ACCESS, SummaryCounters
|
|
6
6
|
|
|
7
7
|
from etl_lib.core.ProgressReporter import get_reporter
|
|
8
8
|
|
|
@@ -20,18 +20,19 @@ def append_results(r1: QueryResult, r2: QueryResult) -> QueryResult:
|
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
class Neo4jContext:
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
database: str
|
|
23
|
+
"""
|
|
24
|
+
Holds the connection to the neo4j database and provides facilities to execute queries.
|
|
25
|
+
"""
|
|
27
26
|
|
|
28
27
|
def __init__(self, env_vars: dict):
|
|
29
28
|
"""
|
|
30
29
|
Create a new Neo4j context.
|
|
30
|
+
|
|
31
31
|
Reads the following env_vars keys:
|
|
32
32
|
- `NEO4J_URI`,
|
|
33
33
|
- `NEO4J_USERNAME`,
|
|
34
34
|
- `NEO4J_PASSWORD`.
|
|
35
|
+
- `NEO4J_DATABASE`,
|
|
35
36
|
"""
|
|
36
37
|
self.logger = logging.getLogger(self.__class__.__name__)
|
|
37
38
|
self.uri = env_vars["NEO4J_URI"]
|
|
@@ -43,6 +44,10 @@ class Neo4jContext:
|
|
|
43
44
|
def query_database(self, session, query, **kwargs) -> QueryResult:
|
|
44
45
|
"""
|
|
45
46
|
Executes a Cypher query on the Neo4j database.
|
|
47
|
+
|
|
48
|
+
Args:
|
|
49
|
+
session: Neo4j database session.
|
|
50
|
+
query: Cypher query either as a single query or as a list.
|
|
46
51
|
"""
|
|
47
52
|
if isinstance(query, list):
|
|
48
53
|
results = []
|
|
@@ -78,12 +83,33 @@ class Neo4jContext:
|
|
|
78
83
|
}
|
|
79
84
|
|
|
80
85
|
def session(self, database=None):
|
|
86
|
+
"""
|
|
87
|
+
Create a new Neo4j session in write mode, caller is responsible to close the session.
|
|
88
|
+
|
|
89
|
+
Args:
|
|
90
|
+
database: name of the database to use for this session. If not provided, the database name provided during
|
|
91
|
+
construction will be used.
|
|
92
|
+
|
|
93
|
+
Returns:
|
|
94
|
+
newly created Neo4j session.
|
|
95
|
+
|
|
96
|
+
"""
|
|
81
97
|
if database is None:
|
|
82
98
|
return self.driver.session(database=self.database, default_access_mode=WRITE_ACCESS)
|
|
83
99
|
else:
|
|
84
100
|
return self.driver.session(database=database, default_access_mode=WRITE_ACCESS)
|
|
85
101
|
|
|
86
102
|
def gds(self, database=None) -> GraphDataScience:
|
|
103
|
+
"""
|
|
104
|
+
Creates a new GraphDataScience client.
|
|
105
|
+
|
|
106
|
+
Args:
|
|
107
|
+
database: Name of the database to use for this dgs client.
|
|
108
|
+
If not provided, the database name provided during construction will be used.
|
|
109
|
+
|
|
110
|
+
Returns:
|
|
111
|
+
gds client.
|
|
112
|
+
"""
|
|
87
113
|
if database is None:
|
|
88
114
|
return GraphDataScience.from_neo4j_driver(driver=self.driver, database=self.database)
|
|
89
115
|
else:
|
|
@@ -66,7 +66,7 @@ class ProgressReporter:
|
|
|
66
66
|
task.success = success
|
|
67
67
|
task.summery = summery
|
|
68
68
|
|
|
69
|
-
report = f"{'\t' * task.depth}finished {task.task_name()} with success: {success}"
|
|
69
|
+
report = f"{'\t' * task.depth} finished {task.task_name()} in {task.end_time - task.start_time} with success: {success}"
|
|
70
70
|
if error is not None:
|
|
71
71
|
report += f", error: \n{error}"
|
|
72
72
|
else:
|
|
@@ -197,10 +197,10 @@ def get_reporter(context) -> ProgressReporter:
|
|
|
197
197
|
"""
|
|
198
198
|
Returns a ProgressReporter instance.
|
|
199
199
|
|
|
200
|
-
If the :
|
|
201
|
-
a :
|
|
200
|
+
If the :class:`ETLContext <etl_lib.core.ETLContext>` env holds the key `REPORTER_DATABASE` then
|
|
201
|
+
a :class:`Neo4jProgressReporter` instance is created with the given database name.
|
|
202
202
|
|
|
203
|
-
Otherwise, a :
|
|
203
|
+
Otherwise, a :class:`ProgressReporter` (no logging to database) instance will be created.
|
|
204
204
|
"""
|
|
205
205
|
|
|
206
206
|
db = context.env("REPORTER_DATABASE")
|
|
@@ -78,9 +78,6 @@ class Task:
|
|
|
78
78
|
"""Time when the :py:func:`~execute` has finished., `None` before."""
|
|
79
79
|
self.success: bool
|
|
80
80
|
"""True if the task has finished successful. False otherwise, `None` before the task has finished."""
|
|
81
|
-
self.summery: dict # TODO: still in use?
|
|
82
|
-
"""Summery statistics about the task performed, such as rows inserted, updated."""
|
|
83
|
-
self.error: str # TODO: still in use?
|
|
84
81
|
self.depth: int = 0
|
|
85
82
|
"""Level or depth of the task in the hierarchy. The root task is depth 0. Updated by the Reporter"""
|
|
86
83
|
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def merge_summery(summery_1: dict, summery_2: dict) -> dict:
|
|
5
|
+
"""
|
|
6
|
+
Helper function to merge dicts. Assuming that values are numbers.
|
|
7
|
+
If a key exists in both dicts, then the result will contain a key with the added values.
|
|
8
|
+
"""
|
|
9
|
+
return {i: summery_1.get(i, 0) + summery_2.get(i, 0)
|
|
10
|
+
for i in set(summery_1).union(summery_2)}
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def setup_logging(log_file=None):
|
|
14
|
+
"""
|
|
15
|
+
Set up logging to console and optionally to a log file.
|
|
16
|
+
|
|
17
|
+
:param log_file: Path to the log file
|
|
18
|
+
:type log_file: str, optional
|
|
19
|
+
"""
|
|
20
|
+
handlers = [logging.StreamHandler()]
|
|
21
|
+
if log_file:
|
|
22
|
+
handlers.append(logging.FileHandler(log_file))
|
|
23
|
+
|
|
24
|
+
logging.basicConfig(
|
|
25
|
+
level=logging.INFO,
|
|
26
|
+
format='%(asctime)s - %(levelname)s - %(message)s',
|
|
27
|
+
handlers=handlers
|
|
28
|
+
)
|
|
@@ -4,7 +4,6 @@ from pathlib import Path
|
|
|
4
4
|
from typing import Generator
|
|
5
5
|
|
|
6
6
|
from etl_lib.core.BatchProcessor import BatchProcessor, BatchResults
|
|
7
|
-
from etl_lib.core.ETLContext import ETLContext
|
|
8
7
|
from etl_lib.core.Task import Task
|
|
9
8
|
|
|
10
9
|
|
|
@@ -17,7 +16,7 @@ class CSVBatchProcessor(BatchProcessor):
|
|
|
17
16
|
starting with 0.
|
|
18
17
|
"""
|
|
19
18
|
|
|
20
|
-
def __init__(self, csv_file: Path, context
|
|
19
|
+
def __init__(self, csv_file: Path, context, task: Task = None, **kwargs):
|
|
21
20
|
"""
|
|
22
21
|
Constructs a new CSVBatchProcessor.
|
|
23
22
|
|
|
@@ -32,10 +31,10 @@ class CSVBatchProcessor(BatchProcessor):
|
|
|
32
31
|
self.kwargs = kwargs
|
|
33
32
|
|
|
34
33
|
def get_batch(self, max_batch__size: int) -> Generator[BatchResults]:
|
|
35
|
-
for batch_size, chunks_ in self.
|
|
34
|
+
for batch_size, chunks_ in self.__read_csv(self.csv_file, batch_size=max_batch__size, **self.kwargs):
|
|
36
35
|
yield BatchResults(chunk=chunks_, statistics={"csv_lines_read": batch_size}, batch_size=batch_size)
|
|
37
36
|
|
|
38
|
-
def
|
|
37
|
+
def __read_csv(self, file: Path, batch_size: int, **kwargs):
|
|
39
38
|
if file.suffix == ".gz":
|
|
40
39
|
with gzip.open(file, "rt", encoding='utf-8-sig') as f:
|
|
41
40
|
yield from self.__parse_csv(batch_size, file=f, **kwargs)
|
|
@@ -44,30 +43,23 @@ class CSVBatchProcessor(BatchProcessor):
|
|
|
44
43
|
yield from self.__parse_csv(batch_size, file=f, **kwargs)
|
|
45
44
|
|
|
46
45
|
def __parse_csv(self, batch_size, file, **kwargs):
|
|
47
|
-
|
|
48
|
-
|
|
46
|
+
"""Read CSV in batches without loading the entire file at once."""
|
|
47
|
+
csv_reader = csv.DictReader(file, **kwargs)
|
|
49
48
|
|
|
50
|
-
def __split_to_batches(self, source: [dict], batch_size):
|
|
51
|
-
"""
|
|
52
|
-
Splits the provided source into batches.
|
|
53
|
-
|
|
54
|
-
Args:
|
|
55
|
-
source: Anything that can be loop over, ideally, this should also be a generator
|
|
56
|
-
batch_size: desired batch size
|
|
57
|
-
|
|
58
|
-
Returns:
|
|
59
|
-
generator object to loop over the batches. Each batch is an Array.
|
|
60
|
-
"""
|
|
61
49
|
cnt = 0
|
|
62
50
|
batch_ = []
|
|
63
|
-
|
|
64
|
-
|
|
51
|
+
|
|
52
|
+
for row in csv_reader:
|
|
53
|
+
row["_row"] = cnt
|
|
65
54
|
cnt += 1
|
|
66
|
-
batch_.append(self.__clean_dict(
|
|
55
|
+
batch_.append(self.__clean_dict(row))
|
|
56
|
+
|
|
67
57
|
if len(batch_) == batch_size:
|
|
68
58
|
yield len(batch_), batch_
|
|
69
59
|
batch_ = []
|
|
70
|
-
|
|
60
|
+
|
|
61
|
+
# Yield any remaining data
|
|
62
|
+
if batch_:
|
|
71
63
|
yield len(batch_), batch_
|
|
72
64
|
|
|
73
65
|
def __clean_dict(self, input_dict):
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from etl_lib.core.Task import Task, TaskReturn
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class CreateReportingConstraintsTask(Task):
|
|
5
|
+
"""Creates the constraint in the REPORTER_DATABASE database."""
|
|
6
|
+
|
|
7
|
+
def __init__(self, config):
|
|
8
|
+
super().__init__(config)
|
|
9
|
+
|
|
10
|
+
def run_internal(self, **kwargs) -> TaskReturn:
|
|
11
|
+
database = self.context.env("REPORTER_DATABASE")
|
|
12
|
+
assert database is not None, "REPORTER_DATABASE needs to be set in order to run this task"
|
|
13
|
+
|
|
14
|
+
with self.context.neo4j.session(database) as session:
|
|
15
|
+
result = self.context.neo4j.query_database(session=session,
|
|
16
|
+
query="CREATE CONSTRAINT IF NOT EXISTS FOR (n:ETLTask) REQUIRE n.uuid IS UNIQUE")
|
|
17
|
+
return TaskReturn(True, result.summery)
|
|
@@ -6,7 +6,12 @@ from etl_lib.core.utils import merge_summery
|
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class ExecuteCypherTask(Task):
|
|
9
|
+
"""
|
|
10
|
+
Execute cypher (write) as a Task.
|
|
9
11
|
|
|
12
|
+
This task is for data refinement jobs, as it does not return cypher results.
|
|
13
|
+
Parameters can be passed as keyword arguments to the constructor and will be available as parameters inside cypher.
|
|
14
|
+
"""
|
|
10
15
|
def __init__(self, context: ETLContext):
|
|
11
16
|
super().__init__(context)
|
|
12
17
|
self.context = context
|
{neo4j_etl_lib-0.0.2 → neo4j_etl_lib-0.0.3}/src/etl_lib/task/data_loading/CSVLoad2Neo4jTask.py
RENAMED
|
@@ -13,7 +13,7 @@ from etl_lib.data_sink.CypherBatchProcessor import CypherBatchProcessor
|
|
|
13
13
|
from etl_lib.data_source.CSVBatchProcessor import CSVBatchProcessor
|
|
14
14
|
|
|
15
15
|
|
|
16
|
-
class
|
|
16
|
+
class CSVLoad2Neo4jTask(Task):
|
|
17
17
|
|
|
18
18
|
def __init__(self, context: ETLContext, model: Type[BaseModel], file: Path, batch_size: int = 5000):
|
|
19
19
|
super().__init__(context)
|
neo4j_etl_lib-0.0.2/.env.sample
DELETED
neo4j_etl_lib-0.0.2/.gitignore
DELETED
neo4j_etl_lib-0.0.2/PKG-INFO
DELETED
|
@@ -1,126 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.3
|
|
2
|
-
Name: neo4j-etl-lib
|
|
3
|
-
Version: 0.0.2
|
|
4
|
-
Summary: Building blocks for ETL pipelines.
|
|
5
|
-
Keywords: etl,graph,database
|
|
6
|
-
Author-email: Bert Radke <bert.radke@pm.me>
|
|
7
|
-
Requires-Python: >=3.10
|
|
8
|
-
Description-Content-Type: text/markdown
|
|
9
|
-
Classifier: License :: OSI Approved :: Apache Software License
|
|
10
|
-
Classifier: Intended Audience :: Developers
|
|
11
|
-
Classifier: Programming Language :: Python
|
|
12
|
-
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
13
|
-
Classifier: Programming Language :: Python :: 3
|
|
14
|
-
Classifier: Topic :: Database
|
|
15
|
-
Classifier: Development Status :: 4 - Beta
|
|
16
|
-
Requires-Dist: pydantic>=2.10.5; python_version >= '3.8'
|
|
17
|
-
Requires-Dist: neo4j>=5.27.0; python_version >= '3.7'
|
|
18
|
-
Requires-Dist: python-dotenv>=1.0.1; python_version >= '3.8'
|
|
19
|
-
Requires-Dist: tabulate>=0.9.0; python_version >= '3.7'
|
|
20
|
-
Requires-Dist: click>=8.1.8; python_version >= '3.7'
|
|
21
|
-
Requires-Dist: pytest>=8.3.0 ; extra == "dev" and ( python_version >= '3.8')
|
|
22
|
-
Requires-Dist: testcontainers[neo4j]==4.9.0 ; extra == "dev" and ( python_version >= '3.9' and python_version < '4.0')
|
|
23
|
-
Requires-Dist: pytest-cov ; extra == "dev"
|
|
24
|
-
Requires-Dist: bumpver ; extra == "dev"
|
|
25
|
-
Requires-Dist: isort ; extra == "dev"
|
|
26
|
-
Requires-Dist: pip-tools ; extra == "dev"
|
|
27
|
-
Requires-Dist: sphinx ; extra == "dev"
|
|
28
|
-
Requires-Dist: sphinx-rtd-theme ; extra == "dev"
|
|
29
|
-
Requires-Dist: pydata-sphinx-theme ; extra == "dev"
|
|
30
|
-
Requires-Dist: sphinx-autodoc-typehints ; extra == "dev"
|
|
31
|
-
Requires-Dist: sphinxcontrib-napoleon ; extra == "dev"
|
|
32
|
-
Requires-Dist: graphdatascience>=1.13 ; extra == "gds" and ( python_version >= '3.9')
|
|
33
|
-
Project-URL: Documentation, https://neo-technology-field.github.io/python-etl-lib/index.html
|
|
34
|
-
Project-URL: Home, https://github.com/neo-technology-field/python-etl-lib
|
|
35
|
-
Provides-Extra: dev
|
|
36
|
-
Provides-Extra: gds
|
|
37
|
-
|
|
38
|
-
# Python ETL Toolbox
|
|
39
|
-
|
|
40
|
-
Complete documentation can be found on https://neo-technology-field.github.io/python-etl-lib/index.html
|
|
41
|
-
|
|
42
|
-
A library of building blocks to assemble etl pipelines.
|
|
43
|
-
|
|
44
|
-
So, instead of providing yet another etl tool, the aim is to provide quality building blocks for the usual etl task. These building blocks should (do) meet the following functional requirements:
|
|
45
|
-
|
|
46
|
-
* logging (of tasks performed including times, errors, and statistics)
|
|
47
|
-
* error handling
|
|
48
|
-
* validation of data (currently via Pydantic)
|
|
49
|
-
* batching and streaming
|
|
50
|
-
* optionally record the information about performed tasks and provide means (NeoDash, console) to review past etl runs.
|
|
51
|
-
|
|
52
|
-
While this library currently focuses on Neo4j databases, it can be extended to other sources and sinks as needed.
|
|
53
|
-
|
|
54
|
-
It does not provide a CLI out of the box, but contains a set of functions to list and manage past runs (if they are stored in a database). In addition, the provided example illustrates how to assemble a etl pipeline and run it from a CLI.
|
|
55
|
-
|
|
56
|
-
## Quick guide
|
|
57
|
-
|
|
58
|
-
### Installation
|
|
59
|
-
|
|
60
|
-
Package is available on PyPi and can be installed (for development) via:
|
|
61
|
-
|
|
62
|
-
```bash
|
|
63
|
-
python3 -m venv venv
|
|
64
|
-
source venv/bin/activate
|
|
65
|
-
python -m pip install pip-tools
|
|
66
|
-
pip-compile --extra dev pyproject.toml
|
|
67
|
-
pip-sync
|
|
68
|
-
```
|
|
69
|
-
|
|
70
|
-
### Usage
|
|
71
|
-
|
|
72
|
-
The below shows a minimalistic etl pipeline to a single CSV file (look at the GTFS example to see more details)
|
|
73
|
-
|
|
74
|
-
```python
|
|
75
|
-
|
|
76
|
-
class LoadAgenciesTask(CSVLoad2Neo4jTasks):
|
|
77
|
-
|
|
78
|
-
class Agency(BaseModel):
|
|
79
|
-
""" Define the Pydantic model for data validation. """
|
|
80
|
-
id: str = Field(alias="agency_id", default="generic")
|
|
81
|
-
name: str = Field(alias="agency_name")
|
|
82
|
-
url: str = Field(alias="agency_url")
|
|
83
|
-
timezone: str = Field(alias="agency_timezone")
|
|
84
|
-
lang: str = Field(alias="agency_lang")
|
|
85
|
-
|
|
86
|
-
def __init__(self, context: ETLContext, file:Path):
|
|
87
|
-
super().__init__(context, LoadAgenciesTask.Agency, file)
|
|
88
|
-
|
|
89
|
-
def task_name(self) -> str:
|
|
90
|
-
return f"{self.__class__.__name__}('{self.file}')"
|
|
91
|
-
|
|
92
|
-
def _query(self):
|
|
93
|
-
"""Load the data into Neo4j."""
|
|
94
|
-
return """ UNWIND $batch AS row
|
|
95
|
-
MERGE (a:Agency {id: row.id})
|
|
96
|
-
SET a.name= row.name,
|
|
97
|
-
a.url= row.url,
|
|
98
|
-
a.timezone= row.timezone,
|
|
99
|
-
a.lang= row.lang
|
|
100
|
-
"""
|
|
101
|
-
|
|
102
|
-
@classmethod
|
|
103
|
-
def file_name(cls):
|
|
104
|
-
return "agency.txt"
|
|
105
|
-
|
|
106
|
-
context = ETLContext(env_vars=dict(os.environ))
|
|
107
|
-
|
|
108
|
-
schema = SchemaTask(context=context)
|
|
109
|
-
init_group = TaskGroup(context=context, tasks=[schema], name="schema-init")
|
|
110
|
-
|
|
111
|
-
tasks = [
|
|
112
|
-
LoadAgenciesTask(context=context, file=input_directory / LoadAgenciesTask.file_name()),
|
|
113
|
-
]
|
|
114
|
-
csv_group = TaskGroup(context=context, tasks=tasks, name="csv-loading")
|
|
115
|
-
|
|
116
|
-
all_group = TaskGroup(context=context, tasks=[init_group, csv_group], name="main")
|
|
117
|
-
|
|
118
|
-
context.reporter.register_tasks(all_group)
|
|
119
|
-
|
|
120
|
-
all_group.execute()
|
|
121
|
-
|
|
122
|
-
```
|
|
123
|
-
See the provided [example](examples/gtfs/README.md) for a more realistic pipeline and how the logging and reporting would look like.
|
|
124
|
-
|
|
125
|
-
With the above, all lines in the input file `agency.txt` that do not fit the Pydantic model, would be sent to an json file, containing the error data and a description of why it could not be loaded.
|
|
126
|
-
|
neo4j_etl_lib-0.0.2/README.md
DELETED
|
@@ -1,88 +0,0 @@
|
|
|
1
|
-
# Python ETL Toolbox
|
|
2
|
-
|
|
3
|
-
Complete documentation can be found on https://neo-technology-field.github.io/python-etl-lib/index.html
|
|
4
|
-
|
|
5
|
-
A library of building blocks to assemble etl pipelines.
|
|
6
|
-
|
|
7
|
-
So, instead of providing yet another etl tool, the aim is to provide quality building blocks for the usual etl task. These building blocks should (do) meet the following functional requirements:
|
|
8
|
-
|
|
9
|
-
* logging (of tasks performed including times, errors, and statistics)
|
|
10
|
-
* error handling
|
|
11
|
-
* validation of data (currently via Pydantic)
|
|
12
|
-
* batching and streaming
|
|
13
|
-
* optionally record the information about performed tasks and provide means (NeoDash, console) to review past etl runs.
|
|
14
|
-
|
|
15
|
-
While this library currently focuses on Neo4j databases, it can be extended to other sources and sinks as needed.
|
|
16
|
-
|
|
17
|
-
It does not provide a CLI out of the box, but contains a set of functions to list and manage past runs (if they are stored in a database). In addition, the provided example illustrates how to assemble a etl pipeline and run it from a CLI.
|
|
18
|
-
|
|
19
|
-
## Quick guide
|
|
20
|
-
|
|
21
|
-
### Installation
|
|
22
|
-
|
|
23
|
-
Package is available on PyPi and can be installed (for development) via:
|
|
24
|
-
|
|
25
|
-
```bash
|
|
26
|
-
python3 -m venv venv
|
|
27
|
-
source venv/bin/activate
|
|
28
|
-
python -m pip install pip-tools
|
|
29
|
-
pip-compile --extra dev pyproject.toml
|
|
30
|
-
pip-sync
|
|
31
|
-
```
|
|
32
|
-
|
|
33
|
-
### Usage
|
|
34
|
-
|
|
35
|
-
The below shows a minimalistic etl pipeline to a single CSV file (look at the GTFS example to see more details)
|
|
36
|
-
|
|
37
|
-
```python
|
|
38
|
-
|
|
39
|
-
class LoadAgenciesTask(CSVLoad2Neo4jTasks):
|
|
40
|
-
|
|
41
|
-
class Agency(BaseModel):
|
|
42
|
-
""" Define the Pydantic model for data validation. """
|
|
43
|
-
id: str = Field(alias="agency_id", default="generic")
|
|
44
|
-
name: str = Field(alias="agency_name")
|
|
45
|
-
url: str = Field(alias="agency_url")
|
|
46
|
-
timezone: str = Field(alias="agency_timezone")
|
|
47
|
-
lang: str = Field(alias="agency_lang")
|
|
48
|
-
|
|
49
|
-
def __init__(self, context: ETLContext, file:Path):
|
|
50
|
-
super().__init__(context, LoadAgenciesTask.Agency, file)
|
|
51
|
-
|
|
52
|
-
def task_name(self) -> str:
|
|
53
|
-
return f"{self.__class__.__name__}('{self.file}')"
|
|
54
|
-
|
|
55
|
-
def _query(self):
|
|
56
|
-
"""Load the data into Neo4j."""
|
|
57
|
-
return """ UNWIND $batch AS row
|
|
58
|
-
MERGE (a:Agency {id: row.id})
|
|
59
|
-
SET a.name= row.name,
|
|
60
|
-
a.url= row.url,
|
|
61
|
-
a.timezone= row.timezone,
|
|
62
|
-
a.lang= row.lang
|
|
63
|
-
"""
|
|
64
|
-
|
|
65
|
-
@classmethod
|
|
66
|
-
def file_name(cls):
|
|
67
|
-
return "agency.txt"
|
|
68
|
-
|
|
69
|
-
context = ETLContext(env_vars=dict(os.environ))
|
|
70
|
-
|
|
71
|
-
schema = SchemaTask(context=context)
|
|
72
|
-
init_group = TaskGroup(context=context, tasks=[schema], name="schema-init")
|
|
73
|
-
|
|
74
|
-
tasks = [
|
|
75
|
-
LoadAgenciesTask(context=context, file=input_directory / LoadAgenciesTask.file_name()),
|
|
76
|
-
]
|
|
77
|
-
csv_group = TaskGroup(context=context, tasks=tasks, name="csv-loading")
|
|
78
|
-
|
|
79
|
-
all_group = TaskGroup(context=context, tasks=[init_group, csv_group], name="main")
|
|
80
|
-
|
|
81
|
-
context.reporter.register_tasks(all_group)
|
|
82
|
-
|
|
83
|
-
all_group.execute()
|
|
84
|
-
|
|
85
|
-
```
|
|
86
|
-
See the provided [example](examples/gtfs/README.md) for a more realistic pipeline and how the logging and reporting would look like.
|
|
87
|
-
|
|
88
|
-
With the above, all lines in the input file `agency.txt` that do not fit the Pydantic model, would be sent to an json file, containing the error data and a description of why it could not be loaded.
|