neo4j-etl-lib 0.0.2__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
etl_lib/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
1
  """
2
2
  Building blocks for ETL pipelines.
3
3
  """
4
- __version__ = "0.0.2"
4
+ __version__ = "0.1.0"
etl_lib/cli/run_tools.py CHANGED
@@ -98,7 +98,7 @@ def cli(ctx, neo4j_uri, neo4j_user, neo4j_password, log_file, database_name):
98
98
  @click.pass_context
99
99
  def query(ctx, number_runs):
100
100
  """
101
- Retrieve the list of the last x etl runs from the database and display them.
101
+ Retrieve the list of the last x ETL runs from the database and display them.
102
102
  """
103
103
  print(f"Listing runs in database '{ctx.obj['database_name']}'")
104
104
  with __driver(ctx) as driver:
@@ -155,7 +155,7 @@ def detail(ctx, run_id, details):
155
155
  "status": record["status"],
156
156
  "batches": record["batches"],
157
157
  "duration": __duration_from_start_end(record["startTime"], record["endTime"]),
158
- "changes": sum(record.get("stats", {}).values())
158
+ "changes": record.get("changes", 0)
159
159
  }
160
160
  for record in records
161
161
  ]
@@ -4,7 +4,6 @@ import sys
4
4
  from dataclasses import dataclass, field
5
5
  from typing import Generator
6
6
 
7
- from etl_lib.core.ETLContext import ETLContext
8
7
  from etl_lib.core.Task import Task
9
8
  from etl_lib.core.utils import merge_summery
10
9
 
@@ -53,7 +52,7 @@ class BatchProcessor:
53
52
  and returned in batches to the caller. Usage of `Generators` ensures that not all data must be loaded at once.
54
53
  """
55
54
 
56
- def __init__(self, context: ETLContext, task: Task, predecessor=None):
55
+ def __init__(self, context, task: Task = None, predecessor=None):
57
56
  """
58
57
  Constructs a new :py:class:`etl_lib.core.BatchProcessor` instance.
59
58
 
@@ -2,7 +2,7 @@ import logging
2
2
  from typing import NamedTuple, Any
3
3
 
4
4
  from graphdatascience import GraphDataScience
5
- from neo4j import Driver, GraphDatabase, WRITE_ACCESS, SummaryCounters
5
+ from neo4j import GraphDatabase, WRITE_ACCESS, SummaryCounters
6
6
 
7
7
  from etl_lib.core.ProgressReporter import get_reporter
8
8
 
@@ -20,18 +20,19 @@ def append_results(r1: QueryResult, r2: QueryResult) -> QueryResult:
20
20
 
21
21
 
22
22
  class Neo4jContext:
23
- uri: str
24
- auth: (str, str)
25
- driver: Driver
26
- database: str
23
+ """
24
+ Holds the connection to the neo4j database and provides facilities to execute queries.
25
+ """
27
26
 
28
27
  def __init__(self, env_vars: dict):
29
28
  """
30
29
  Create a new Neo4j context.
30
+
31
31
  Reads the following env_vars keys:
32
32
  - `NEO4J_URI`,
33
33
  - `NEO4J_USERNAME`,
34
34
  - `NEO4J_PASSWORD`.
35
+ - `NEO4J_DATABASE`,
35
36
  """
36
37
  self.logger = logging.getLogger(self.__class__.__name__)
37
38
  self.uri = env_vars["NEO4J_URI"]
@@ -43,6 +44,10 @@ class Neo4jContext:
43
44
  def query_database(self, session, query, **kwargs) -> QueryResult:
44
45
  """
45
46
  Executes a Cypher query on the Neo4j database.
47
+
48
+ Args:
49
+ session: Neo4j database session.
50
+ query: Cypher query either as a single query or as a list.
46
51
  """
47
52
  if isinstance(query, list):
48
53
  results = []
@@ -78,12 +83,33 @@ class Neo4jContext:
78
83
  }
79
84
 
80
85
  def session(self, database=None):
86
+ """
87
+ Create a new Neo4j session in write mode. The caller is responsible for closing the session.
88
+
89
+ Args:
90
+ database: name of the database to use for this session. If not provided, the database name provided during
91
+ construction will be used.
92
+
93
+ Returns:
94
+ newly created Neo4j session.
95
+
96
+ """
81
97
  if database is None:
82
98
  return self.driver.session(database=self.database, default_access_mode=WRITE_ACCESS)
83
99
  else:
84
100
  return self.driver.session(database=database, default_access_mode=WRITE_ACCESS)
85
101
 
86
102
  def gds(self, database=None) -> GraphDataScience:
103
+ """
104
+ Creates a new GraphDataScience client.
105
+
106
+ Args:
107
+ database: Name of the database to use for this gds client.
108
+ If not provided, the database name provided during construction will be used.
109
+
110
+ Returns:
111
+ gds client.
112
+ """
87
113
  if database is None:
88
114
  return GraphDataScience.from_neo4j_driver(driver=self.driver, database=self.database)
89
115
  else:
@@ -104,8 +130,6 @@ class ETLContext:
104
130
  Will be passed to all :py:class:`etl_lib.core.Task` to provide access to environment variables and functionally
105
131
  deemed general enough that all parts of the ETL pipeline would need it.
106
132
  """
107
- neo4j: Neo4jContext
108
- __env_vars: dict
109
133
 
110
134
  def __init__(self, env_vars: dict):
111
135
  """
@@ -66,7 +66,7 @@ class ProgressReporter:
66
66
  task.success = success
67
67
  task.summery = summery
68
68
 
69
- report = f"{'\t' * task.depth}finished {task.task_name()} with success: {success}"
69
+ report = f"{'\t' * task.depth} finished {task.task_name()} in {task.end_time - task.start_time} with success: {success}"
70
70
  if error is not None:
71
71
  report += f", error: \n{error}"
72
72
  else:
@@ -197,10 +197,10 @@ def get_reporter(context) -> ProgressReporter:
197
197
  """
198
198
  Returns a ProgressReporter instance.
199
199
 
200
- If the :py:class:`ETLContext <etl_lib.core.ETLContext>` env holds the key `REPORTER_DATABASE` then
201
- a :py:class:`Neo4jProgressReporter` instance is created with the given database name.
200
+ If the :class:`ETLContext <etl_lib.core.ETLContext>` env holds the key `REPORTER_DATABASE` then
201
+ a :class:`Neo4jProgressReporter` instance is created with the given database name.
202
202
 
203
- Otherwise, a :py:class:`ProgressReporter` (no logging to database) instance will be created.
203
+ Otherwise, a :class:`ProgressReporter` (no logging to database) instance will be created.
204
204
  """
205
205
 
206
206
  db = context.env("REPORTER_DATABASE")
etl_lib/core/Task.py CHANGED
@@ -78,9 +78,6 @@ class Task:
78
78
  """Time when the :py:func:`~execute` has finished., `None` before."""
79
79
  self.success: bool
80
80
  """True if the task has finished successfully. False otherwise, `None` before the task has finished."""
81
- self.summery: dict # TODO: still in use?
82
- """Summery statistics about the task performed, such as rows inserted, updated."""
83
- self.error: str # TODO: still in use?
84
81
  self.depth: int = 0
85
82
  """Level or depth of the task in the hierarchy. The root task is depth 0. Updated by the Reporter"""
86
83
 
@@ -47,7 +47,7 @@ class ValidationBatchProcessor(BatchProcessor):
47
47
  for row in batch.chunk:
48
48
  try:
49
49
  # Validate and transform the row
50
- validated_row = self.model(**row).model_dump()
50
+ validated_row = json.loads(self.model(**row).model_dump_json())
51
51
  valid_rows.append(validated_row)
52
52
  except ValidationError as e:
53
53
  # Collect invalid rows with errors
etl_lib/core/utils.py CHANGED
@@ -1,3 +1,6 @@
1
+ import logging
2
+
3
+
1
4
  def merge_summery(summery_1: dict, summery_2: dict) -> dict:
2
5
  """
3
6
  Helper function to merge dicts. Assuming that values are numbers.
@@ -5,3 +8,21 @@ def merge_summery(summery_1: dict, summery_2: dict) -> dict:
5
8
  """
6
9
  return {i: summery_1.get(i, 0) + summery_2.get(i, 0)
7
10
  for i in set(summery_1).union(summery_2)}
11
+
12
+
13
+ def setup_logging(log_file=None):
14
+ """
15
+ Set up logging to console and optionally to a log file.
16
+
17
+ :param log_file: Path to the log file
18
+ :type log_file: str, optional
19
+ """
20
+ handlers = [logging.StreamHandler()]
21
+ if log_file:
22
+ handlers.append(logging.FileHandler(log_file))
23
+
24
+ logging.basicConfig(
25
+ level=logging.INFO,
26
+ format='%(asctime)s - %(levelname)s - %(message)s',
27
+ handlers=handlers
28
+ )
@@ -0,0 +1,57 @@
1
+ import csv
2
+ from pathlib import Path
3
+ from typing import Generator
4
+
5
+ from etl_lib.core.ETLContext import ETLContext
6
+ from etl_lib.core.BatchProcessor import BatchProcessor, BatchResults, append_result
7
+ from etl_lib.core.Task import Task
8
+
9
+
10
+ class CSVBatchSink(BatchProcessor):
11
+ """
12
+ BatchProcessor to write batches of data to a CSV file.
13
+ """
14
+
15
+ def __init__(self, context: ETLContext, task: Task, predecessor: BatchProcessor, file_path: Path, **kwargs):
16
+ """
17
+ Constructs a new CSVBatchSink.
18
+
19
+ Args:
20
+ context: :class:`etl_lib.core.ETLContext.ETLContext` instance.
21
+ task: :class:`etl_lib.core.Task.Task` instance owning this batchProcessor.
22
+ predecessor: BatchProcessor whose :func:`~get_batch` function will be called to receive batches to process.
23
+ file_path: Path to the CSV file where data will be written. If the file exists, data will be appended.
24
+ **kwargs: Additional arguments passed to `csv.DictWriter` to allow tuning the csv creation.
25
+ """
26
+ super().__init__(context, task, predecessor)
27
+ self.file_path = file_path
28
+ self.file_initialized = False
29
+ self.csv_kwargs = kwargs
30
+
31
+ def get_batch(self, batch_size: int) -> Generator[BatchResults, None, None]:
32
+ assert self.predecessor is not None
33
+
34
+ for batch_result in self.predecessor.get_batch(batch_size):
35
+ self._write_to_csv(batch_result.chunk)
36
+ yield append_result(batch_result, {"rows_written": len(batch_result.chunk)})
37
+
38
+ def _write_to_csv(self, data: list[dict]):
39
+ """
40
+ Writes a batch of data to the CSV file.
41
+
42
+ Args:
43
+ data: A list of dictionaries representing rows of data.
44
+ """
45
+ if not data:
46
+ return
47
+
48
+ fieldnames = data[0].keys()
49
+ write_header = not self.file_initialized or not self.file_path.exists()
50
+
51
+ with self.file_path.open(mode="a", newline="", encoding="utf-8") as csvfile:
52
+ writer = csv.DictWriter(csvfile, fieldnames=fieldnames, **self.csv_kwargs)
53
+ if write_header:
54
+ writer.writeheader()
55
+ writer.writerows(data)
56
+
57
+ self.file_initialized = True
@@ -5,19 +5,19 @@ from etl_lib.core.BatchProcessor import BatchProcessor, BatchResults, append_res
5
5
  from etl_lib.core.Task import Task
6
6
 
7
7
 
8
- class CypherBatchProcessor(BatchProcessor):
8
+ class CypherBatchSink(BatchProcessor):
9
9
  """
10
10
  BatchProcessor to write batches of data to a Neo4j database.
11
11
  """
12
12
 
13
13
  def __init__(self, context: ETLContext, task: Task, predecessor: BatchProcessor, query: str):
14
14
  """
15
- Constructs a new CypherBatchProcessor.
15
+ Constructs a new CypherBatchSink.
16
16
 
17
17
  Args:
18
- context: :py:class:`etl_lib.core.ETLContext.ETLContext` instance.
19
- task: :py:class:`etl_lib.core.Task.Task` instance owning this batchProcessor.
20
- predecessor: BatchProcessor which :py:func:`~get_batch` function will be called to receive batches to process.
18
+ context: :class:`etl_lib.core.ETLContext.ETLContext` instance.
19
+ task: :class:`etl_lib.core.Task.Task` instance owning this batchProcessor.
20
+ predecessor: BatchProcessor whose :func:`~get_batch` function will be called to receive batches to process.
21
21
  query: Cypher to write the query to Neo4j.
22
22
  Data will be passed as `batch` parameter.
23
23
  Therefore, the query should start with a `UNWIND $batch AS row`.
@@ -4,11 +4,10 @@ from pathlib import Path
4
4
  from typing import Generator
5
5
 
6
6
  from etl_lib.core.BatchProcessor import BatchProcessor, BatchResults
7
- from etl_lib.core.ETLContext import ETLContext
8
7
  from etl_lib.core.Task import Task
9
8
 
10
9
 
11
- class CSVBatchProcessor(BatchProcessor):
10
+ class CSVBatchSource(BatchProcessor):
12
11
  """
13
12
  BatchProcessor that reads a CSV file using the `csv` package.
14
13
 
@@ -17,13 +16,13 @@ class CSVBatchProcessor(BatchProcessor):
17
16
  starting with 0.
18
17
  """
19
18
 
20
- def __init__(self, csv_file: Path, context: ETLContext, task: Task, **kwargs):
19
+ def __init__(self, csv_file: Path, context, task: Task = None, **kwargs):
21
20
  """
22
- Constructs a new CSVBatchProcessor.
21
+ Constructs a new CSVBatchSource.
23
22
 
24
23
  Args:
25
24
  csv_file: Path to the CSV file.
26
- context: :py:class:`etl_lib.core.ETLContext.ETLContext` instance.
25
+ context: :class:`etl_lib.core.ETLContext.ETLContext` instance.
27
26
  kwargs: Will be passed on to the `csv.DictReader` providing a way to customise the reading to different
28
27
  csv formats.
29
28
  """
@@ -32,10 +31,10 @@ class CSVBatchProcessor(BatchProcessor):
32
31
  self.kwargs = kwargs
33
32
 
34
33
  def get_batch(self, max_batch__size: int) -> Generator[BatchResults]:
35
- for batch_size, chunks_ in self.read_csv(self.csv_file, batch_size=max_batch__size, **self.kwargs):
34
+ for batch_size, chunks_ in self.__read_csv(self.csv_file, batch_size=max_batch__size, **self.kwargs):
36
35
  yield BatchResults(chunk=chunks_, statistics={"csv_lines_read": batch_size}, batch_size=batch_size)
37
36
 
38
- def read_csv(self, file: Path, batch_size: int, **kwargs):
37
+ def __read_csv(self, file: Path, batch_size: int, **kwargs):
39
38
  if file.suffix == ".gz":
40
39
  with gzip.open(file, "rt", encoding='utf-8-sig') as f:
41
40
  yield from self.__parse_csv(batch_size, file=f, **kwargs)
@@ -44,30 +43,23 @@ class CSVBatchProcessor(BatchProcessor):
44
43
  yield from self.__parse_csv(batch_size, file=f, **kwargs)
45
44
 
46
45
  def __parse_csv(self, batch_size, file, **kwargs):
47
- csv_file = csv.DictReader(file, **kwargs)
48
- yield from self.__split_to_batches(csv_file, batch_size)
46
+ """Read CSV in batches without loading the entire file at once."""
47
+ csv_reader = csv.DictReader(file, **kwargs)
49
48
 
50
- def __split_to_batches(self, source: [dict], batch_size):
51
- """
52
- Splits the provided source into batches.
53
-
54
- Args:
55
- source: Anything that can be loop over, ideally, this should also be a generator
56
- batch_size: desired batch size
57
-
58
- Returns:
59
- generator object to loop over the batches. Each batch is an Array.
60
- """
61
49
  cnt = 0
62
50
  batch_ = []
63
- for i in source:
64
- i["_row"] = cnt
51
+
52
+ for row in csv_reader:
53
+ row["_row"] = cnt
65
54
  cnt += 1
66
- batch_.append(self.__clean_dict(i))
55
+ batch_.append(self.__clean_dict(row))
56
+
67
57
  if len(batch_) == batch_size:
68
58
  yield len(batch_), batch_
69
59
  batch_ = []
70
- if len(batch_) > 0:
60
+
61
+ # Yield any remaining data
62
+ if batch_:
71
63
  yield len(batch_), batch_
72
64
 
73
65
  def __clean_dict(self, input_dict):
@@ -0,0 +1,47 @@
1
+ from typing import Generator
2
+
3
+ from etl_lib.core.BatchProcessor import BatchResults, BatchProcessor
4
+ from etl_lib.core.ETLContext import ETLContext
5
+ from etl_lib.core.Task import Task
6
+
7
+
8
+ class CypherBatchSource(BatchProcessor):
9
+
10
+ def __init__(self, context: ETLContext, task: Task, query: str, **kwargs):
11
+ """
12
+ Constructs a new CypherBatchSource.
13
+
14
+ Args:
15
+ context: :class:`etl_lib.core.ETLContext.ETLContext` instance.
16
+ task: :class:`etl_lib.core.Task.Task` instance owning this batchProcessor.
17
+ query: Cypher query to execute.
18
+ kwargs: Arguments passed as parameters with the query.
19
+ """
20
+ super().__init__(context, task)
21
+ self.query = query
22
+ self.kwargs = kwargs
23
+
24
+ def __read_records(self, tx, batch_size):
25
+ batch_ = []
26
+ result = tx.run(self.query, **self.kwargs)
27
+
28
+ for record in result:
29
+ batch_.append(record.data())
30
+ if len(batch_) == batch_size:
31
+ yield batch_
32
+ batch_ = []
33
+
34
+ if batch_:
35
+ yield batch_
36
+
37
+ def get_batch(self, max_batch_size: int) -> Generator[BatchResults, None, None]:
38
+ # Not using a managed tx on purpose. First, we want to keep the tx open while delivering batches.
39
+ # Second, automatic retry logic would not help, as we do not want to start the query again.
40
+ with self.context.neo4j.session() as session:
41
+ with session.begin_transaction() as tx:
42
+ for chunk in self.__read_records(tx, max_batch_size):
43
+ yield BatchResults(
44
+ chunk=chunk,
45
+ statistics={"cypher_rows_read": len(chunk)},
46
+ batch_size=len(chunk)
47
+ )
@@ -0,0 +1,17 @@
1
+ from etl_lib.core.Task import Task, TaskReturn
2
+
3
+
4
+ class CreateReportingConstraintsTask(Task):
5
+ """Creates the constraint in the REPORTER_DATABASE database."""
6
+
7
+ def __init__(self, config):
8
+ super().__init__(config)
9
+
10
+ def run_internal(self, **kwargs) -> TaskReturn:
11
+ database = self.context.env("REPORTER_DATABASE")
12
+ assert database is not None, "REPORTER_DATABASE needs to be set in order to run this task"
13
+
14
+ with self.context.neo4j.session(database) as session:
15
+ result = self.context.neo4j.query_database(session=session,
16
+ query="CREATE CONSTRAINT IF NOT EXISTS FOR (n:ETLTask) REQUIRE n.uuid IS UNIQUE")
17
+ return TaskReturn(True, result.summery)
@@ -6,7 +6,12 @@ from etl_lib.core.utils import merge_summery
6
6
 
7
7
 
8
8
  class ExecuteCypherTask(Task):
9
+ """
10
+ Execute cypher (write) as a Task.
9
11
 
12
+ This task is for data refinement jobs, as it does not return cypher results.
13
+ Parameters can be passed as keyword arguments to the constructor and will be available as parameters inside cypher.
14
+ """
10
15
  def __init__(self, context: ETLContext):
11
16
  super().__init__(context)
12
17
  self.context = context
@@ -9,12 +9,24 @@ from etl_lib.core.ETLContext import ETLContext
9
9
  from etl_lib.core.ClosedLoopBatchProcessor import ClosedLoopBatchProcessor
10
10
  from etl_lib.core.Task import Task, TaskReturn
11
11
  from etl_lib.core.ValidationBatchProcessor import ValidationBatchProcessor
12
- from etl_lib.data_sink.CypherBatchProcessor import CypherBatchProcessor
13
- from etl_lib.data_source.CSVBatchProcessor import CSVBatchProcessor
12
+ from etl_lib.data_sink.CypherBatchSink import CypherBatchSink
13
+ from etl_lib.data_source.CSVBatchSource import CSVBatchSource
14
14
 
15
15
 
16
- class CSVLoad2Neo4jTasks(Task):
16
+ class CSVLoad2Neo4jTask(Task):
17
+ """
18
+ Loads the specified CSV file to Neo4j.
17
19
 
20
+ Uses BatchProcessors to read, validate and write to Neo4j.
21
+ The validation step is using pydantic, hence a Pydantic model needs to be provided.
22
+ Rows that fail the validation will be written to an error file. The location of the error file is determined as
23
+ follows:
24
+
25
+ If the context env vars hold an entry `ETL_ERROR_PATH` the file will be placed there, with the name set to the name
26
+ of the provided filename appended with `.error.json`
27
+
28
+ If `ETL_ERROR_PATH` is not set, the file will be placed in the same directory as the CSV file.
29
+ """
18
30
  def __init__(self, context: ETLContext, model: Type[BaseModel], file: Path, batch_size: int = 5000):
19
31
  super().__init__(context)
20
32
  self.batch_size = batch_size
@@ -23,11 +35,15 @@ class CSVLoad2Neo4jTasks(Task):
23
35
  self.file = file
24
36
 
25
37
  def run_internal(self, **kwargs) -> TaskReturn:
26
- error_file = self.file.with_suffix(".error.json")
38
+ error_path = self.context.env("ETL_ERROR_PATH")
39
+ if error_path is None:
40
+ error_file = self.file.with_suffix(".error.json")
41
+ else:
42
+ error_file = error_path / self.file.with_name(self.file.stem + ".error.json").name
27
43
 
28
- csv = CSVBatchProcessor(self.file, self.context, self)
44
+ csv = CSVBatchSource(self.file, self.context, self)
29
45
  validator = ValidationBatchProcessor(self.context, self, csv, self.model, error_file)
30
- cypher = CypherBatchProcessor(self.context, self, validator, self._query())
46
+ cypher = CypherBatchSink(self.context, self, validator, self._query())
31
47
  end = ClosedLoopBatchProcessor(self.context, self, cypher)
32
48
  result = next(end.get_batch(self.batch_size))
33
49
 
File without changes
@@ -0,0 +1,153 @@
1
+ import logging
2
+ import os
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ from _pytest.tmpdir import tmp_path
7
+ from neo4j import Driver
8
+ from neo4j.time import Date
9
+
10
+ from etl_lib.core.ETLContext import QueryResult, Neo4jContext, ETLContext
11
+ from etl_lib.core.Task import Task
12
+
13
+
14
+ def run_query(driver, query, data):
15
+ with driver.session(database=get_database_name()) as session:
16
+ result = session.run(query, data=data)
17
+ return result.data()
18
+
19
+
20
+ def get_node_count(driver, label: str) -> int:
21
+ """Get the count of nodes with the specified label"""
22
+ query = f"MATCH (n:{label}) RETURN COUNT(n) AS count"
23
+ with driver.session(database=get_database_name()) as session:
24
+ result = session.run(query)
25
+ return result.single()["count"]
26
+
27
+
28
+ def get_relationship_count(driver, rel_type: str) -> int:
29
+ """Get the count of relationships with the specified type"""
30
+ query = f"MATCH ()-[r:{rel_type}]->() RETURN COUNT(r) AS count"
31
+ with driver.session(database=get_database_name()) as session:
32
+ result = session.run(query)
33
+ return result.single()["count"]
34
+
35
+
36
+ def check_property_exists(driver, label: str, property_name: str) -> bool:
37
+ """
38
+ Checks if all nodes with the given label have the given property.
39
+ :return:
40
+ """
41
+ with driver.session(database=get_database_name()) as session:
42
+ result = session.run(f"MATCH (n:{label}) WHERE n.{property_name} IS NULL"
43
+ f" RETURN COUNT(n) = 0 AS exists")
44
+ return result.single()["exists"]
45
+
46
+
47
+ def get_graph(driver):
48
+ """
49
+ Return a graph representation of all data in the database.
50
+ The returned structure is an array of dicts. Each dict has the following keys:
51
+ `start`, `end`, and `rel` representing each relationship found in the graph.
52
+ Use the following query to generate this structure from a known good graph:
53
+
54
+ MATCH (s)-[r]->(e)
55
+ WITH {labels:labels(s), props:properties(s)} AS start, {type:type(r), props:properties(r)} AS rel, {labels:labels(e), props:properties(e)} AS end
56
+ RETURN {start:start, rel:rel, end:end}
57
+ """
58
+ with driver.session(database=get_database_name()) as session:
59
+ records = session.run(
60
+ """
61
+ MATCH (s)-[r]->(e)
62
+ WITH {labels:labels(s), props:properties(s)} AS start,
63
+ {type:type(r), props:properties(r)} AS rel,
64
+ {labels:labels(e), props:properties(e)} AS end
65
+ RETURN {start:start, rel:rel, end:end} AS graph
66
+ """
67
+ )
68
+ data = [record.data()["graph"] for record in records]
69
+ return convert_neo4j_date_to_string(data, "%Y-%m-%d")
70
+
71
+
72
+ def convert_neo4j_date_to_string(data, date_format):
73
+ """
74
+ Recursively converts all neo4j.time.Date instances in a dictionary into strings using the provided format.
75
+
76
+ :param data: The input dictionary or list to process.
77
+ :param date_format: A format string compatible with Python's strftime.
78
+ :return: The processed dictionary or list with dates converted to strings.
79
+ """
80
+ if isinstance(data, dict):
81
+ return {key: convert_neo4j_date_to_string(value, date_format) for key, value in data.items()}
82
+ elif isinstance(data, list):
83
+ return [convert_neo4j_date_to_string(item, date_format) for item in data]
84
+ elif isinstance(data, Date):
85
+ return data.to_native().strftime(date_format)
86
+ else:
87
+ return data
88
+
89
+
90
+ def get_database_name():
91
+ if os.getenv("NEO4J_TEST_CONTAINER") is None:
92
+ # not running with test containers. expect test db to be set
93
+ if os.getenv("NEO4J_TEST_DATABASE") is not None:
94
+ return os.getenv("NEO4J_TEST_DATABASE")
95
+ else:
96
+ raise Exception("define NEO4J_TEST_DATABASE environment variable")
97
+
98
+
99
+ class TestNeo4jContext(Neo4jContext):
100
+
101
+ def __init__(self, driver: Driver):
102
+ self.logger = logging.getLogger(self.__class__.__name__)
103
+ self.driver = driver
104
+ self.database = get_database_name()
105
+
106
+
107
+ class TestETLContext(ETLContext):
108
+
109
+ def __init__(self, driver: Driver, tmp_path):
110
+ self.logger = logging.getLogger(self.__class__.__name__)
111
+ self.__env_vars = {"ETL_ERROR_PATH": tmp_path}
112
+ self.neo4j = TestNeo4jContext(driver)
113
+ self.reporter = DummyReporter()
114
+
115
+ def env(self, key: str) -> Any:
116
+ if key in self.__env_vars:
117
+ return self.__env_vars[key]
118
+
119
+
120
+ class DummyReporter:
121
+
122
+ def register_tasks(self, main: Task):
123
+ pass
124
+
125
+ def started_task(self, task: Task) -> Task:
126
+ pass
127
+
128
+ def finished_task(self, task, success: bool, summery: dict, error: str = None) -> Task:
129
+ pass
130
+
131
+ def report_progress(self, task, batches: int, expected_batches: int, stats: dict) -> None:
132
+ pass
133
+
134
+
135
+ class DummyNeo4jContext:
136
+
137
+ def query_database(self, session, query, **kwargs) -> QueryResult:
138
+ return QueryResult([], {})
139
+
140
+ def session(self, database=None):
141
+ return None
142
+
143
+
144
+ class DummyContext:
145
+ neo4j: DummyNeo4jContext
146
+ __env_vars: dict
147
+ path_error: Path
148
+ path_import: Path
149
+ path_processed: Path
150
+ reporter = DummyReporter()
151
+
152
+ def env(self, key: str) -> Any:
153
+ pass
@@ -0,0 +1,54 @@
1
+ Metadata-Version: 2.4
2
+ Name: neo4j-etl-lib
3
+ Version: 0.1.0
4
+ Summary: Building blocks for ETL pipelines.
5
+ Keywords: etl,graph,database
6
+ Author-email: Bert Radke <bert.radke@pm.me>
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ Classifier: License :: OSI Approved :: Apache Software License
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Programming Language :: Python
12
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Topic :: Database
15
+ Classifier: Development Status :: 4 - Beta
16
+ License-File: LICENSE
17
+ Requires-Dist: pydantic>=2.10.5; python_version >= '3.8'
18
+ Requires-Dist: neo4j>=5.27.0; python_version >= '3.7'
19
+ Requires-Dist: python-dotenv>=1.0.1; python_version >= '3.8'
20
+ Requires-Dist: tabulate>=0.9.0; python_version >= '3.7'
21
+ Requires-Dist: click>=8.1.8; python_version >= '3.7'
22
+ Requires-Dist: pytest>=8.3.0 ; extra == "dev" and ( python_version >= '3.8')
23
+ Requires-Dist: testcontainers[neo4j]==4.9.0 ; extra == "dev" and ( python_version >= '3.9' and python_version < '4.0')
24
+ Requires-Dist: pytest-cov ; extra == "dev"
25
+ Requires-Dist: bumpver ; extra == "dev"
26
+ Requires-Dist: isort ; extra == "dev"
27
+ Requires-Dist: pip-tools ; extra == "dev"
28
+ Requires-Dist: sphinx ; extra == "dev"
29
+ Requires-Dist: sphinx-rtd-theme ; extra == "dev"
30
+ Requires-Dist: pydata-sphinx-theme ; extra == "dev"
31
+ Requires-Dist: sphinx-autodoc-typehints ; extra == "dev"
32
+ Requires-Dist: sphinxcontrib-napoleon ; extra == "dev"
33
+ Requires-Dist: sphinx-autoapi ; extra == "dev"
34
+ Requires-Dist: graphdatascience>=1.13 ; extra == "gds" and ( python_version >= '3.9')
35
+ Project-URL: Documentation, https://neo-technology-field.github.io/python-etl-lib/index.html
36
+ Project-URL: Home, https://github.com/neo-technology-field/python-etl-lib
37
+ Provides-Extra: dev
38
+ Provides-Extra: gds
39
+
40
+ # Neo4j ETL Toolbox
41
+
42
+ A Python library of building blocks to assemble ETL pipelines.
43
+
44
+ Complete documentation can be found on https://neo-technology-field.github.io/python-etl-lib/index.html
45
+
46
+ See https://github.com/neo-technology-field/python-etl-lib/tree/main/examples/gtfs for an example project.
47
+
48
+
49
+ The library can be installed via
50
+
51
+ ```bash
52
+ pip install neo4j-etl-lib
53
+ ```
54
+
@@ -0,0 +1,29 @@
1
+ etl_lib/__init__.py,sha256=xZKM1gxoW-QX6-igG9rff50v2lL6MgnuhzOOxOORaUI,65
2
+ etl_lib/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ etl_lib/cli/run_tools.py,sha256=YMpa-WICon6mjuPuwyrtdBM9IiHdO9UPBYtA_y6UA0Y,7292
4
+ etl_lib/core/BatchProcessor.py,sha256=6quNPE9Dp8hYJDQDTqxQtxbQ3KCmb56Mko34EIsNhyI,3352
5
+ etl_lib/core/ClosedLoopBatchProcessor.py,sha256=unlx_A339oi2nOOXF0irrVf8j_GFhwcTuk_w5liqbWc,1321
6
+ etl_lib/core/ETLContext.py,sha256=ZTk_IDILpjUji0DphPUzTNx8k_2hZRxy37mqIcEA-kM,5641
7
+ etl_lib/core/ProgressReporter.py,sha256=z5aVBjDJZSNGr6zmY8DsMC6dzEcnhAV7RboHWJdl49g,8557
8
+ etl_lib/core/Task.py,sha256=qhCRYEJciYdaYzMurUTTzGQgm7UeKe0Ik37Fp-qAgr8,9256
9
+ etl_lib/core/ValidationBatchProcessor.py,sha256=EhO6PFQB-4PZgIOTXP4PwkbAl5HRK0zgTeKMseRU5QU,3261
10
+ etl_lib/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ etl_lib/core/utils.py,sha256=wwfyvy78fL6sqHdV0IFqAVyEkp6vo5Yo8gRZua2dulw,816
12
+ etl_lib/data_sink/CSVBatchSink.py,sha256=oq4VJwnA4WSyJzdvwstGv73vOEuWmPSfCynhVmxBByU,2204
13
+ etl_lib/data_sink/CypherBatchSink.py,sha256=RMuelUat55ojLQMRYmoiXG0D_fgWH0RLbmUd01UMv_c,1511
14
+ etl_lib/data_sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ etl_lib/data_source/CSVBatchSource.py,sha256=HILkaQAFua1OM7xgSNKY6teXpcJjWUPaS4Aol-GLYL8,2767
16
+ etl_lib/data_source/CypherBatchSource.py,sha256=Umyr5-eQ5vI7EFqjDhUTgSGzuUkglGKjYIWLpijdGrU,1752
17
+ etl_lib/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
+ etl_lib/task/CreateReportingConstraintsTask.py,sha256=aV5i1EwjfuG-eEGoNaB-NcaPhyu0NgdVhmZr5MIv8ak,760
19
+ etl_lib/task/ExecuteCypherTask.py,sha256=wpPF-bbawRiNS1cCXLhIwuXROAcXsv3OfdKc6DH5q2o,1252
20
+ etl_lib/task/GDSTask.py,sha256=X1E83wYa-N7AXy43WPEqIy77d__z-2wpBjWNhGNXJzA,1781
21
+ etl_lib/task/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
+ etl_lib/task/data_loading/CSVLoad2Neo4jTask.py,sha256=US9Sa6ytPPEa6BSVUBttlWdKzqyxlF-09If5XCf-LIE,2277
23
+ etl_lib/task/data_loading/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
+ etl_lib/test_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
+ etl_lib/test_utils/utils.py,sha256=G_qT2WHrBAnNNCmAjCZAgqPP0NseJzBDyBttYmSshQU,5150
26
+ neo4j_etl_lib-0.1.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
27
+ neo4j_etl_lib-0.1.0.dist-info/WHEEL,sha256=_2ozNFCLWc93bK4WKHCO-eDUENDlo-dgc9cU3qokYO4,82
28
+ neo4j_etl_lib-0.1.0.dist-info/METADATA,sha256=nk13cf2M1ErdY9fL0T2leYjJlkdXZOtnKpa-XMu8ifE,2210
29
+ neo4j_etl_lib-0.1.0.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: flit 3.10.1
2
+ Generator: flit 3.11.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,126 +0,0 @@
1
- Metadata-Version: 2.3
2
- Name: neo4j-etl-lib
3
- Version: 0.0.2
4
- Summary: Building blocks for ETL pipelines.
5
- Keywords: etl,graph,database
6
- Author-email: Bert Radke <bert.radke@pm.me>
7
- Requires-Python: >=3.10
8
- Description-Content-Type: text/markdown
9
- Classifier: License :: OSI Approved :: Apache Software License
10
- Classifier: Intended Audience :: Developers
11
- Classifier: Programming Language :: Python
12
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
13
- Classifier: Programming Language :: Python :: 3
14
- Classifier: Topic :: Database
15
- Classifier: Development Status :: 4 - Beta
16
- Requires-Dist: pydantic>=2.10.5; python_version >= '3.8'
17
- Requires-Dist: neo4j>=5.27.0; python_version >= '3.7'
18
- Requires-Dist: python-dotenv>=1.0.1; python_version >= '3.8'
19
- Requires-Dist: tabulate>=0.9.0; python_version >= '3.7'
20
- Requires-Dist: click>=8.1.8; python_version >= '3.7'
21
- Requires-Dist: pytest>=8.3.0 ; extra == "dev" and ( python_version >= '3.8')
22
- Requires-Dist: testcontainers[neo4j]==4.9.0 ; extra == "dev" and ( python_version >= '3.9' and python_version < '4.0')
23
- Requires-Dist: pytest-cov ; extra == "dev"
24
- Requires-Dist: bumpver ; extra == "dev"
25
- Requires-Dist: isort ; extra == "dev"
26
- Requires-Dist: pip-tools ; extra == "dev"
27
- Requires-Dist: sphinx ; extra == "dev"
28
- Requires-Dist: sphinx-rtd-theme ; extra == "dev"
29
- Requires-Dist: pydata-sphinx-theme ; extra == "dev"
30
- Requires-Dist: sphinx-autodoc-typehints ; extra == "dev"
31
- Requires-Dist: sphinxcontrib-napoleon ; extra == "dev"
32
- Requires-Dist: graphdatascience>=1.13 ; extra == "gds" and ( python_version >= '3.9')
33
- Project-URL: Documentation, https://neo-technology-field.github.io/python-etl-lib/index.html
34
- Project-URL: Home, https://github.com/neo-technology-field/python-etl-lib
35
- Provides-Extra: dev
36
- Provides-Extra: gds
37
-
38
- # Python ETL Toolbox
39
-
40
- Complete documentation can be found on https://neo-technology-field.github.io/python-etl-lib/index.html
41
-
42
- A library of building blocks to assemble etl pipelines.
43
-
44
- So, instead of providing yet another etl tool, the aim is to provide quality building blocks for the usual etl task. These building blocks should (do) meet the following functional requirements:
45
-
46
- * logging (of tasks performed including times, errors, and statistics)
47
- * error handling
48
- * validation of data (currently via Pydantic)
49
- * batching and streaming
50
- * optionally record the information about performed tasks and provide means (NeoDash, console) to review past etl runs.
51
-
52
- While this library currently focuses on Neo4j databases, it can be extended to other sources and sinks as needed.
53
-
54
- It does not provide a CLI out of the box, but contains a set of functions to list and manage past runs (if they are stored in a database). In addition, the provided example illustrates how to assemble an ETL pipeline and run it from a CLI.
55
-
56
- ## Quick guide
57
-
58
- ### Installation
59
-
60
- The package is available on PyPI and can be installed (for development) via:
61
-
62
- ```bash
63
- python3 -m venv venv
64
- source venv/bin/activate
65
- python -m pip install pip-tools
66
- pip-compile --extra dev pyproject.toml
67
- pip-sync
68
- ```
69
-
70
- ### Usage
71
-
72
- The example below shows a minimal ETL pipeline that loads a single CSV file (see the GTFS example for more details):
73
-
74
- ```python
75
-
76
- class LoadAgenciesTask(CSVLoad2Neo4jTasks):
77
-
78
- class Agency(BaseModel):
79
- """ Define the Pydantic model for data validation. """
80
- id: str = Field(alias="agency_id", default="generic")
81
- name: str = Field(alias="agency_name")
82
- url: str = Field(alias="agency_url")
83
- timezone: str = Field(alias="agency_timezone")
84
- lang: str = Field(alias="agency_lang")
85
-
86
- def __init__(self, context: ETLContext, file:Path):
87
- super().__init__(context, LoadAgenciesTask.Agency, file)
88
-
89
- def task_name(self) -> str:
90
- return f"{self.__class__.__name__}('{self.file}')"
91
-
92
- def _query(self):
93
- """Load the data into Neo4j."""
94
- return """ UNWIND $batch AS row
95
- MERGE (a:Agency {id: row.id})
96
- SET a.name= row.name,
97
- a.url= row.url,
98
- a.timezone= row.timezone,
99
- a.lang= row.lang
100
- """
101
-
102
- @classmethod
103
- def file_name(cls):
104
- return "agency.txt"
105
-
106
- context = ETLContext(env_vars=dict(os.environ))
107
-
108
- schema = SchemaTask(context=context)
109
- init_group = TaskGroup(context=context, tasks=[schema], name="schema-init")
110
-
111
- tasks = [
112
- LoadAgenciesTask(context=context, file=input_directory / LoadAgenciesTask.file_name()),
113
- ]
114
- csv_group = TaskGroup(context=context, tasks=tasks, name="csv-loading")
115
-
116
- all_group = TaskGroup(context=context, tasks=[init_group, csv_group], name="main")
117
-
118
- context.reporter.register_tasks(all_group)
119
-
120
- all_group.execute()
121
-
122
- ```
123
- See the provided [example](examples/gtfs/README.md) for a more realistic pipeline and for what the logging and reporting look like.
124
-
125
- With the above, all lines in the input file `agency.txt` that do not fit the Pydantic model would be sent to a JSON file containing the error data and a description of why they could not be loaded.
126
-
@@ -1,24 +0,0 @@
1
- etl_lib/__init__.py,sha256=pM_SQG2bJ2wlVTBzzIFY5eGB_DuAgTVBlDwZse7DPR0,65
2
- etl_lib/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- etl_lib/cli/run_tools.py,sha256=Z93IZ5WZFSg-DhMANGOgOK3UrZy8s75xwJPAOW9HmTo,7305
4
- etl_lib/core/BatchProcessor.py,sha256=cAOl8m5Hh0wjGgWpe1NKbN-v-_9JT9B4uQs9guPMlzs,3404
5
- etl_lib/core/ClosedLoopBatchProcessor.py,sha256=unlx_A339oi2nOOXF0irrVf8j_GFhwcTuk_w5liqbWc,1321
6
- etl_lib/core/ETLContext.py,sha256=9oeBUntFlIyAODmJopm4rv2HgTH5JLIGCR-fo9VA854,4850
7
- etl_lib/core/ProgressReporter.py,sha256=SeuA6SxjcFWyKyfKaRkjkeZBFFkty2ZjvCVVGO1cfJY,8528
8
- etl_lib/core/Task.py,sha256=vrVQTfLfRZJ3GmDDtXwBlb9Cu_oJ32VjNEvhpmw5wiQ,9444
9
- etl_lib/core/ValidationBatchProcessor.py,sha256=23uYtIVAtR755h34vkrN_QepFJnYroFKVO4xuHF922Y,3244
10
- etl_lib/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
- etl_lib/core/utils.py,sha256=Ba3yVA7brtSFiT0z6svq0L1QUnM26QOXGvffogmyGzA,351
12
- etl_lib/data_sink/CypherBatchProcessor.py,sha256=VrnTOoMJMEjy0CeMPm-Q5p0mMlc4JnwrLDK-QrvEj9U,1530
13
- etl_lib/data_sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
- etl_lib/data_source/CSVBatchProcessor.py,sha256=HNOj8YxKM9Sh2sl2V16gK5PkfIK40OYNaLXmkQGj9vs,3156
15
- etl_lib/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
- etl_lib/task/ExecuteCypherTask.py,sha256=KPZ42YrSsLJSW2zzEqjG6i_ANJC_VsX94WaWe9MdcZE,997
17
- etl_lib/task/GDSTask.py,sha256=X1E83wYa-N7AXy43WPEqIy77d__z-2wpBjWNhGNXJzA,1781
18
- etl_lib/task/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
- etl_lib/task/data_loading/CSVLoad2Neo4jTask.py,sha256=aagJCszCvfYEwvivTMX7L56cfL1uF3G9c47ZiAvYUPM,1488
20
- etl_lib/task/data_loading/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
- neo4j_etl_lib-0.0.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
22
- neo4j_etl_lib-0.0.2.dist-info/WHEEL,sha256=CpUCUxeHQbRN5UGRQHYRJorO5Af-Qy_fHMctcQ8DSGI,82
23
- neo4j_etl_lib-0.0.2.dist-info/METADATA,sha256=U9PM4oS-oFNwzd_PHc9BPY1wr5ScSuXxbxEcxOHDkn8,4961
24
- neo4j_etl_lib-0.0.2.dist-info/RECORD,,