airflow-postgres-csv 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airflow_postgres_csv-0.1.0/.github/workflows/lint.yml +19 -0
- airflow_postgres_csv-0.1.0/.github/workflows/publish.yml +44 -0
- airflow_postgres_csv-0.1.0/.github/workflows/test.yml +30 -0
- airflow_postgres_csv-0.1.0/.gitignore +17 -0
- airflow_postgres_csv-0.1.0/.pre-commit-config.yaml +7 -0
- airflow_postgres_csv-0.1.0/LICENSE +21 -0
- airflow_postgres_csv-0.1.0/PKG-INFO +95 -0
- airflow_postgres_csv-0.1.0/README.md +74 -0
- airflow_postgres_csv-0.1.0/pyproject.toml +46 -0
- airflow_postgres_csv-0.1.0/src/airflow_postgres_csv/__init__.py +6 -0
- airflow_postgres_csv-0.1.0/src/airflow_postgres_csv/operators.py +183 -0
- airflow_postgres_csv-0.1.0/tests/test_operators.py +153 -0
- airflow_postgres_csv-0.1.0/uv.lock +3323 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
name: Lint
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
lint:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
- name: Install uv
|
|
15
|
+
uses: astral-sh/setup-uv@v4
|
|
16
|
+
- name: Lint
|
|
17
|
+
run: uvx ruff check src/ tests/
|
|
18
|
+
- name: Format check
|
|
19
|
+
run: uvx ruff format --check src/ tests/
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
build:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
steps:
|
|
11
|
+
- uses: actions/checkout@v4
|
|
12
|
+
|
|
13
|
+
- name: Set up Python
|
|
14
|
+
uses: actions/setup-python@v5
|
|
15
|
+
with:
|
|
16
|
+
python-version: "3.12"
|
|
17
|
+
|
|
18
|
+
- name: Install build dependencies
|
|
19
|
+
run: pip install build
|
|
20
|
+
|
|
21
|
+
- name: Build package
|
|
22
|
+
run: python -m build
|
|
23
|
+
|
|
24
|
+
- name: Upload build artifacts
|
|
25
|
+
uses: actions/upload-artifact@v4
|
|
26
|
+
with:
|
|
27
|
+
name: dist
|
|
28
|
+
path: dist/
|
|
29
|
+
|
|
30
|
+
publish:
|
|
31
|
+
needs: build
|
|
32
|
+
runs-on: ubuntu-latest
|
|
33
|
+
environment: pypi
|
|
34
|
+
permissions:
|
|
35
|
+
id-token: write
|
|
36
|
+
steps:
|
|
37
|
+
- name: Download build artifacts
|
|
38
|
+
uses: actions/download-artifact@v4
|
|
39
|
+
with:
|
|
40
|
+
name: dist
|
|
41
|
+
path: dist/
|
|
42
|
+
|
|
43
|
+
- name: Publish to PyPI
|
|
44
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
name: Tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
|
|
18
|
+
- name: Install uv
|
|
19
|
+
uses: astral-sh/setup-uv@v4
|
|
20
|
+
|
|
21
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
22
|
+
run: uv python install ${{ matrix.python-version }}
|
|
23
|
+
|
|
24
|
+
- name: Install dependencies
|
|
25
|
+
run: |
|
|
26
|
+
uv venv .venv --python ${{ matrix.python-version }}
|
|
27
|
+
uv pip install -e ".[dev]" --python .venv/bin/python
|
|
28
|
+
|
|
29
|
+
- name: Test
|
|
30
|
+
run: .venv/bin/pytest -v
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: airflow-postgres-csv
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Airflow operators for PostgreSQL <-> CSV file transfers using COPY
|
|
5
|
+
Project-URL: Repository, https://github.com/Redevil10/airflow-postgres-csv
|
|
6
|
+
Author: Qing Wan
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Classifier: Framework :: Apache Airflow
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Requires-Python: >=3.10
|
|
13
|
+
Requires-Dist: apache-airflow-providers-postgres>=6.0.0
|
|
14
|
+
Requires-Dist: apache-airflow>=3.0.0
|
|
15
|
+
Provides-Extra: dev
|
|
16
|
+
Requires-Dist: pre-commit; extra == 'dev'
|
|
17
|
+
Requires-Dist: pytest-cov; extra == 'dev'
|
|
18
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
19
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# airflow-postgres-csv
|
|
23
|
+
|
|
24
|
+
Airflow 3 operators for bulk PostgreSQL ↔ CSV transfers using `COPY`.
|
|
25
|
+
|
|
26
|
+
## Operators
|
|
27
|
+
|
|
28
|
+
- **`PostgresToCsvOperator`** — Run a SQL query and export results to a CSV file
|
|
29
|
+
- **`CsvToPostgresOperator`** — Load a CSV file into a PostgreSQL table
|
|
30
|
+
|
|
31
|
+
Both use PostgreSQL's `COPY` command for maximum throughput.
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install airflow-postgres-csv
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Usage
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from airflow_postgres_csv import PostgresToCsvOperator, CsvToPostgresOperator
|
|
43
|
+
|
|
44
|
+
# Export query results to CSV
|
|
45
|
+
export_task = PostgresToCsvOperator(
|
|
46
|
+
task_id="export_users",
|
|
47
|
+
conn_id="my_postgres",
|
|
48
|
+
sql_query="SELECT * FROM users WHERE active = %(active)s",
|
|
49
|
+
query_params={"active": True},
|
|
50
|
+
csv_file_path="/tmp/users.csv",
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
# Load CSV into a table
|
|
54
|
+
import_task = CsvToPostgresOperator(
|
|
55
|
+
task_id="import_users",
|
|
56
|
+
conn_id="my_postgres",
|
|
57
|
+
table_name="staging.users",
|
|
58
|
+
csv_file_path="/tmp/users.csv",
|
|
59
|
+
)
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### Key parameters
|
|
63
|
+
|
|
64
|
+
**PostgresToCsvOperator:**
|
|
65
|
+
| Parameter | Description | Default |
|
|
66
|
+
|---|---|---|
|
|
67
|
+
| `conn_id` | Airflow Postgres connection ID | required |
|
|
68
|
+
| `csv_file_path` | Output file path (template-able) | required |
|
|
69
|
+
| `sql_query` | SQL to execute | `None` |
|
|
70
|
+
| `sql_file_path` | Path to `.sql` file | `None` |
|
|
71
|
+
| `query_params` | Dict passed to `cursor.mogrify` | `{}` |
|
|
72
|
+
| `has_header` | Include CSV header | `True` |
|
|
73
|
+
| `timeout` | Query timeout (minutes) | `60` |
|
|
74
|
+
|
|
75
|
+
**CsvToPostgresOperator:**
|
|
76
|
+
| Parameter | Description | Default |
|
|
77
|
+
|---|---|---|
|
|
78
|
+
| `conn_id` | Airflow Postgres connection ID | required |
|
|
79
|
+
| `table_name` | Target table (template-able, supports `schema.table`) | required |
|
|
80
|
+
| `csv_file_path` | Input file path (template-able) | required |
|
|
81
|
+
| `columns` | Explicit column list | `None` |
|
|
82
|
+
| `has_header` | CSV has header row | `True` |
|
|
83
|
+
| `delimiter` | CSV delimiter | `","` |
|
|
84
|
+
| `quote_char` | CSV quote character | `'"'` |
|
|
85
|
+
| `null_string` | String representing NULL | `""` |
|
|
86
|
+
| `timeout` | Query timeout (minutes) | `60` |
|
|
87
|
+
|
|
88
|
+
## Requirements
|
|
89
|
+
|
|
90
|
+
- Apache Airflow >= 3.0.0
|
|
91
|
+
- apache-airflow-providers-postgres >= 6.0.0
|
|
92
|
+
|
|
93
|
+
## License
|
|
94
|
+
|
|
95
|
+
MIT
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# airflow-postgres-csv
|
|
2
|
+
|
|
3
|
+
Airflow 3 operators for bulk PostgreSQL ↔ CSV transfers using `COPY`.
|
|
4
|
+
|
|
5
|
+
## Operators
|
|
6
|
+
|
|
7
|
+
- **`PostgresToCsvOperator`** — Run a SQL query and export results to a CSV file
|
|
8
|
+
- **`CsvToPostgresOperator`** — Load a CSV file into a PostgreSQL table
|
|
9
|
+
|
|
10
|
+
Both use PostgreSQL's `COPY` command for maximum throughput.
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install airflow-postgres-csv
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Usage
|
|
19
|
+
|
|
20
|
+
```python
|
|
21
|
+
from airflow_postgres_csv import PostgresToCsvOperator, CsvToPostgresOperator
|
|
22
|
+
|
|
23
|
+
# Export query results to CSV
|
|
24
|
+
export_task = PostgresToCsvOperator(
|
|
25
|
+
task_id="export_users",
|
|
26
|
+
conn_id="my_postgres",
|
|
27
|
+
sql_query="SELECT * FROM users WHERE active = %(active)s",
|
|
28
|
+
query_params={"active": True},
|
|
29
|
+
csv_file_path="/tmp/users.csv",
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
# Load CSV into a table
|
|
33
|
+
import_task = CsvToPostgresOperator(
|
|
34
|
+
task_id="import_users",
|
|
35
|
+
conn_id="my_postgres",
|
|
36
|
+
table_name="staging.users",
|
|
37
|
+
csv_file_path="/tmp/users.csv",
|
|
38
|
+
)
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### Key parameters
|
|
42
|
+
|
|
43
|
+
**PostgresToCsvOperator:**
|
|
44
|
+
| Parameter | Description | Default |
|
|
45
|
+
|---|---|---|
|
|
46
|
+
| `conn_id` | Airflow Postgres connection ID | required |
|
|
47
|
+
| `csv_file_path` | Output file path (template-able) | required |
|
|
48
|
+
| `sql_query` | SQL to execute | `None` |
|
|
49
|
+
| `sql_file_path` | Path to `.sql` file | `None` |
|
|
50
|
+
| `query_params` | Dict passed to `cursor.mogrify` | `{}` |
|
|
51
|
+
| `has_header` | Include CSV header | `True` |
|
|
52
|
+
| `timeout` | Query timeout (minutes) | `60` |
|
|
53
|
+
|
|
54
|
+
**CsvToPostgresOperator:**
|
|
55
|
+
| Parameter | Description | Default |
|
|
56
|
+
|---|---|---|
|
|
57
|
+
| `conn_id` | Airflow Postgres connection ID | required |
|
|
58
|
+
| `table_name` | Target table (template-able, supports `schema.table`) | required |
|
|
59
|
+
| `csv_file_path` | Input file path (template-able) | required |
|
|
60
|
+
| `columns` | Explicit column list | `None` |
|
|
61
|
+
| `has_header` | CSV has header row | `True` |
|
|
62
|
+
| `delimiter` | CSV delimiter | `","` |
|
|
63
|
+
| `quote_char` | CSV quote character | `'"'` |
|
|
64
|
+
| `null_string` | String representing NULL | `""` |
|
|
65
|
+
| `timeout` | Query timeout (minutes) | `60` |
|
|
66
|
+
|
|
67
|
+
## Requirements
|
|
68
|
+
|
|
69
|
+
- Apache Airflow >= 3.0.0
|
|
70
|
+
- apache-airflow-providers-postgres >= 6.0.0
|
|
71
|
+
|
|
72
|
+
## License
|
|
73
|
+
|
|
74
|
+
MIT
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "airflow-postgres-csv"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Airflow operators for PostgreSQL <-> CSV file transfers using COPY"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [{ name = "Qing Wan" }]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Framework :: Apache Airflow",
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
dependencies = [
|
|
20
|
+
"apache-airflow>=3.0.0",
|
|
21
|
+
"apache-airflow-providers-postgres>=6.0.0",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[project.optional-dependencies]
|
|
25
|
+
dev = [
|
|
26
|
+
"pytest>=8.0",
|
|
27
|
+
"pytest-cov",
|
|
28
|
+
"ruff",
|
|
29
|
+
"pre-commit",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.urls]
|
|
33
|
+
Repository = "https://github.com/Redevil10/airflow-postgres-csv"
|
|
34
|
+
|
|
35
|
+
[tool.hatch.build.targets.wheel]
|
|
36
|
+
packages = ["src/airflow_postgres_csv"]
|
|
37
|
+
|
|
38
|
+
[tool.ruff]
|
|
39
|
+
target-version = "py310"
|
|
40
|
+
line-length = 99
|
|
41
|
+
|
|
42
|
+
[tool.ruff.lint]
|
|
43
|
+
select = ["E", "F", "I", "UP"]
|
|
44
|
+
|
|
45
|
+
[tool.pytest.ini_options]
|
|
46
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""Custom Airflow operators for PostgreSQL <-> CSV file transfers."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from collections.abc import Sequence
|
|
5
|
+
|
|
6
|
+
from airflow.exceptions import AirflowException
|
|
7
|
+
from airflow.providers.postgres.hooks.postgres import PostgresHook
|
|
8
|
+
from airflow.sdk.bases.operator import BaseOperator
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class PostgresToCsvOperator(BaseOperator):
    """
    Execute a SQL query on PostgreSQL and save the result as a CSV file.

    Uses ``COPY (...) TO STDOUT WITH CSV`` for high-performance bulk export.

    :param conn_id: Airflow connection ID for the PostgreSQL database.
    :param csv_file_path: Local file path where the CSV will be saved.
    :param sql_query: SQL query to execute. Overrides *sql_file_path* if both given.
    :param sql_file_path: Path to a ``.sql`` file containing the query.
    :param query_params: Parameters passed to the SQL query via ``cursor.mogrify``.
    :param has_header: Include a CSV header row. Defaults to ``True``.
    :param timeout: Query timeout in minutes. Defaults to ``60``.
    """

    template_fields: Sequence[str] = (
        "sql_query",
        "sql_file_path",
        "csv_file_path",
    )

    def __init__(
        self,
        conn_id: str,
        csv_file_path: str,
        sql_query: str | None = None,
        sql_file_path: str | None = None,
        query_params: dict | None = None,
        has_header: bool = True,
        timeout: int = 60,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.conn_id = conn_id
        self.csv_file_path = csv_file_path
        self.sql_query = sql_query
        self.sql_file_path = sql_file_path
        self.query_params = query_params or {}
        self.has_header = has_header
        self.timeout = timeout

    def execute(self, context):
        """Run the query and stream the result to ``csv_file_path``.

        :return: The path of the written CSV file.
        :raises AirflowException: If neither *sql_query* nor *sql_file_path* is set.
        """
        if not self.sql_query and not self.sql_file_path:
            raise AirflowException("Either sql_query or sql_file_path must be provided")

        # Resolve the SQL into a local variable instead of assigning back to
        # self.sql_query: mutating a templated field during execute() leaks
        # state into retries and operator re-rendering.
        sql = self.sql_query
        if not sql:
            with open(self.sql_file_path, encoding="utf-8") as f:
                sql = f.read()

        pg_hook = PostgresHook(postgres_conn_id=self.conn_id)
        # COPY (subquery) does not accept a trailing semicolon inside the parentheses.
        cleaned_sql = sql.strip().rstrip(";")

        self.log.info("Running query and saving to CSV: %s", self.csv_file_path)

        with pg_hook.get_conn() as conn:
            with conn.cursor() as cursor:
                # statement_timeout is expressed in milliseconds; self.timeout is minutes.
                cursor.execute("SET statement_timeout = %s;", (self.timeout * 60 * 1000,))
                # mogrify performs server-side-safe parameter substitution, since
                # COPY itself cannot take bind parameters.
                formatted_sql = cursor.mogrify(cleaned_sql, self.query_params).decode("utf-8")

                header_clause = " HEADER" if self.has_header else ""
                copy_command = f"COPY ({formatted_sql}) TO STDOUT WITH CSV{header_clause}"

                rows = 0
                with open(self.csv_file_path, "w", encoding="utf-8") as csv_file:
                    cursor.copy_expert(copy_command, csv_file)
                    # rowcount reflects rows copied; may be -1 on some drivers.
                    rows = cursor.rowcount

        self.log.info(
            "CSV saved: %s (%s rows, %s)",
            self.csv_file_path,
            rows if rows >= 0 else "unknown",
            "with header" if self.has_header else "no header",
        )
        return self.csv_file_path
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class CsvToPostgresOperator(BaseOperator):
    """
    Load a CSV file into a PostgreSQL table.

    Uses ``COPY ... FROM STDIN WITH CSV`` for high-performance bulk import.

    :param conn_id: Airflow connection ID for the PostgreSQL database.
    :param table_name: Target table (may include schema, e.g. ``"myschema.mytable"``).
    :param csv_file_path: Local file path of the CSV to load.
    :param delimiter: CSV delimiter. Defaults to ``','``.
    :param quote_char: CSV quote character. Defaults to ``'"'``.
    :param null_string: String representing NULL values. Defaults to ``''``.
    :param has_header: Whether the CSV has a header row. Defaults to ``True``.
    :param columns: Explicit column list. If provided, maps CSV columns to these
        table columns and skips the file header (if present).
    :param timeout: Query timeout in minutes. Defaults to ``60``.
    """

    template_fields: Sequence[str] = ("csv_file_path", "table_name")

    def __init__(
        self,
        conn_id: str,
        table_name: str,
        csv_file_path: str,
        delimiter: str = ",",
        quote_char: str = '"',
        null_string: str = "",
        has_header: bool = True,
        columns: list[str] | None = None,
        timeout: int = 60,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.conn_id = conn_id
        self.table_name = table_name
        self.csv_file_path = csv_file_path
        self.delimiter = delimiter
        self.quote_char = quote_char
        self.null_string = null_string
        self.has_header = has_header
        self.columns = columns
        self.timeout = timeout

    def execute(self, context):
        """Bulk-load the CSV into the target table.

        :return: Number of rows loaded (or -1 if the driver does not report it).
        :raises AirflowException: If the CSV file does not exist.
        """
        if not os.path.exists(self.csv_file_path):
            raise AirflowException(f"CSV file not found: {self.csv_file_path}")

        pg_hook = PostgresHook(postgres_conn_id=self.conn_id)

        self.log.info("Loading %s into %s", self.csv_file_path, self.table_name)

        column_clause = self._build_column_clause()
        # When an explicit column list is given, the header row (if any) is
        # skipped in Python below, so COPY must not also consume it.
        header_clause = "HEADER" if self.has_header and not self.columns else ""

        # COPY options cannot be bound as parameters; escape embedded single
        # quotes so user-supplied delimiter/quote/null strings cannot break
        # out of the SQL literal (syntax error or SQL injection).
        copy_command = (
            f"COPY {self._quote_table_name()} {column_clause} "
            f"FROM STDIN WITH CSV "
            f"DELIMITER '{self._quote_literal(self.delimiter)}' "
            f"QUOTE '{self._quote_literal(self.quote_char)}' "
            f"NULL '{self._quote_literal(self.null_string)}' "
            f"{header_clause}"
        )

        with pg_hook.get_conn() as conn:
            with conn.cursor() as cursor:
                # statement_timeout is in milliseconds; self.timeout is minutes.
                cursor.execute("SET statement_timeout = %s;", (self.timeout * 60 * 1000,))
                with open(self.csv_file_path, encoding="utf-8") as csv_file:
                    if self.columns and self.has_header:
                        # Skip the header row; default of None avoids an
                        # unhandled StopIteration on an empty file.
                        next(csv_file, None)
                    cursor.copy_expert(copy_command, csv_file)
                    rows = cursor.rowcount
            conn.commit()

        self.log.info(
            "Loaded %s rows from %s into %s",
            rows if rows >= 0 else "unknown",
            self.csv_file_path,
            self.table_name,
        )
        return rows

    @staticmethod
    def _quote_literal(value: str) -> str:
        """Escape a string for embedding in a single-quoted SQL literal."""
        return value.replace("'", "''")

    @staticmethod
    def _quote_identifier(name: str) -> str:
        """Quote a SQL identifier, escaping any double quotes."""
        escaped = name.replace('"', '""')
        return f'"{escaped}"'

    def _quote_table_name(self) -> str:
        """Quote each dotted part of the table name (schema and table) separately."""
        parts = self.table_name.split(".")
        return ".".join(self._quote_identifier(p) for p in parts)

    def _build_column_clause(self) -> str:
        """Return ``(col1, col2, ...)`` for COPY, or '' when no columns were given."""
        if not self.columns:
            return ""
        cols = ", ".join(self._quote_identifier(c) for c in self.columns)
        return f"({cols})"
|