airflow-postgres-csv 0.1.0 (tar.gz)

This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
@@ -0,0 +1,19 @@
+ name: Lint
+
+ on:
+   push:
+     branches: [main]
+   pull_request:
+     branches: [main]
+
+ jobs:
+   lint:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v4
+       - name: Install uv
+         uses: astral-sh/setup-uv@v4
+       - name: Lint
+         run: uvx ruff check src/ tests/
+       - name: Format check
+         run: uvx ruff format --check src/ tests/
@@ -0,0 +1,44 @@
+ name: Publish to PyPI
+
+ on:
+   release:
+     types: [published]
+
+ jobs:
+   build:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v4
+
+       - name: Set up Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: "3.12"
+
+       - name: Install build dependencies
+         run: pip install build
+
+       - name: Build package
+         run: python -m build
+
+       - name: Upload build artifacts
+         uses: actions/upload-artifact@v4
+         with:
+           name: dist
+           path: dist/
+
+   publish:
+     needs: build
+     runs-on: ubuntu-latest
+     environment: pypi
+     permissions:
+       id-token: write
+     steps:
+       - name: Download build artifacts
+         uses: actions/download-artifact@v4
+         with:
+           name: dist
+           path: dist/
+
+       - name: Publish to PyPI
+         uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,30 @@
+ name: Tests
+
+ on:
+   push:
+     branches: [main]
+   pull_request:
+     branches: [main]
+
+ jobs:
+   test:
+     runs-on: ubuntu-latest
+     strategy:
+       matrix:
+         python-version: ["3.10", "3.11", "3.12"]
+     steps:
+       - uses: actions/checkout@v4
+
+       - name: Install uv
+         uses: astral-sh/setup-uv@v4
+
+       - name: Set up Python ${{ matrix.python-version }}
+         run: uv python install ${{ matrix.python-version }}
+
+       - name: Install dependencies
+         run: |
+           uv venv .venv --python ${{ matrix.python-version }}
+           uv pip install -e ".[dev]" --python .venv/bin/python
+
+       - name: Test
+         run: .venv/bin/pytest -v
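
The Tests workflow above installs the package with its `dev` extra and runs `pytest`. The test suite itself is not part of this diff; a hypothetical unit test for the quoting helpers in `operators.py` (all names and values here are illustrative, not taken from the package) could look like:

```python
# Hypothetical tests/test_quoting.py -- the package's real test suite is not shown here.
from airflow_postgres_csv import CsvToPostgresOperator


def test_table_and_column_quoting():
    # Instantiated outside a DAG, which is enough to exercise the pure helper methods.
    op = CsvToPostgresOperator(
        task_id="load_users",
        conn_id="my_postgres",
        table_name="staging.users",
        csv_file_path="/tmp/users.csv",
        columns=["id", "name"],
    )
    assert op._quote_table_name() == '"staging"."users"'
    assert op._build_column_clause() == '("id", "name")'
```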
@@ -0,0 +1,17 @@
+ __pycache__/
+ *.py[cod]
+ *.egg-info/
+ dist/
+ build/
+ .eggs/
+ .venv/
+ .pytest_cache/
+ .ruff_cache/
+ .python-version
+ *.csv
+
+ # pycharm
+ /.idea/
+
+ # macOS
+ **/.DS_Store
@@ -0,0 +1,7 @@
+ repos:
+   - repo: https://github.com/astral-sh/ruff-pre-commit
+     rev: v0.9.6
+     hooks:
+       - id: ruff
+         args: [--fix]
+       - id: ruff-format
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,95 @@
+ Metadata-Version: 2.4
+ Name: airflow-postgres-csv
+ Version: 0.1.0
+ Summary: Airflow operators for PostgreSQL <-> CSV file transfers using COPY
+ Project-URL: Repository, https://github.com/Redevil10/airflow-postgres-csv
+ Author: Qing Wan
+ License-Expression: MIT
+ License-File: LICENSE
+ Classifier: Framework :: Apache Airflow
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Requires-Python: >=3.10
+ Requires-Dist: apache-airflow-providers-postgres>=6.0.0
+ Requires-Dist: apache-airflow>=3.0.0
+ Provides-Extra: dev
+ Requires-Dist: pre-commit; extra == 'dev'
+ Requires-Dist: pytest-cov; extra == 'dev'
+ Requires-Dist: pytest>=8.0; extra == 'dev'
+ Requires-Dist: ruff; extra == 'dev'
+ Description-Content-Type: text/markdown
+
+ # airflow-postgres-csv
+
+ Airflow 3 operators for bulk PostgreSQL ↔ CSV transfers using `COPY`.
+
+ ## Operators
+
+ - **`PostgresToCsvOperator`** — Run a SQL query and export results to a CSV file
+ - **`CsvToPostgresOperator`** — Load a CSV file into a PostgreSQL table
+
+ Both use PostgreSQL's `COPY` command for maximum throughput.
+
+ ## Installation
+
+ ```bash
+ pip install airflow-postgres-csv
+ ```
+
+ ## Usage
+
+ ```python
+ from airflow_postgres_csv import PostgresToCsvOperator, CsvToPostgresOperator
+
+ # Export query results to CSV
+ export_task = PostgresToCsvOperator(
+     task_id="export_users",
+     conn_id="my_postgres",
+     sql_query="SELECT * FROM users WHERE active = %(active)s",
+     query_params={"active": True},
+     csv_file_path="/tmp/users.csv",
+ )
+
+ # Load CSV into a table
+ import_task = CsvToPostgresOperator(
+     task_id="import_users",
+     conn_id="my_postgres",
+     table_name="staging.users",
+     csv_file_path="/tmp/users.csv",
+ )
+ ```
+
+ ### Key parameters
+
+ **PostgresToCsvOperator:**
+ | Parameter | Description | Default |
+ |---|---|---|
+ | `conn_id` | Airflow Postgres connection ID | required |
+ | `csv_file_path` | Output file path (template-able) | required |
+ | `sql_query` | SQL to execute | `None` |
+ | `sql_file_path` | Path to `.sql` file | `None` |
+ | `query_params` | Dict passed to `cursor.mogrify` | `{}` |
+ | `has_header` | Include CSV header | `True` |
+ | `timeout` | Query timeout (minutes) | `60` |
+
+ **CsvToPostgresOperator:**
+ | Parameter | Description | Default |
+ |---|---|---|
+ | `conn_id` | Airflow Postgres connection ID | required |
+ | `table_name` | Target table (template-able, supports `schema.table`) | required |
+ | `csv_file_path` | Input file path (template-able) | required |
+ | `columns` | Explicit column list | `None` |
+ | `has_header` | CSV has header row | `True` |
+ | `delimiter` | CSV delimiter | `","` |
+ | `quote_char` | CSV quote character | `'"'` |
+ | `null_string` | String representing NULL | `""` |
+ | `timeout` | Query timeout (minutes) | `60` |
+
+ ## Requirements
+
+ - Apache Airflow >= 3.0.0
+ - apache-airflow-providers-postgres >= 6.0.0
+
+ ## License
+
+ MIT
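
The parameter table above also documents a `sql_file_path` alternative to `sql_query` and marks `csv_file_path` as a template field. A minimal sketch of that variant follows; the connection ID, paths, parameter values, and the `{{ ds }}` macro are illustrative assumptions, not package defaults:

```python
from airflow_postgres_csv import PostgresToCsvOperator

# Hypothetical: the query lives in a .sql file containing a %(run_date)s placeholder,
# and the output path is rendered by Airflow templating (csv_file_path is a template field).
export_daily = PostgresToCsvOperator(
    task_id="export_daily_orders",
    conn_id="my_postgres",
    sql_file_path="sql/daily_orders.sql",
    query_params={"run_date": "2025-01-01"},
    csv_file_path="/tmp/orders_{{ ds }}.csv",
)
```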
@@ -0,0 +1,74 @@
+ # airflow-postgres-csv
+
+ Airflow 3 operators for bulk PostgreSQL ↔ CSV transfers using `COPY`.
+
+ ## Operators
+
+ - **`PostgresToCsvOperator`** — Run a SQL query and export results to a CSV file
+ - **`CsvToPostgresOperator`** — Load a CSV file into a PostgreSQL table
+
+ Both use PostgreSQL's `COPY` command for maximum throughput.
+
+ ## Installation
+
+ ```bash
+ pip install airflow-postgres-csv
+ ```
+
+ ## Usage
+
+ ```python
+ from airflow_postgres_csv import PostgresToCsvOperator, CsvToPostgresOperator
+
+ # Export query results to CSV
+ export_task = PostgresToCsvOperator(
+     task_id="export_users",
+     conn_id="my_postgres",
+     sql_query="SELECT * FROM users WHERE active = %(active)s",
+     query_params={"active": True},
+     csv_file_path="/tmp/users.csv",
+ )
+
+ # Load CSV into a table
+ import_task = CsvToPostgresOperator(
+     task_id="import_users",
+     conn_id="my_postgres",
+     table_name="staging.users",
+     csv_file_path="/tmp/users.csv",
+ )
+ ```
+
+ ### Key parameters
+
+ **PostgresToCsvOperator:**
+ | Parameter | Description | Default |
+ |---|---|---|
+ | `conn_id` | Airflow Postgres connection ID | required |
+ | `csv_file_path` | Output file path (template-able) | required |
+ | `sql_query` | SQL to execute | `None` |
+ | `sql_file_path` | Path to `.sql` file | `None` |
+ | `query_params` | Dict passed to `cursor.mogrify` | `{}` |
+ | `has_header` | Include CSV header | `True` |
+ | `timeout` | Query timeout (minutes) | `60` |
+
+ **CsvToPostgresOperator:**
+ | Parameter | Description | Default |
+ |---|---|---|
+ | `conn_id` | Airflow Postgres connection ID | required |
+ | `table_name` | Target table (template-able, supports `schema.table`) | required |
+ | `csv_file_path` | Input file path (template-able) | required |
+ | `columns` | Explicit column list | `None` |
+ | `has_header` | CSV has header row | `True` |
+ | `delimiter` | CSV delimiter | `","` |
+ | `quote_char` | CSV quote character | `'"'` |
+ | `null_string` | String representing NULL | `""` |
+ | `timeout` | Query timeout (minutes) | `60` |
+
+ ## Requirements
+
+ - Apache Airflow >= 3.0.0
+ - apache-airflow-providers-postgres >= 6.0.0
+
+ ## License
+
+ MIT
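
The `CsvToPostgresOperator` table above documents `columns`, `delimiter`, and `null_string`. A sketch of a load that maps CSV columns onto an explicit column list; the table name, file path, and values are illustrative assumptions:

```python
from airflow_postgres_csv import CsvToPostgresOperator

# Hypothetical semicolon-delimited file whose header row is skipped because
# an explicit column list is supplied.
load_events = CsvToPostgresOperator(
    task_id="load_events",
    conn_id="my_postgres",
    table_name="staging.events",
    csv_file_path="/tmp/events.csv",
    columns=["event_id", "event_time", "payload"],
    delimiter=";",
    null_string="NULL",
)
```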
@@ -0,0 +1,46 @@
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [project]
+ name = "airflow-postgres-csv"
+ version = "0.1.0"
+ description = "Airflow operators for PostgreSQL <-> CSV file transfers using COPY"
+ readme = "README.md"
+ license = "MIT"
+ requires-python = ">=3.10"
+ authors = [{ name = "Qing Wan" }]
+ classifiers = [
+     "Framework :: Apache Airflow",
+     "Programming Language :: Python :: 3",
+     "License :: OSI Approved :: MIT License",
+ ]
+
+ dependencies = [
+     "apache-airflow>=3.0.0",
+     "apache-airflow-providers-postgres>=6.0.0",
+ ]
+
+ [project.optional-dependencies]
+ dev = [
+     "pytest>=8.0",
+     "pytest-cov",
+     "ruff",
+     "pre-commit",
+ ]
+
+ [project.urls]
+ Repository = "https://github.com/Redevil10/airflow-postgres-csv"
+
+ [tool.hatch.build.targets.wheel]
+ packages = ["src/airflow_postgres_csv"]
+
+ [tool.ruff]
+ target-version = "py310"
+ line-length = 99
+
+ [tool.ruff.lint]
+ select = ["E", "F", "I", "UP"]
+
+ [tool.pytest.ini_options]
+ testpaths = ["tests"]
@@ -0,0 +1,6 @@
+ """Airflow operators for PostgreSQL <-> CSV file transfers using COPY."""
+
+ from airflow_postgres_csv.operators import CsvToPostgresOperator, PostgresToCsvOperator
+
+ __all__ = ["PostgresToCsvOperator", "CsvToPostgresOperator"]
+ __version__ = "0.1.0"
@@ -0,0 +1,183 @@
+ """Custom Airflow operators for PostgreSQL <-> CSV file transfers."""
+
+ import os
+ from collections.abc import Sequence
+
+ from airflow.exceptions import AirflowException
+ from airflow.providers.postgres.hooks.postgres import PostgresHook
+ from airflow.sdk.bases.operator import BaseOperator
+
+
+ class PostgresToCsvOperator(BaseOperator):
+     """
+     Execute a SQL query on PostgreSQL and save the result as a CSV file.
+
+     Uses ``COPY (...) TO STDOUT WITH CSV`` for high-performance bulk export.
+
+     :param conn_id: Airflow connection ID for the PostgreSQL database.
+     :param csv_file_path: Local file path where the CSV will be saved.
+     :param sql_query: SQL query to execute. Overrides *sql_file_path* if both given.
+     :param sql_file_path: Path to a ``.sql`` file containing the query.
+     :param query_params: Parameters passed to the SQL query via ``cursor.mogrify``.
+     :param has_header: Include a CSV header row. Defaults to ``True``.
+     :param timeout: Query timeout in minutes. Defaults to ``60``.
+     """
+
+     template_fields: Sequence[str] = (
+         "sql_query",
+         "sql_file_path",
+         "csv_file_path",
+     )
+
+     def __init__(
+         self,
+         conn_id: str,
+         csv_file_path: str,
+         sql_query: str | None = None,
+         sql_file_path: str | None = None,
+         query_params: dict | None = None,
+         has_header: bool = True,
+         timeout: int = 60,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self.conn_id = conn_id
+         self.csv_file_path = csv_file_path
+         self.sql_query = sql_query
+         self.sql_file_path = sql_file_path
+         self.query_params = query_params or {}
+         self.has_header = has_header
+         self.timeout = timeout
+
+     def execute(self, context):
+         if not self.sql_query and not self.sql_file_path:
+             raise AirflowException("Either sql_query or sql_file_path must be provided")
+
+         if not self.sql_query:
+             with open(self.sql_file_path, encoding="utf-8") as f:
+                 self.sql_query = f.read()
+
+         pg_hook = PostgresHook(postgres_conn_id=self.conn_id)
+         cleaned_sql = self.sql_query.strip().rstrip(";")
+
+         self.log.info("Running query and saving to CSV: %s", self.csv_file_path)
+
+         with pg_hook.get_conn() as conn:
+             with conn.cursor() as cursor:
+                 cursor.execute("SET statement_timeout = %s;", (self.timeout * 60 * 1000,))
+                 formatted_sql = cursor.mogrify(cleaned_sql, self.query_params).decode("utf-8")
+
+                 header_clause = " HEADER" if self.has_header else ""
+                 copy_command = f"COPY ({formatted_sql}) TO STDOUT WITH CSV{header_clause}"
+
+                 rows = 0
+                 with open(self.csv_file_path, "w", encoding="utf-8") as csv_file:
+                     cursor.copy_expert(copy_command, csv_file)
+                     rows = cursor.rowcount
+
+         self.log.info(
+             "CSV saved: %s (%s rows, %s)",
+             self.csv_file_path,
+             rows if rows >= 0 else "unknown",
+             "with header" if self.has_header else "no header",
+         )
+         return self.csv_file_path
+
+
+ class CsvToPostgresOperator(BaseOperator):
+     """
+     Load a CSV file into a PostgreSQL table.
+
+     Uses ``COPY ... FROM STDIN WITH CSV`` for high-performance bulk import.
+
+     :param conn_id: Airflow connection ID for the PostgreSQL database.
+     :param table_name: Target table (may include schema, e.g. ``"myschema.mytable"``).
+     :param csv_file_path: Local file path of the CSV to load.
+     :param delimiter: CSV delimiter. Defaults to ``','``.
+     :param quote_char: CSV quote character. Defaults to ``'"'``.
+     :param null_string: String representing NULL values. Defaults to ``''``.
+     :param has_header: Whether the CSV has a header row. Defaults to ``True``.
+     :param columns: Explicit column list. If provided, maps CSV columns to these
+         table columns and skips the file header (if present).
+     :param timeout: Query timeout in minutes. Defaults to ``60``.
+     """
+
+     template_fields: Sequence[str] = ("csv_file_path", "table_name")
+
+     def __init__(
+         self,
+         conn_id: str,
+         table_name: str,
+         csv_file_path: str,
+         delimiter: str = ",",
+         quote_char: str = '"',
+         null_string: str = "",
+         has_header: bool = True,
+         columns: list[str] | None = None,
+         timeout: int = 60,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self.conn_id = conn_id
+         self.table_name = table_name
+         self.csv_file_path = csv_file_path
+         self.delimiter = delimiter
+         self.quote_char = quote_char
+         self.null_string = null_string
+         self.has_header = has_header
+         self.columns = columns
+         self.timeout = timeout
+
+     def execute(self, context):
+         if not os.path.exists(self.csv_file_path):
+             raise AirflowException(f"CSV file not found: {self.csv_file_path}")
+
+         pg_hook = PostgresHook(postgres_conn_id=self.conn_id)
+
+         self.log.info("Loading %s into %s", self.csv_file_path, self.table_name)
+
+         column_clause = self._build_column_clause()
+         header_clause = "HEADER" if self.has_header and not self.columns else ""
+
+         copy_command = (
+             f"COPY {self._quote_table_name()} {column_clause} "
+             f"FROM STDIN WITH CSV "
+             f"DELIMITER '{self.delimiter}' "
+             f"QUOTE '{self.quote_char}' "
+             f"NULL '{self.null_string}' "
+             f"{header_clause}"
+         )
+
+         with pg_hook.get_conn() as conn:
+             with conn.cursor() as cursor:
+                 cursor.execute("SET statement_timeout = %s;", (self.timeout * 60 * 1000,))
+                 with open(self.csv_file_path, encoding="utf-8") as csv_file:
+                     if self.columns and self.has_header:
+                         next(csv_file)
+                     cursor.copy_expert(copy_command, csv_file)
+                 rows = cursor.rowcount
+             conn.commit()
+
+         self.log.info(
+             "Loaded %s rows from %s into %s",
+             rows if rows >= 0 else "unknown",
+             self.csv_file_path,
+             self.table_name,
+         )
+         return rows
+
+     @staticmethod
+     def _quote_identifier(name: str) -> str:
+         """Quote a SQL identifier, escaping any double quotes."""
+         escaped = name.replace('"', '""')
+         return f'"{escaped}"'
+
+     def _quote_table_name(self) -> str:
+         parts = self.table_name.split(".")
+         return ".".join(self._quote_identifier(p) for p in parts)
+
+     def _build_column_clause(self) -> str:
+         if not self.columns:
+             return ""
+         cols = ", ".join(self._quote_identifier(c) for c in self.columns)
+         return f"({cols})"