bqcsv 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bqcsv/__init__.py +1 -0
- bqcsv/cli.py +201 -0
- bqcsv/config.py +54 -0
- bqcsv/schema.py +212 -0
- bqcsv/table.py +46 -0
- bqcsv/uploader.py +193 -0
- bqcsv-1.0.0.dist-info/METADATA +137 -0
- bqcsv-1.0.0.dist-info/RECORD +11 -0
- bqcsv-1.0.0.dist-info/WHEEL +5 -0
- bqcsv-1.0.0.dist-info/entry_points.txt +2 -0
- bqcsv-1.0.0.dist-info/top_level.txt +1 -0
bqcsv/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.0.0"
|
bqcsv/cli.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from bqcsv.config import (
|
|
9
|
+
CONFIG_KEYS,
|
|
10
|
+
CONFIG_PATH,
|
|
11
|
+
load_config,
|
|
12
|
+
resolve_setting,
|
|
13
|
+
save_config,
|
|
14
|
+
unset_config,
|
|
15
|
+
)
|
|
16
|
+
from bqcsv.uploader import upload_csv
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _upload_parser() -> argparse.ArgumentParser:
|
|
20
|
+
parser = argparse.ArgumentParser(
|
|
21
|
+
prog="bqcsv",
|
|
22
|
+
description="Upload a local CSV file to BigQuery using the authenticated `bq` CLI.",
|
|
23
|
+
)
|
|
24
|
+
parser.add_argument("csv_path", type=Path, help="Path to the local CSV file to upload")
|
|
25
|
+
parser.add_argument("--project", help="GCP project ID (overrides config)")
|
|
26
|
+
parser.add_argument("--dataset", help="BigQuery dataset ID (overrides config)")
|
|
27
|
+
parser.add_argument(
|
|
28
|
+
"--table",
|
|
29
|
+
help="BigQuery table ID (overrides config; defaults to the CSV file name without extension)",
|
|
30
|
+
)
|
|
31
|
+
parser.add_argument(
|
|
32
|
+
"--replace",
|
|
33
|
+
action="store_true",
|
|
34
|
+
help="Replace the destination table instead of appending rows",
|
|
35
|
+
)
|
|
36
|
+
parser.add_argument(
|
|
37
|
+
"--no-header",
|
|
38
|
+
action="store_true",
|
|
39
|
+
help="Treat the first row as data instead of a header row",
|
|
40
|
+
)
|
|
41
|
+
parser.add_argument(
|
|
42
|
+
"--schema",
|
|
43
|
+
type=Path,
|
|
44
|
+
help="Optional JSON schema file for the table (disables autodetect)",
|
|
45
|
+
)
|
|
46
|
+
parser.add_argument(
|
|
47
|
+
"--output",
|
|
48
|
+
choices=("text", "json"),
|
|
49
|
+
default="text",
|
|
50
|
+
help="Output format: text prints progress as it runs; json prints a single JSON object at the end",
|
|
51
|
+
)
|
|
52
|
+
return parser
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _config_parser() -> argparse.ArgumentParser:
    """Build the parser for ``bqcsv config`` and its show/set/unset subcommands.

    Each subcommand stores its handler in ``func`` via ``set_defaults`` so the
    caller can dispatch with ``args.func(args)``. A subcommand is required.
    """
    root = argparse.ArgumentParser(prog="bqcsv config")
    commands = root.add_subparsers(dest="config_command", required=True)

    commands.add_parser("show", help="Show saved defaults").set_defaults(func=_run_config_show)

    # `set` takes a string value for each setting.
    setter = commands.add_parser("set", help="Save default project/dataset/table")
    for flag, help_text in (
        ("--project", "Default GCP project ID"),
        ("--dataset", "Default BigQuery dataset ID"),
        ("--table", "Default BigQuery table ID"),
    ):
        setter.add_argument(flag, help=help_text)
    setter.set_defaults(func=_run_config_set)

    # `unset` takes the same names as boolean switches.
    remover = commands.add_parser("unset", help="Remove saved defaults")
    for flag, help_text in (
        ("--project", "Remove default project"),
        ("--dataset", "Remove default dataset"),
        ("--table", "Remove default table"),
    ):
        remover.add_argument(flag, action="store_true", help=help_text)
    remover.set_defaults(func=_run_config_unset)

    return root
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def resolve_table_name(
    csv_path: Path,
    cli_table: str | None,
    config: dict[str, str],
) -> str:
    """Return the destination table name for an upload.

    The name comes from ``resolve_setting`` (command-line value, then the
    saved ``table`` setting). When that yields nothing, the stem of the
    resolved CSV path (file name without its final extension) is used.
    """
    configured = resolve_setting(cli_table, config, "table")
    return configured if configured else csv_path.expanduser().resolve().stem
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _emit_upload_result(
|
|
89
|
+
*,
|
|
90
|
+
output: str,
|
|
91
|
+
logs: list[str],
|
|
92
|
+
status: str,
|
|
93
|
+
) -> None:
|
|
94
|
+
if output == "json":
|
|
95
|
+
print(json.dumps({"logs": "\n".join(logs), "status": status}))
|
|
96
|
+
return
|
|
97
|
+
for line in logs:
|
|
98
|
+
print(line, file=sys.stderr if status == "error" else sys.stdout)
|
|
99
|
+
print(f"Status: {status}.")
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _run_upload(argv: list[str]) -> int:
    """Run the upload command and return the process exit code.

    Settings are resolved from the command line first and the saved config
    second. The result is reported through ``_emit_upload_result`` in the
    format selected by ``--output``.

    Returns:
        0 on success, 1 when the upload itself fails, 2 when the config file
        cannot be read or a required setting (project, dataset) is missing.
    """
    args = _upload_parser().parse_args(argv)
    json_output = args.output == "json"
    logs: list[str] = []

    # A malformed or unreadable config must be reported like any other error,
    # otherwise JSON mode would emit a traceback instead of a JSON object.
    # tomllib.TOMLDecodeError and UnicodeDecodeError are both ValueError subclasses.
    try:
        config = load_config()
    except (OSError, ValueError) as exc:
        logs.append(f"Could not read config file {CONFIG_PATH}: {exc}")
        _emit_upload_result(output=args.output, logs=logs, status="error")
        return 2

    csv_path = args.csv_path.expanduser().resolve()
    project = resolve_setting(args.project, config, "project")
    dataset = resolve_setting(args.dataset, config, "dataset")
    table = resolve_table_name(csv_path, args.table, config)

    missing = [
        name
        for name, value in (("project", project), ("dataset", dataset))
        if not value
    ]
    if missing:
        names = ", ".join(f"--{name}" for name in missing)
        logs.append(
            f"Missing required setting(s): {names}. "
            f"Set them on the command line or via `bqcsv config set`."
        )
        _emit_upload_result(output=args.output, logs=logs, status="error")
        return 2

    try:
        upload_csv(
            csv_path,
            project=project,
            dataset=dataset,
            table=table,
            replace=args.replace,
            skip_header=not args.no_header,
            schema_path=args.schema.expanduser().resolve() if args.schema else None,
            # Only JSON mode collects the uploader's log lines; in text mode
            # no callback is passed.
            on_log=logs.append if json_output else None,
        )
    except Exception as exc:
        # Top-level boundary: turn any failure into a reported error and exit code.
        logs.append(str(exc))
        _emit_upload_result(output=args.output, logs=logs, status="error")
        return 1

    # `project` is guaranteed non-empty here by the missing-settings check above.
    destination = f"{project}:{dataset}.{table}"
    logs.append(f"Uploaded {args.csv_path} to {destination}")
    _emit_upload_result(output=args.output, logs=logs, status="success")
    return 0
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _run_config_show(_: argparse.Namespace) -> int:
    """Print the saved defaults, or a notice when none are saved.

    Keys are printed in ``CONFIG_KEYS`` order, followed by the config file
    location. Always returns 0.
    """
    saved = load_config()
    if saved:
        for key in CONFIG_KEYS:
            if key in saved:
                print(f"{key} = {saved[key]}")
        print(f"\nConfig file: {CONFIG_PATH}")
    else:
        print(f"No config saved at {CONFIG_PATH}")
    return 0
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _run_config_set(args: argparse.Namespace) -> int:
    """Save the project/dataset/table values given on the command line.

    Only non-empty values are saved. Returns 2 (after printing a usage hint
    to stderr) when no value was supplied, otherwise 0.
    """
    updates: dict[str, str] = {}
    for key in ("project", "dataset", "table"):
        value = getattr(args, key)
        if value:
            updates[key] = value

    if updates:
        save_config(updates)
        print(f"Saved defaults to {CONFIG_PATH}")
        return 0

    print("Provide at least one of --project, --dataset, or --table.", file=sys.stderr)
    return 2
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _run_config_unset(args: argparse.Namespace) -> int:
    """Remove the saved defaults whose switches were passed.

    Returns 2 (after printing a usage hint to stderr) when no switch was
    given, otherwise 0.
    """
    selected = list(filter(lambda key: getattr(args, key), CONFIG_KEYS))
    if selected:
        unset_config(selected)
        print(f"Removed {', '.join(selected)} from {CONFIG_PATH}")
        return 0

    print("Provide at least one of --project, --dataset, or --table.", file=sys.stderr)
    return 2
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _run_config(argv: list[str]) -> int:
    """Parse ``bqcsv config ...`` arguments and dispatch to the chosen handler.

    The handler is the ``func`` default registered by ``_config_parser``; its
    return value is the exit code.
    """
    parsed = _config_parser().parse_args(argv)
    handler = parsed.func
    return handler(parsed)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point; return the process exit code.

    ``argv`` defaults to ``sys.argv[1:]``. A leading ``config`` token selects
    the config subcommands; anything else is treated as an upload.
    """
    arguments = list(sys.argv[1:]) if argv is None else list(argv)
    # Slicing keeps the comparison safe when no arguments were given.
    if arguments[:1] == ["config"]:
        return _run_config(arguments[1:])
    return _run_upload(arguments)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
# Allow running the module directly (`python -m bqcsv.cli ...`); the integer
# returned by main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
bqcsv/config.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import tomllib
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
# Directory holding bqcsv's saved defaults. Following the XDG Base Directory
# specification, an unset *or empty* XDG_CONFIG_HOME falls back to ~/.config.
# With a plain `.get(name, default)`, an empty value would produce a relative
# "bqcsv" directory in the current working directory.
CONFIG_DIR = Path(os.environ.get("XDG_CONFIG_HOME") or Path.home() / ".config") / "bqcsv"
CONFIG_PATH = CONFIG_DIR / "config.toml"

# Setting names recognised in the config file, in display order.
CONFIG_KEYS = ("project", "dataset", "table")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _ensure_config_dir() -> None:
    """Create the config directory (and missing parents) if it does not exist."""
    os.makedirs(CONFIG_DIR, exist_ok=True)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def load_config() -> dict[str, str]:
    """Read the saved defaults from the config file.

    Returns an empty dict when the config file does not exist. Only keys
    listed in ``CONFIG_KEYS`` with a truthy value are returned, each
    converted to ``str``.
    """
    if not CONFIG_PATH.is_file():
        return {}

    with CONFIG_PATH.open("rb") as handle:
        parsed = tomllib.load(handle)

    settings: dict[str, str] = {}
    for key in CONFIG_KEYS:
        raw = parsed.get(key)
        if raw:  # skips both absent keys and empty/falsy values
            settings[key] = str(raw)
    return settings
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def save_config(values: dict[str, str]) -> None:
    """Merge ``values`` into the saved defaults and rewrite the config file.

    Existing settings are kept unless overridden by ``values``. The file is
    written as one ``key = "value"`` line per setting, UTF-8 encoded.
    """
    _ensure_config_dir()
    merged = {**load_config(), **values}
    rendered = "\n".join(
        f'{key} = "{_escape_toml(value)}"' for key, value in merged.items()
    )
    CONFIG_PATH.write_text(rendered + "\n", encoding="utf-8")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def unset_config(keys: list[str]) -> None:
    """Remove ``keys`` from the saved defaults.

    Does nothing when the config file does not exist. When no settings
    remain the file is deleted; otherwise it is rewritten with the
    remaining ``key = "value"`` lines.
    """
    if not CONFIG_PATH.is_file():
        return

    dropped = set(keys)
    remaining = {key: value for key, value in load_config().items() if key not in dropped}
    if remaining:
        rendered = "\n".join(
            f'{key} = "{_escape_toml(value)}"' for key, value in remaining.items()
        )
        CONFIG_PATH.write_text(rendered + "\n", encoding="utf-8")
    else:
        CONFIG_PATH.unlink(missing_ok=True)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def resolve_setting(cli_value: str | None, config: dict[str, str], key: str) -> str | None:
    """Return the command-line value if truthy, else the saved value for ``key``.

    Returns ``None`` when neither source provides the setting.
    """
    return cli_value or config.get(key)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _escape_toml(value: str) -> str:
|
|
54
|
+
return value.replace("\\", "\\\\").replace('"', '\\"')
|
bqcsv/schema.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
import json
|
|
5
|
+
import re
|
|
6
|
+
import tempfile
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
from google.cloud import bigquery
|
|
11
|
+
|
|
12
|
+
# Single-character delimiters considered during detection: comma, semicolon, tab, pipe.
_CANDIDATE_DELIMITERS = ",;\t|"
# Delimiter used when detection finds nothing usable.
DEFAULT_FIELD_DELIMITER = ","
# Number of characters read from the start of the file for delimiter detection.
_DELIMITER_SAMPLE_SIZE = 8192
# Optional leading minus followed by digits only.
_INTEGER_RE = re.compile(r"^-?\d+$")
# Shape check for YYYY-MM-DD; it does not validate that the date exists.
_DATE_ONLY_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$")
# Lower-case spellings accepted as booleans.
_BOOLEAN_VALUES = frozenset({"true", "false"})
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class SchemaError(Exception):
    """Raised when a CSV file cannot be reconciled with a table schema."""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _delimiter_count_in_line(line: str) -> dict[str, int]:
    """Count how often each candidate delimiter occurs in ``line``."""
    tally: dict[str, int] = {}
    for candidate in _CANDIDATE_DELIMITERS:
        tally[candidate] = line.count(candidate)
    return tally
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _delimiter_from_header_line(line: str) -> str | None:
    """Pick the candidate delimiter that occurs most often in ``line``.

    Returns ``None`` when no candidate occurs at all, or when several
    candidates share the highest count (the line is ambiguous).
    """
    counts = _delimiter_count_in_line(line)
    best = max(counts.values())
    if best == 0:
        return None
    winners = [candidate for candidate, occurrences in counts.items() if occurrences == best]
    return winners[0] if len(winners) == 1 else None
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def detect_field_delimiter(csv_path: Path) -> str:
    """Guess the field delimiter of the CSV file at ``csv_path``.

    Two signals are combined: the most frequent candidate delimiter on the
    first line, and the result of ``csv.Sniffer`` on a sample from the start
    of the file. The default delimiter is returned for a blank sample or
    when neither signal yields an answer.
    """
    with csv_path.open(newline="", encoding="utf-8") as handle:
        sample = handle.read(_DELIMITER_SAMPLE_SIZE)
    if not sample.strip():
        return DEFAULT_FIELD_DELIMITER

    first_line = sample.splitlines()[0]
    header_delimiter = _delimiter_from_header_line(first_line)

    try:
        sniffed = csv.Sniffer().sniff(sample, delimiters=_CANDIDATE_DELIMITERS).delimiter
    except csv.Error:
        # The sniffer could not decide; fall back to the header-line guess.
        return header_delimiter if header_delimiter is not None else DEFAULT_FIELD_DELIMITER

    if header_delimiter is None:
        return sniffed

    # N delimiters split a line into N + 1 columns. The original code added
    # `1 if sniffed in first_line else 1`, a conditional with identical branches.
    sniffed_columns = first_line.count(sniffed) + 1
    header_columns = first_line.count(header_delimiter) + 1
    # Prefer the header-line guess only when it splits the header into more columns.
    if header_columns > sniffed_columns:
        return header_delimiter
    return sniffed
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _non_empty_values(series: pd.Series) -> pd.Series:
|
|
64
|
+
as_string = series.astype(str).str.strip()
|
|
65
|
+
return as_string[as_string != ""]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _infer_bq_type_for_series(series: pd.Series) -> str:
    """Infer a BigQuery column type name for the values in ``series``.

    Returns one of ``"STRING"``, ``"BOOLEAN"``, ``"INTEGER"``, ``"FLOAT"``,
    ``"DATE"`` or ``"TIMESTAMP"``. Empty cells are ignored by every check, so
    a column of numbers or timestamps with some blank cells keeps its type.
    A column with no non-empty values is ``"STRING"``.
    """
    values = _non_empty_values(series)
    if values.empty:
        return "STRING"

    # Typed columns: trust the pandas dtype.
    if pd.api.types.is_bool_dtype(series.dtype):
        return "BOOLEAN"
    if pd.api.types.is_integer_dtype(series.dtype):
        return "INTEGER"
    if pd.api.types.is_float_dtype(series.dtype):
        # Floats that are all whole numbers are reported as INTEGER.
        if (series.dropna() % 1 == 0).all():
            return "INTEGER"
        return "FLOAT"
    if pd.api.types.is_datetime64_any_dtype(series.dtype):
        return "TIMESTAMP"

    # Text columns: inspect the non-empty string values.
    if values.str.match(_INTEGER_RE).all():
        return "INTEGER"

    if values.str.lower().isin(_BOOLEAN_VALUES).all():
        return "BOOLEAN"

    # Values that are neither missing nor blank. The numeric and timestamp
    # checks used to run on the raw series, where a blank cell is "" (not
    # NaN) and fails to parse, turning the whole column into STRING.
    present = _non_empty_values(series.dropna())

    if not present.empty:
        numeric = pd.to_numeric(present, errors="coerce")
        if numeric.notna().all():
            if (numeric % 1 == 0).all():
                return "INTEGER"
            return "FLOAT"

    if values.str.match(_DATE_ONLY_RE).all():
        return "DATE"

    if not present.empty:
        parsed = pd.to_datetime(present, errors="coerce", format="mixed")
        if parsed.notna().all():
            return "TIMESTAMP"

    return "STRING"
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def load_schema_from_json(schema_path: Path) -> list[bigquery.SchemaField]:
    """Load a table schema from a UTF-8 JSON file.

    Each element of the decoded JSON value is passed to
    ``bigquery.SchemaField.from_api_repr``.
    """
    raw_schema = json.loads(schema_path.read_text(encoding="utf-8"))
    fields = []
    for entry in raw_schema:
        fields.append(bigquery.SchemaField.from_api_repr(entry))
    return fields
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def read_csv_dataframe(
    csv_path: Path,
    *,
    field_delimiter: str,
    skip_header: bool,
    column_names: list[str] | None = None,
) -> pd.DataFrame:
    """Read the CSV into a DataFrame of strings.

    Every column is read with ``dtype=str`` and default NA detection turned
    off. When ``skip_header`` is true the first row supplies the column
    names. Otherwise every row is data and ``column_names`` supplies the names.

    Raises:
        SchemaError: If the file has no header row and ``column_names`` is
            missing or has a different length than the CSV's column count.
    """
    frame = pd.read_csv(
        csv_path,
        sep=field_delimiter,
        encoding="utf-8",
        dtype=str,
        keep_default_na=False,
        # "infer" is pandas' default: take column names from the first row.
        header="infer" if skip_header else None,
    )
    if skip_header:
        return frame

    if column_names is None:
        raise SchemaError(
            "CSV has no header row. Create the destination table first or upload a CSV with a header."
        )
    if len(column_names) != len(frame.columns):
        raise SchemaError(
            f"CSV has {len(frame.columns)} column(s) but the destination table has "
            f"{len(column_names)} column(s)."
        )
    frame.columns = column_names
    return frame
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def format_dataframe_for_bq_load(
    dataframe: pd.DataFrame,
    schema: list[bigquery.SchemaField],
) -> pd.DataFrame:
    """Return a copy of ``dataframe`` with values normalised for loading.

    For each schema field that names a column in the frame:

    * ``TIMESTAMP`` columns are rendered as ``YYYY-MM-DD HH:MM:SS.ffffff``.
    * ``DATE`` columns are rendered as ``YYYY-MM-DD``.
    * ``INTEGER`` columns are rendered as whole numbers, with an empty
      string for values that are missing or not numeric.

    Other columns, and schema fields without a matching column, are left
    untouched. The input frame is not modified.
    """
    formatted = dataframe.copy()
    for field in schema:
        if field.name not in formatted.columns:
            continue
        column = formatted[field.name]
        if field.field_type == "TIMESTAMP":
            parsed = pd.to_datetime(column, errors="coerce", format="mixed")
            formatted[field.name] = parsed.dt.strftime("%Y-%m-%d %H:%M:%S.%f")
        elif field.field_type == "DATE":
            parsed = pd.to_datetime(column, errors="coerce", format="mixed")
            formatted[field.name] = parsed.dt.strftime("%Y-%m-%d")
        elif field.field_type == "INTEGER":
            integers = pd.to_numeric(column, errors="coerce").astype("Int64")
            # Int64 -> str would render missing values as the literal "<NA>",
            # which is not a valid integer in the CSV. Going through the
            # nullable string dtype keeps them missing so they can be blanked.
            formatted[field.name] = integers.astype("string").fillna("")
    return formatted
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def infer_bq_schema_from_csv(
    csv_path: Path,
    *,
    field_delimiter: str,
    skip_header: bool,
    column_names: list[str] | None = None,
) -> list[bigquery.SchemaField]:
    """Infer a schema from the CSV, one field per column in file order.

    The file is read with ``read_csv_dataframe`` and each column's type is
    chosen by ``_infer_bq_type_for_series``.
    """
    frame = read_csv_dataframe(
        csv_path,
        field_delimiter=field_delimiter,
        skip_header=skip_header,
        column_names=column_names,
    )

    fields = []
    for column in frame.columns:
        inferred_type = _infer_bq_type_for_series(frame[column])
        fields.append(bigquery.SchemaField(str(column), inferred_type))
    return fields
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def format_schema(schema: list[bigquery.SchemaField]) -> str:
    """Render a schema as ``name:TYPE`` pairs separated by ``", "``."""
    rendered = []
    for field in schema:
        rendered.append(f"{field.name}:{field.field_type}")
    return ", ".join(rendered)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def schemas_match(
    csv_schema: list[bigquery.SchemaField],
    table_schema: list[bigquery.SchemaField],
) -> bool:
    """Return True when both schemas have the same field names and types, in order.

    Type names are compared case-insensitively, and the standard-SQL
    spellings ``INT64``, ``FLOAT64``, ``BOOL`` and ``STRUCT`` are treated as
    equal to ``INTEGER``, ``FLOAT``, ``BOOLEAN`` and ``RECORD``. A schema file
    written with one spelling therefore still matches a table reported with
    the other.
    """
    aliases = {
        "INT64": "INTEGER",
        "FLOAT64": "FLOAT",
        "BOOL": "BOOLEAN",
        "STRUCT": "RECORD",
    }

    def canonical(field_type: str) -> str:
        upper = field_type.upper()
        return aliases.get(upper, upper)

    if len(csv_schema) != len(table_schema):
        return False
    # Lengths are equal here, so plain zip pairs every field.
    return all(
        csv_field.name == table_field.name
        and canonical(csv_field.field_type) == canonical(table_field.field_type)
        for csv_field, table_field in zip(csv_schema, table_schema)
    )
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def write_schema_file(schema: list[bigquery.SchemaField]) -> Path:
    """Write ``schema`` as JSON to a new temporary ``.json`` file.

    The file is created with ``delete=False``, so it persists after this
    function returns and the caller is responsible for removing it.
    """
    payload = [field.to_api_repr() for field in schema]
    with tempfile.NamedTemporaryFile(
        mode="w",
        suffix=".json",
        delete=False,
        encoding="utf-8",
    ) as handle:
        handle.write(json.dumps(payload))
        destination = Path(handle.name)
    return destination
|
bqcsv/table.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from google.api_core.exceptions import NotFound
|
|
4
|
+
from google.cloud import bigquery
|
|
5
|
+
|
|
6
|
+
from bqcsv.schema import format_schema, schemas_match
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TableError(Exception):
    """Raised when the destination table is incompatible with the CSV schema."""
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def table_id(*, project: str | None, dataset: str, table: str) -> str:
    """Build a dot-separated table identifier.

    Returns ``project.dataset.table`` when ``project`` is truthy, otherwise
    ``dataset.table``.
    """
    qualified = f"{dataset}.{table}"
    return f"{project}.{qualified}" if project else qualified
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def get_bq_client(project: str | None) -> bigquery.Client:
    """Create a BigQuery client, passing ``project`` only when one is given."""
    if project:
        return bigquery.Client(project=project)
    return bigquery.Client()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def ensure_table_exists(
    client: bigquery.Client,
    destination_table_id: str,
    schema: list[bigquery.SchemaField],
    *,
    replace: bool = False,
) -> list[bigquery.SchemaField]:
    """Make sure the destination table exists and return the schema to load with.

    * Table missing: it is created with ``schema`` and ``schema`` is returned.
    * Table exists and matches: the table's own schema is returned.
    * Table exists, differs, and ``replace`` is true: ``schema`` is returned.
      The table is not modified here.

    Raises:
        TableError: If the table exists with a different schema and
            ``replace`` is false.
    """
    try:
        existing = client.get_table(destination_table_id)
    except NotFound:
        client.create_table(bigquery.Table(destination_table_id, schema=schema))
        return schema

    current_schema = list(existing.schema)
    if schemas_match(schema, current_schema):
        return current_schema
    if replace:
        return schema
    raise TableError(
        "CSV schema does not match the destination table schema. "
        f"CSV: [{format_schema(schema)}]. "
        f"Table: [{format_schema(current_schema)}]. "
        "Use --replace to recreate the table."
    )
|
bqcsv/uploader.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
import subprocess
|
|
5
|
+
import tempfile
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from google.api_core.exceptions import NotFound
|
|
10
|
+
|
|
11
|
+
from bqcsv.schema import (
|
|
12
|
+
DEFAULT_FIELD_DELIMITER,
|
|
13
|
+
SchemaError,
|
|
14
|
+
detect_field_delimiter,
|
|
15
|
+
format_dataframe_for_bq_load,
|
|
16
|
+
format_schema,
|
|
17
|
+
infer_bq_schema_from_csv,
|
|
18
|
+
load_schema_from_json,
|
|
19
|
+
read_csv_dataframe,
|
|
20
|
+
write_schema_file,
|
|
21
|
+
)
|
|
22
|
+
from bqcsv.table import TableError, ensure_table_exists, get_bq_client, table_id
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class UploadError(Exception):
    """Raised when a CSV upload cannot be completed."""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def ensure_bq_available() -> str:
    """Return the path of the ``bq`` executable found on ``PATH``.

    Raises:
        UploadError: If ``bq`` cannot be found.
    """
    located = shutil.which("bq")
    if located:
        return located
    raise UploadError(
        "The `bq` CLI was not found on PATH. Install the Google Cloud SDK and run "
        "`gcloud auth login` before uploading."
    )
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _write_prepared_csv(
|
|
40
|
+
dataframe,
|
|
41
|
+
*,
|
|
42
|
+
field_delimiter: str,
|
|
43
|
+
skip_header: bool,
|
|
44
|
+
) -> Path:
|
|
45
|
+
with tempfile.NamedTemporaryFile(
|
|
46
|
+
"w",
|
|
47
|
+
suffix=".csv",
|
|
48
|
+
delete=False,
|
|
49
|
+
newline="",
|
|
50
|
+
encoding="utf-8",
|
|
51
|
+
) as handle:
|
|
52
|
+
dataframe.to_csv(
|
|
53
|
+
handle,
|
|
54
|
+
index=False,
|
|
55
|
+
sep=field_delimiter,
|
|
56
|
+
header=skip_header,
|
|
57
|
+
)
|
|
58
|
+
return Path(handle.name)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def build_load_command(
    csv_path: Path,
    *,
    project: str | None,
    dataset: str,
    table: str,
    schema_path: Path,
    replace: bool = False,
    skip_header: bool = True,
    field_delimiter: str = DEFAULT_FIELD_DELIMITER,
) -> list[str]:
    """Build the ``bq load`` argument list for a CSV upload.

    The command disables autodetect and ends with the destination
    (``project:dataset.table``, or ``dataset.table`` without a project), the
    CSV path and the schema file path.
    """
    destination = f"{dataset}.{table}"
    if project:
        destination = f"{project}:{destination}"

    project_flags = ["--project_id", project] if project else []
    optional_flags = []
    if skip_header:
        optional_flags.append("--skip_leading_rows=1")
    if replace:
        optional_flags.append("--replace")

    return [
        "bq",
        "load",
        "--source_format=CSV",
        "--noautodetect",
        *project_flags,
        f"--field_delimiter={field_delimiter}",
        *optional_flags,
        destination,
        str(csv_path),
        str(schema_path),
    ]
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _log(on_log: Callable[[str], None] | None, message: str) -> None:
|
|
88
|
+
if on_log is not None:
|
|
89
|
+
on_log(message)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def upload_csv(
    csv_path: Path,
    *,
    project: str | None,
    dataset: str,
    table: str,
    replace: bool = False,
    skip_header: bool = True,
    schema_path: Path | None = None,
    on_log: Callable[[str], None] | None = None,
) -> None:
    """Load a local CSV file into a BigQuery table with ``bq load``.

    Steps: detect the field delimiter, obtain a schema (from ``schema_path``
    or inferred from the CSV), make sure the destination table exists, write
    a normalised copy of the CSV plus the schema to temporary files, and run
    ``bq load`` on them. Temporary files are removed afterwards.

    Args:
        csv_path: CSV file to upload.
        project: GCP project ID, or ``None`` to omit it from the table
            identifier and the ``bq`` command.
        dataset: Destination dataset ID.
        table: Destination table ID.
        replace: Pass ``--replace`` to ``bq load`` and tolerate a schema
            mismatch with an existing table.
        skip_header: True when the first CSV row is a header. When False,
            column names come from the explicit schema or the existing table.
        schema_path: Optional JSON schema file; disables inference.
        on_log: Optional callback receiving progress messages. When given,
            the ``bq`` output is captured and forwarded to it; otherwise
            ``bq`` writes to the inherited stdout/stderr.

    Raises:
        UploadError: If ``bq`` is not installed, an input file is missing,
            the schema cannot be reconciled, or ``bq load`` fails.
    """
    # Keep the resolved executable so the exact program found on PATH is the
    # one that runs (a bare "bq" is not guaranteed to resolve the same way
    # on every platform).
    bq_path = ensure_bq_available()
    if not csv_path.is_file():
        raise UploadError(f"CSV file not found: {csv_path}")
    if schema_path is not None and not schema_path.is_file():
        raise UploadError(f"Schema file not found: {schema_path}")

    try:
        field_delimiter = detect_field_delimiter(csv_path)
        _log(on_log, f"Detected field delimiter: {field_delimiter!r}")
        destination_table_id = table_id(project=project, dataset=dataset, table=table)
        client = get_bq_client(project)

        explicit_schema = (
            load_schema_from_json(schema_path) if schema_path is not None else None
        )

        # Without a header row the column names must come from somewhere else.
        column_names: list[str] | None = None
        if not skip_header:
            if explicit_schema is not None:
                column_names = [field.name for field in explicit_schema]
            else:
                try:
                    existing_table = client.get_table(destination_table_id)
                except NotFound as exc:
                    raise UploadError(
                        "CSV has no header row and destination table does not exist. "
                        "Upload a CSV with a header or create the table first."
                    ) from exc
                column_names = [field.name for field in existing_table.schema]

        if explicit_schema is not None:
            csv_schema = explicit_schema
            _log(on_log, f"Using schema from {schema_path}")
        else:
            csv_schema = infer_bq_schema_from_csv(
                csv_path,
                field_delimiter=field_delimiter,
                skip_header=skip_header,
                column_names=column_names,
            )
            _log(on_log, f"Inferred schema: [{format_schema(csv_schema)}]")

        load_schema = ensure_table_exists(
            client,
            destination_table_id,
            csv_schema,
            replace=replace,
        )
        _log(on_log, f"Destination table ready: {destination_table_id}")

        dataframe = read_csv_dataframe(
            csv_path,
            field_delimiter=field_delimiter,
            skip_header=skip_header,
            column_names=column_names,
        )

        # Register each temp file as soon as it exists, so a failure while
        # writing the second one still removes the first.
        temp_paths: list[Path] = []
        try:
            prepared_csv_path = _write_prepared_csv(
                format_dataframe_for_bq_load(dataframe, load_schema),
                field_delimiter=field_delimiter,
                skip_header=skip_header,
            )
            temp_paths.append(prepared_csv_path)
            temp_schema_path = write_schema_file(load_schema)
            temp_paths.append(temp_schema_path)

            cmd = build_load_command(
                prepared_csv_path,
                project=project,
                dataset=dataset,
                table=table,
                schema_path=temp_schema_path,
                replace=replace,
                skip_header=skip_header,
                field_delimiter=field_delimiter,
            )
            cmd[0] = bq_path
            _log(on_log, f"Running: {' '.join(cmd)}")
            try:
                if on_log is not None:
                    result = subprocess.run(cmd, check=True, capture_output=True, text=True)
                    if result.stdout.strip():
                        _log(on_log, result.stdout.rstrip())
                    if result.stderr.strip():
                        _log(on_log, result.stderr.rstrip())
                else:
                    subprocess.run(cmd, check=True)
            except subprocess.CalledProcessError as exc:
                # When output was captured, forward it: it holds bq's actual
                # error message, which would otherwise be lost. Both are None
                # when bq wrote directly to the terminal.
                for captured in (exc.stdout, exc.stderr):
                    if captured and captured.strip():
                        _log(on_log, captured.rstrip())
                raise UploadError(f"`bq load` failed with exit code {exc.returncode}") from exc
        finally:
            for temp_path in temp_paths:
                temp_path.unlink(missing_ok=True)
    except (SchemaError, TableError) as exc:
        raise UploadError(str(exc)) from exc
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: bqcsv
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Upload a local CSV file to a BigQuery table via the bq CLI
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: google-cloud-bigquery>=3.0
|
|
8
|
+
Requires-Dist: pandas>=2.0
|
|
9
|
+
|
|
10
|
+
# bqcsv
|
|
11
|
+
|
|
12
|
+
Upload a local CSV file to BigQuery using the `bq` CLI and your existing `gcloud` authentication.
|
|
13
|
+
|
|
14
|
+
## Why a dedicated CLI tool?
|
|
15
|
+
|
|
16
|
+
Out of the box, Google's `bq` CLI cannot create a table with column names inferred from a CSV file.
|
|
17
|
+
|
|
18
|
+
`bqcsv` fixes that:
|
|
19
|
+
|
|
20
|
+
* detects the schema from the CSV file
|
|
21
|
+
* creates a table with proper column names and types
|
|
22
|
+
* loads the CSV file using `bq load`
|
|
23
|
+
|
|
24
|
+
## Authentication
|
|
25
|
+
|
|
26
|
+
No additional authentication is needed.
|
|
27
|
+
|
|
28
|
+
`bqcsv` uses your existing authentication via `gcloud auth login`.
|
|
29
|
+
|
|
30
|
+
## Requirements
|
|
31
|
+
|
|
32
|
+
- Python 3.11+ (`bqcsv` imports the standard-library `tomllib` module, which was added in Python 3.11)
|
|
33
|
+
- [Google Cloud SDK](https://cloud.google.com/sdk) with `bq` on your `PATH`
|
|
34
|
+
|
|
35
|
+
## How to use `bqcsv`
|
|
36
|
+
|
|
37
|
+
### Upload a CSV file to a table
|
|
38
|
+
|
|
39
|
+
To upload a CSV file, specify your project ID, dataset ID, and table name:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
bqcsv data.csv --project my-gcp-project --dataset staging --table events_raw
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
The `--table` argument is optional. By default, `bqcsv` derives the table name from the CSV file:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
bqcsv data.csv --project my-gcp-project --dataset staging
|
|
49
|
+
|
|
50
|
+
# is identical to
|
|
51
|
+
|
|
52
|
+
bqcsv data.csv --project my-gcp-project --dataset staging --table data
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Saving your configuration
|
|
56
|
+
|
|
57
|
+
To avoid passing `--project`, `--dataset`, or `--table` on every run, save them to your local config:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
bqcsv config set --project my-gcp-project --dataset analytics --table events
|
|
61
|
+
bqcsv config show
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Defaults are stored in `~/.config/bqcsv/config.toml`.
|
|
65
|
+
|
|
66
|
+
After you set your defaults, you can call `bqcsv` with only the path to the CSV file:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
bqcsv data.csv
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
If you have not set a default `--table` value, the table name is derived from the CSV file.
|
|
73
|
+
|
|
74
|
+
## Development
|
|
75
|
+
|
|
76
|
+
### Install from your local repo
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
pip install -e .
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Testing
|
|
83
|
+
|
|
84
|
+
To delete a test table, use `bq`:
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
bq rm -f -t PROJECT_ID:DATASET_ID.TABLE_NAME
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
You can run the module directly when working on a new feature or fixing a bug:
|
|
91
|
+
|
|
92
|
+
```sh
|
|
93
|
+
python -m bqcsv.cli config set --project PROJECT_ID --dataset DATASET_ID --table TEST_TABLE_NAME
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Releasing to PyPI
|
|
97
|
+
|
|
98
|
+
1. **Bump the version** in both places (they must match):
|
|
99
|
+
- `pyproject.toml` → `[project].version`
|
|
100
|
+
- `bqcsv/__init__.py` → `__version__`
|
|
101
|
+
|
|
102
|
+
2. **Install build tools** (one-time):
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
pip install build twine
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
3. **Run tests** and commit the version bump.
|
|
109
|
+
|
|
110
|
+
4. **Build the package**:
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
python -m build
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
This creates `dist/bqcsv-<version>.tar.gz` and `dist/bqcsv-<version>-py3-none-any.whl`.
|
|
117
|
+
|
|
118
|
+
5. **Upload to PyPI**:
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
twine upload dist/*
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
On first upload, create an account at [pypi.org](https://pypi.org) and use an [API token](https://pypi.org/help/#apitoken) as the password (`__token__` as the username).
|
|
125
|
+
|
|
126
|
+
6. **Tag the release** (optional but recommended):
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
git tag v1.0.0
git push origin v1.0.0
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
After publishing, users can install the new version with:
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
pip install --upgrade bqcsv
|
|
137
|
+
```
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
bqcsv/__init__.py,sha256=J-j-u0itpEFT6irdmWmixQqYMadNl1X91TxUmoiLHMI,22
|
|
2
|
+
bqcsv/cli.py,sha256=drFb5yv_2R-C2Pkzz5hu7goN72EQCQ8MWgg8DrdycMg,6425
|
|
3
|
+
bqcsv/config.py,sha256=TA2k1eeVa041S777RTM9fbyBW-QP_sjgt4IApVOPf8Y,1569
|
|
4
|
+
bqcsv/schema.py,sha256=TdNeatvKCONZmOd8PSYNE4UPCjz6fCFE-VuS5cI9G4Q,6602
|
|
5
|
+
bqcsv/table.py,sha256=rfoA0v3tyUzCKhgsN0BQMBWdLDyZVH-eUpfxSRajM7s,1338
|
|
6
|
+
bqcsv/uploader.py,sha256=RxAPwWwAunh7HAGrPU0lHcmklznIJ0gPuhWRV0P3Nhk,6111
|
|
7
|
+
bqcsv-1.0.0.dist-info/METADATA,sha256=eMXIhk6WpbFYYb8HyzmnCLCCuQ_4cNRdvtesBXtv69k,3139
|
|
8
|
+
bqcsv-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
9
|
+
bqcsv-1.0.0.dist-info/entry_points.txt,sha256=3p_cZqU8eHNBnnkGSKkdbDXdXUEVL19gPOJeZGQ6hE8,41
|
|
10
|
+
bqcsv-1.0.0.dist-info/top_level.txt,sha256=iVTsmLuF2tHvlgDaGBA-NOFo8UwVRgUETjKPVYlWfHU,6
|
|
11
|
+
bqcsv-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
bqcsv
|