bqcsv 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bqcsv/__init__.py ADDED
@@ -0,0 +1 @@
1
"""bqcsv: upload a local CSV file to a BigQuery table via the ``bq`` CLI."""

# Package version string exposed as ``bqcsv.__version__``.
__version__ = "1.0.0"
bqcsv/cli.py ADDED
@@ -0,0 +1,201 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ from bqcsv.config import (
9
+ CONFIG_KEYS,
10
+ CONFIG_PATH,
11
+ load_config,
12
+ resolve_setting,
13
+ save_config,
14
+ unset_config,
15
+ )
16
+ from bqcsv.uploader import upload_csv
17
+
18
+
19
def _upload_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the default (upload) command.

    Returns:
        A parser with one positional ``csv_path`` plus the optional
        ``--project``, ``--dataset``, ``--table``, ``--replace``,
        ``--no-header``, ``--schema`` and ``--output`` options.
    """
    upload = argparse.ArgumentParser(
        prog="bqcsv",
        description="Upload a local CSV file to BigQuery using the authenticated `bq` CLI.",
    )
    add = upload.add_argument

    add("csv_path", type=Path, help="Path to the local CSV file to upload")

    # Plain string options; each one overrides the saved config value.
    string_options = (
        ("--project", "GCP project ID (overrides config)"),
        ("--dataset", "BigQuery dataset ID (overrides config)"),
        (
            "--table",
            "BigQuery table ID (overrides config; defaults to the CSV file name without extension)",
        ),
    )
    for flag, help_text in string_options:
        add(flag, help=help_text)

    # Boolean switches, off by default.
    switches = (
        ("--replace", "Replace the destination table instead of appending rows"),
        ("--no-header", "Treat the first row as data instead of a header row"),
    )
    for flag, help_text in switches:
        add(flag, action="store_true", help=help_text)

    add(
        "--schema",
        type=Path,
        help="Optional JSON schema file for the table (disables autodetect)",
    )
    add(
        "--output",
        choices=("text", "json"),
        default="text",
        help="Output format: text prints progress as it runs; json prints a single JSON object at the end",
    )
    return upload
53
+
54
+
55
def _config_parser() -> argparse.ArgumentParser:
    """Build the parser for ``bqcsv config`` and its show/set/unset sub-commands.

    Each sub-command stores its handler in ``func`` so the caller can
    dispatch with ``args.func(args)``.
    """
    root = argparse.ArgumentParser(prog="bqcsv config")
    commands = root.add_subparsers(dest="config_command", required=True)

    show = commands.add_parser("show", help="Show saved defaults")
    show.set_defaults(func=_run_config_show)

    # `set` takes a value for each key it should save.
    setter = commands.add_parser("set", help="Save default project/dataset/table")
    for flag, help_text in (
        ("--project", "Default GCP project ID"),
        ("--dataset", "Default BigQuery dataset ID"),
        ("--table", "Default BigQuery table ID"),
    ):
        setter.add_argument(flag, help=help_text)
    setter.set_defaults(func=_run_config_set)

    # `unset` only needs to know which keys to drop, so its options are flags.
    remover = commands.add_parser("unset", help="Remove saved defaults")
    for flag, help_text in (
        ("--project", "Remove default project"),
        ("--dataset", "Remove default dataset"),
        ("--table", "Remove default table"),
    ):
        remover.add_argument(flag, action="store_true", help=help_text)
    remover.set_defaults(func=_run_config_unset)

    return root
75
+
76
+
77
def resolve_table_name(
    csv_path: Path,
    cli_table: str | None,
    config: dict[str, str],
) -> str:
    """Pick the destination table name.

    Args:
        csv_path: Path of the CSV being uploaded.
        cli_table: Value of ``--table``, if given.
        config: Saved defaults.

    Returns:
        The command-line or configured table name when one is set, otherwise
        the CSV file name without its extension.
    """
    configured = resolve_setting(cli_table, config, "table")
    return configured if configured else csv_path.expanduser().resolve().stem
86
+
87
+
88
def _emit_upload_result(
    *,
    output: str,
    logs: list[str],
    status: str,
) -> None:
    """Print the outcome of an upload in the requested format.

    Args:
        output: ``"json"`` prints one JSON object with ``logs`` (joined by
            newlines) and ``status``; any other value prints plain text.
        logs: Collected log lines.
        status: Outcome label, e.g. ``"success"`` or ``"error"``.
    """
    if output != "json":
        # Log lines go to stderr on error; the status line always goes to stdout.
        stream = sys.stderr if status == "error" else sys.stdout
        for entry in logs:
            print(entry, file=stream)
        print(f"Status: {status}.")
        return

    payload = {"logs": "\n".join(logs), "status": status}
    print(json.dumps(payload))
100
+
101
+
102
def _run_upload(argv: list[str]) -> int:
    """Run the upload command.

    Args:
        argv: Command-line arguments without the program name.

    Returns:
        ``0`` on success, ``2`` when project or dataset is missing, and ``1``
        when the upload raises.
    """
    args = _upload_parser().parse_args(argv)
    config = load_config()
    csv_path = args.csv_path.expanduser().resolve()
    project = resolve_setting(args.project, config, "project")
    dataset = resolve_setting(args.dataset, config, "dataset")
    table = resolve_table_name(csv_path, args.table, config)
    logs: list[str] = []

    def finish(status: str, exit_code: int) -> int:
        _emit_upload_result(output=args.output, logs=logs, status=status)
        return exit_code

    required = {"project": project, "dataset": dataset}
    missing = [name for name, value in required.items() if not value]
    if missing:
        names = ", ".join(f"--{name}" for name in missing)
        logs.append(
            f"Missing required setting(s): {names}. "
            f"Set them on the command line or via `bqcsv config set`."
        )
        return finish("error", 2)

    # Log lines are only collected in JSON mode; text mode passes no callback.
    collect_logs = args.output == "json"
    try:
        upload_csv(
            csv_path,
            project=project,
            dataset=dataset,
            table=table,
            replace=args.replace,
            skip_header=not args.no_header,
            schema_path=args.schema.expanduser().resolve() if args.schema else None,
            on_log=logs.append if collect_logs else None,
        )
    except Exception as exc:
        # Top-level boundary: report any failure as an error result, not a traceback.
        logs.append(str(exc))
        return finish("error", 1)

    if project:
        destination = f"{project}:{dataset}.{table}"
    else:
        destination = f"{dataset}.{table}"
    logs.append(f"Uploaded {args.csv_path} to {destination}")
    return finish("success", 0)
146
+
147
+
148
def _run_config_show(_: argparse.Namespace) -> int:
    """Print the saved defaults, or a notice when none are saved. Returns 0."""
    saved = load_config()
    if saved:
        # Iterate CONFIG_KEYS so the output order is fixed.
        for key in CONFIG_KEYS:
            if key in saved:
                print(f"{key} = {saved[key]}")
        print(f"\nConfig file: {CONFIG_PATH}")
    else:
        print(f"No config saved at {CONFIG_PATH}")
    return 0
158
+
159
+
160
def _run_config_set(args: argparse.Namespace) -> int:
    """Save the project/dataset/table values given on the command line.

    Returns:
        ``0`` after saving, or ``2`` when no value was provided.
    """
    updates: dict[str, str] = {}
    for key in ("project", "dataset", "table"):
        value = getattr(args, key)
        if value:
            updates[key] = value

    if updates:
        save_config(updates)
        print(f"Saved defaults to {CONFIG_PATH}")
        return 0

    print("Provide at least one of --project, --dataset, or --table.", file=sys.stderr)
    return 2
176
+
177
+
178
def _run_config_unset(args: argparse.Namespace) -> int:
    """Remove the saved defaults whose flags were passed.

    Returns:
        ``0`` after removing, or ``2`` when no flag was provided.
    """
    selected = [key for key in CONFIG_KEYS if getattr(args, key)]
    if selected:
        unset_config(selected)
        print(f"Removed {', '.join(selected)} from {CONFIG_PATH}")
        return 0

    print("Provide at least one of --project, --dataset, or --table.", file=sys.stderr)
    return 2
186
+
187
+
188
def _run_config(argv: list[str]) -> int:
    """Parse ``bqcsv config ...`` arguments and run the selected sub-command."""
    parsed = _config_parser().parse_args(argv)
    handler = parsed.func
    return handler(parsed)
191
+
192
+
193
def main(argv: list[str] | None = None) -> int:
    """Entry point: route to ``config`` handling or to the upload command.

    Args:
        argv: Arguments without the program name; defaults to ``sys.argv[1:]``.

    Returns:
        The process exit code.
    """
    arguments = list(sys.argv[1:]) if argv is None else list(argv)
    # A leading "config" token selects the config sub-commands; anything else uploads.
    if arguments[:1] == ["config"]:
        return _run_config(arguments[1:])
    return _run_upload(arguments)
198
+
199
+
200
# Allow `python -m bqcsv.cli ...`; the return value of main() becomes the exit code.
if __name__ == "__main__":
    sys.exit(main())
bqcsv/config.py ADDED
@@ -0,0 +1,54 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import tomllib
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
# Directory holding the config file. Per the XDG Base Directory spec, an unset
# *or empty* XDG_CONFIG_HOME falls back to ~/.config; `or` covers both cases
# (a plain `.get(name, default)` would turn an empty value into a relative path).
CONFIG_DIR = Path(os.environ.get("XDG_CONFIG_HOME") or Path.home() / ".config") / "bqcsv"
CONFIG_PATH = CONFIG_DIR / "config.toml"

# Keys that may be stored in the config file, in display order.
CONFIG_KEYS = ("project", "dataset", "table")
12
+
13
+
14
def _ensure_config_dir() -> None:
    """Create the config directory, including parents, if it is missing."""
    CONFIG_DIR.mkdir(exist_ok=True, parents=True)
16
+
17
+
18
def load_config() -> dict[str, str]:
    """Read the saved defaults from the config file.

    Returns:
        A dict containing only the known config keys that are present with a
        truthy value, each converted to ``str``. Empty when the config file
        does not exist.
    """
    if not CONFIG_PATH.is_file():
        return {}

    with CONFIG_PATH.open("rb") as handle:
        parsed = tomllib.load(handle)

    settings: dict[str, str] = {}
    for key in CONFIG_KEYS:
        value = parsed.get(key)
        if value:
            settings[key] = str(value)
    return settings
24
+
25
+
26
def save_config(values: dict[str, str]) -> None:
    """Merge ``values`` into the saved defaults and rewrite the config file.

    Existing keys not present in ``values`` are kept; the config directory is
    created when needed.
    """
    _ensure_config_dir()
    merged = {**load_config(), **values}
    rendered = [
        f'{key} = "{_escape_toml(value)}"'
        for key, value in merged.items()
    ]
    CONFIG_PATH.write_text("\n".join(rendered) + "\n", encoding="utf-8")
32
+
33
+
34
def unset_config(keys: list[str]) -> None:
    """Remove ``keys`` from the saved defaults.

    Does nothing when no config file exists. When no settings remain the
    config file is deleted; otherwise it is rewritten without the removed keys.
    """
    if not CONFIG_PATH.is_file():
        return

    remaining = {
        key: value
        for key, value in load_config().items()
        if key not in keys
    }
    if remaining:
        rendered = [
            f'{key} = "{_escape_toml(value)}"'
            for key, value in remaining.items()
        ]
        CONFIG_PATH.write_text("\n".join(rendered) + "\n", encoding="utf-8")
    else:
        CONFIG_PATH.unlink(missing_ok=True)
45
+
46
+
47
def resolve_setting(cli_value: str | None, config: dict[str, str], key: str) -> str | None:
    """Return the command-line value when it is non-empty, else the saved one.

    Args:
        cli_value: Value from the command line; ``None`` or ``""`` means unset.
        config: Saved defaults.
        key: Config key to fall back to.

    Returns:
        The chosen value, or ``None`` when neither source provides one.
    """
    return cli_value or config.get(key)
51
+
52
+
53
def _escape_toml(value: str) -> str:
    """Escape ``value`` for use inside a double-quoted TOML basic string.

    Backslashes and double quotes are escaped, and so are control characters
    (newline, tab, carriage return, etc.). A raw newline inside a basic string
    is invalid TOML, so leaving it unescaped would write a config file that
    can no longer be loaded.

    Args:
        value: Raw setting value.

    Returns:
        The escaped text, without the surrounding quotes.
    """
    shorthand = {
        "\\": "\\\\",
        '"': '\\"',
        "\b": "\\b",
        "\t": "\\t",
        "\n": "\\n",
        "\f": "\\f",
        "\r": "\\r",
    }
    pieces: list[str] = []
    for char in value:
        if char in shorthand:
            pieces.append(shorthand[char])
        elif ord(char) < 0x20 or ord(char) == 0x7F:
            # Remaining control characters have no short form; use \uXXXX.
            pieces.append(f"\\u{ord(char):04X}")
        else:
            pieces.append(char)
    return "".join(pieces)
bqcsv/schema.py ADDED
@@ -0,0 +1,212 @@
1
+ from __future__ import annotations
2
+
3
+ import csv
4
+ import json
5
+ import re
6
+ import tempfile
7
+ from pathlib import Path
8
+
9
+ import pandas as pd
10
+ from google.cloud import bigquery
11
+
12
# Field delimiters considered during detection: comma, semicolon, tab, pipe.
_CANDIDATE_DELIMITERS = ",;\t|"
# Delimiter used when detection cannot decide.
DEFAULT_FIELD_DELIMITER = ","
# Number of characters read from the start of the file for detection.
_DELIMITER_SAMPLE_SIZE = 8192
# Optional minus sign followed by digits only.
_INTEGER_RE = re.compile(r"^-?\d+$")
# ISO-style calendar date without a time part (YYYY-MM-DD).
_DATE_ONLY_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$")
# Lower-cased spellings accepted as booleans.
_BOOLEAN_VALUES = frozenset({"true", "false"})
18
+
19
+
20
class SchemaError(Exception):
    """Raised for CSV/schema problems, e.g. a missing header or column-count mismatch."""
22
+
23
+
24
def _delimiter_count_in_line(line: str) -> dict[str, int]:
    """Count how often each candidate delimiter occurs in ``line``."""
    counts: dict[str, int] = {}
    for candidate in _CANDIDATE_DELIMITERS:
        counts[candidate] = line.count(candidate)
    return counts
26
+
27
+
28
def _delimiter_from_header_line(line: str) -> str | None:
    """Return the single most frequent candidate delimiter in ``line``.

    Returns:
        The delimiter, or ``None`` when no candidate occurs or when several
        candidates share the highest count.
    """
    counts = _delimiter_count_in_line(line)
    highest = max(counts.values())
    if highest == 0:
        return None
    leaders = [candidate for candidate, count in counts.items() if count == highest]
    # A tie is ambiguous, so report no clear winner.
    return leaders[0] if len(leaders) == 1 else None
37
+
38
+
39
def detect_field_delimiter(csv_path: Path) -> str:
    """Detect the field delimiter of a CSV file.

    Two signals are combined: the most frequent candidate delimiter in the
    first line, and the result of ``csv.Sniffer`` on the first
    ``_DELIMITER_SAMPLE_SIZE`` characters.

    Args:
        csv_path: CSV file to inspect; read as UTF-8.

    Returns:
        The detected delimiter, or ``DEFAULT_FIELD_DELIMITER`` when the sample
        is blank or neither signal yields a delimiter.
    """
    with csv_path.open(newline="", encoding="utf-8") as handle:
        sample = handle.read(_DELIMITER_SAMPLE_SIZE)
    if not sample.strip():
        return DEFAULT_FIELD_DELIMITER

    first_line = sample.splitlines()[0]
    header_delimiter = _delimiter_from_header_line(first_line)

    try:
        sniffed = csv.Sniffer().sniff(sample, delimiters=_CANDIDATE_DELIMITERS).delimiter
    except csv.Error:
        # Sniffer could not decide; fall back to the header-line result.
        return header_delimiter if header_delimiter is not None else DEFAULT_FIELD_DELIMITER

    if header_delimiter is None:
        return sniffed

    # Prefer whichever delimiter splits the first line into more columns;
    # the sniffed delimiter wins ties.
    sniffed_columns = first_line.count(sniffed) + 1
    header_columns = first_line.count(header_delimiter) + 1
    if header_columns > sniffed_columns:
        return header_delimiter
    return sniffed
61
+
62
+
63
def _non_empty_values(series: pd.Series) -> pd.Series:
    """Return the values of ``series`` as stripped strings, dropping blank ones.

    The original index labels of the kept values are preserved.
    """
    stripped = series.astype(str).str.strip()
    keep = stripped.ne("")
    return stripped.loc[keep]
66
+
67
+
68
def _infer_bq_type_for_series(series: pd.Series) -> str:
    """Infer a BigQuery type name for one CSV column.

    Checks run in this order: the pandas dtype (bool, integer, float,
    datetime), then the text of the non-blank values: integer pattern,
    true/false, numeric parse, ``YYYY-MM-DD`` date, datetime parse.

    Blank cells are ignored by the numeric and datetime parses as well, so a
    column such as ``["1.5", "", "2.5"]`` is FLOAT instead of being demoted to
    STRING by the blank cell.

    Args:
        series: Column values.

    Returns:
        One of ``"BOOLEAN"``, ``"INTEGER"``, ``"FLOAT"``, ``"TIMESTAMP"``,
        ``"DATE"`` or ``"STRING"``. A column with no non-blank values is
        ``"STRING"``.
    """
    values = _non_empty_values(series)
    if values.empty:
        return "STRING"

    dtype = series.dtype
    if pd.api.types.is_bool_dtype(dtype):
        return "BOOLEAN"
    if pd.api.types.is_integer_dtype(dtype):
        return "INTEGER"
    if pd.api.types.is_float_dtype(dtype):
        # Floats that are all whole numbers are treated as integers.
        if (series.dropna() % 1 == 0).all():
            return "INTEGER"
        return "FLOAT"
    if pd.api.types.is_datetime64_any_dtype(dtype):
        return "TIMESTAMP"

    if values.str.match(_INTEGER_RE).all():
        return "INTEGER"

    if values.str.lower().isin(_BOOLEAN_VALUES).all():
        return "BOOLEAN"

    # Cells that are neither missing nor blank; only these take part in the
    # numeric and datetime parses below.
    text = series.astype(str).str.strip()
    candidates = text[series.notna() & text.ne("")]

    if not candidates.empty:
        numeric = pd.to_numeric(candidates, errors="coerce")
        if numeric.notna().all():
            if (numeric % 1 == 0).all():
                return "INTEGER"
            return "FLOAT"

    if values.str.match(_DATE_ONLY_RE).all():
        return "DATE"

    if not candidates.empty:
        parsed = pd.to_datetime(candidates, errors="coerce", format="mixed")
        if parsed.notna().all():
            return "TIMESTAMP"

    return "STRING"
105
+
106
+
107
def load_schema_from_json(schema_path: Path) -> list[bigquery.SchemaField]:
    """Load a table schema from a JSON file.

    Args:
        schema_path: UTF-8 JSON file holding a list of field definitions, each
            passed to ``bigquery.SchemaField.from_api_repr``.

    Returns:
        The schema fields, in file order.
    """
    definitions = json.loads(schema_path.read_text(encoding="utf-8"))
    fields = []
    for definition in definitions:
        fields.append(bigquery.SchemaField.from_api_repr(definition))
    return fields
111
+
112
+
113
def read_csv_dataframe(
    csv_path: Path,
    *,
    field_delimiter: str,
    skip_header: bool,
    column_names: list[str] | None = None,
) -> pd.DataFrame:
    """Read a CSV file into a DataFrame of strings.

    Every cell is kept as text and empty cells stay empty strings (no NaN
    conversion).

    Args:
        csv_path: CSV file, read as UTF-8.
        field_delimiter: Column separator.
        skip_header: ``True`` when the first row is a header row.
        column_names: Column names to assign when the file has no header.

    Returns:
        The loaded DataFrame.

    Raises:
        SchemaError: When the file has no header and ``column_names`` is
            missing or its length differs from the number of CSV columns.
    """
    # "infer" is pandas' default header handling; None treats every row as data.
    frame = pd.read_csv(
        csv_path,
        sep=field_delimiter,
        encoding="utf-8",
        dtype=str,
        keep_default_na=False,
        header="infer" if skip_header else None,
    )
    if skip_header:
        return frame

    if column_names is None:
        raise SchemaError(
            "CSV has no header row. Create the destination table first or upload a CSV with a header."
        )
    csv_width = len(frame.columns)
    if len(column_names) != csv_width:
        raise SchemaError(
            f"CSV has {csv_width} column(s) but the destination table has "
            f"{len(column_names)} column(s)."
        )
    frame.columns = column_names
    return frame
142
+
143
+
144
def format_dataframe_for_bq_load(
    dataframe: pd.DataFrame,
    schema: list[bigquery.SchemaField],
) -> pd.DataFrame:
    """Return a copy of ``dataframe`` with values normalised for the load file.

    For each schema field that names a column of the frame:

    * TIMESTAMP columns become ``YYYY-MM-DD HH:MM:SS.ffffff`` text.
    * DATE columns become ``YYYY-MM-DD`` text.
    * INTEGER columns become plain integer text.

    Values that cannot be parsed become missing, so they are written as empty
    CSV fields. Other column types are left untouched.

    Args:
        dataframe: Source data; not modified.
        schema: Fields providing ``name`` and ``field_type``.

    Returns:
        The formatted copy.
    """
    formatted = dataframe.copy()
    for field in schema:
        if field.name not in formatted.columns:
            continue
        column = formatted[field.name]
        if field.field_type == "TIMESTAMP":
            parsed = pd.to_datetime(column, errors="coerce", format="mixed")
            formatted[field.name] = parsed.dt.strftime("%Y-%m-%d %H:%M:%S.%f")
        elif field.field_type == "DATE":
            parsed = pd.to_datetime(column, errors="coerce", format="mixed")
            formatted[field.name] = parsed.dt.strftime("%Y-%m-%d")
        elif field.field_type == "INTEGER":
            # Use the nullable "string" dtype: astype(str) would turn missing
            # integers into the literal text "<NA>" in the load file.
            formatted[field.name] = (
                pd.to_numeric(column, errors="coerce").astype("Int64").astype("string")
            )
    return formatted
164
+
165
+
166
def infer_bq_schema_from_csv(
    csv_path: Path,
    *,
    field_delimiter: str,
    skip_header: bool,
    column_names: list[str] | None = None,
) -> list[bigquery.SchemaField]:
    """Read the CSV and infer one schema field per column.

    Args:
        csv_path: CSV file to inspect.
        field_delimiter: Column separator.
        skip_header: ``True`` when the first row is a header row.
        column_names: Column names to use when the file has no header.

    Returns:
        Schema fields in column order, typed by ``_infer_bq_type_for_series``.
    """
    frame = read_csv_dataframe(
        csv_path,
        field_delimiter=field_delimiter,
        skip_header=skip_header,
        column_names=column_names,
    )

    fields = []
    for column in frame.columns:
        inferred_type = _infer_bq_type_for_series(frame[column])
        fields.append(bigquery.SchemaField(str(column), inferred_type))
    return fields
184
+
185
+
186
def format_schema(schema: list[bigquery.SchemaField]) -> str:
    """Render a schema as ``name:TYPE`` pairs separated by ``", "``."""
    pairs = [f"{field.name}:{field.field_type}" for field in schema]
    return ", ".join(pairs)
188
+
189
+
190
+ def schemas_match(
191
+ csv_schema: list[bigquery.SchemaField],
192
+ table_schema: list[bigquery.SchemaField],
193
+ ) -> bool:
194
+ if len(csv_schema) != len(table_schema):
195
+ return False
196
+ for csv_field, table_field in zip(csv_schema, table_schema, strict=True):
197
+ if csv_field.name != table_field.name:
198
+ return False
199
+ if csv_field.field_type.upper() != table_field.field_type.upper():
200
+ return False
201
+ return True
202
+
203
+
204
def write_schema_file(schema: list[bigquery.SchemaField]) -> Path:
    """Write ``schema`` as JSON to a new temporary ``.json`` file.

    The file is not deleted automatically; the caller is responsible for
    removing it.

    Returns:
        Path of the written file.
    """
    with tempfile.NamedTemporaryFile(
        mode="w",
        suffix=".json",
        delete=False,
        encoding="utf-8",
    ) as handle:
        api_fields = [field.to_api_repr() for field in schema]
        handle.write(json.dumps(api_fields))
        written_to = handle.name
    return Path(written_to)
bqcsv/table.py ADDED
@@ -0,0 +1,46 @@
1
+ from __future__ import annotations
2
+
3
+ from google.api_core.exceptions import NotFound
4
+ from google.cloud import bigquery
5
+
6
+ from bqcsv.schema import format_schema, schemas_match
7
+
8
+
9
class TableError(Exception):
    """Raised when the destination table cannot be used, e.g. on a schema mismatch."""
11
+
12
+
13
def table_id(*, project: str | None, dataset: str, table: str) -> str:
    """Build a dotted table identifier.

    Returns:
        ``project.dataset.table`` when ``project`` is given, otherwise
        ``dataset.table``.
    """
    qualified = f"{dataset}.{table}"
    return f"{project}.{qualified}" if project else qualified
17
+
18
+
19
def get_bq_client(project: str | None) -> bigquery.Client:
    """Create a BigQuery client, bound to ``project`` when one is given."""
    if project:
        return bigquery.Client(project=project)
    return bigquery.Client()
21
+
22
+
23
def ensure_table_exists(
    client: bigquery.Client,
    destination_table_id: str,
    schema: list[bigquery.SchemaField],
    *,
    replace: bool = False,
) -> list[bigquery.SchemaField]:
    """Create the destination table if needed and choose the schema to load with.

    Args:
        client: BigQuery client used to look up and create the table.
        destination_table_id: Identifier of the destination table.
        schema: Schema derived from the CSV (or supplied explicitly).
        replace: Whether the load will replace the table.

    Returns:
        ``schema`` when the table was just created or is about to be replaced
        with a different schema; otherwise the existing table's schema.

    Raises:
        TableError: When the table exists with a different schema and
            ``replace`` is false.
    """
    try:
        existing = client.get_table(destination_table_id)
    except NotFound:
        client.create_table(bigquery.Table(destination_table_id, schema=schema))
        return schema

    current_schema = list(existing.schema)
    if schemas_match(schema, current_schema):
        return current_schema
    if replace:
        # The load replaces the table, so the CSV schema wins.
        return schema
    raise TableError(
        "CSV schema does not match the destination table schema. "
        f"CSV: [{format_schema(schema)}]. "
        f"Table: [{format_schema(current_schema)}]. "
        "Use --replace to recreate the table."
    )
bqcsv/uploader.py ADDED
@@ -0,0 +1,193 @@
1
+ from __future__ import annotations
2
+
3
+ import shutil
4
+ import subprocess
5
+ import tempfile
6
+ from collections.abc import Callable
7
+ from pathlib import Path
8
+
9
+ from google.api_core.exceptions import NotFound
10
+
11
+ from bqcsv.schema import (
12
+ DEFAULT_FIELD_DELIMITER,
13
+ SchemaError,
14
+ detect_field_delimiter,
15
+ format_dataframe_for_bq_load,
16
+ format_schema,
17
+ infer_bq_schema_from_csv,
18
+ load_schema_from_json,
19
+ read_csv_dataframe,
20
+ write_schema_file,
21
+ )
22
+ from bqcsv.table import TableError, ensure_table_exists, get_bq_client, table_id
23
+
24
+
25
class UploadError(Exception):
    """Raised when an upload cannot be started or the ``bq load`` step fails."""
27
+
28
+
29
def ensure_bq_available() -> str:
    """Locate the ``bq`` executable on ``PATH``.

    Returns:
        The path reported by ``shutil.which``.

    Raises:
        UploadError: When ``bq`` is not found.
    """
    located = shutil.which("bq")
    if located:
        return located
    raise UploadError(
        "The `bq` CLI was not found on PATH. Install the Google Cloud SDK and run "
        "`gcloud auth login` before uploading."
    )
37
+
38
+
39
def _write_prepared_csv(
    dataframe,
    *,
    field_delimiter: str,
    skip_header: bool,
) -> Path:
    """Write ``dataframe`` to a new temporary ``.csv`` file for ``bq load``.

    The header row is written only when ``skip_header`` is true, mirroring
    the layout of the source file. The file is not deleted automatically.

    Returns:
        Path of the written file.
    """
    csv_options = {
        "index": False,
        "sep": field_delimiter,
        "header": skip_header,
    }
    with tempfile.NamedTemporaryFile(
        mode="w",
        suffix=".csv",
        delete=False,
        newline="",
        encoding="utf-8",
    ) as handle:
        dataframe.to_csv(handle, **csv_options)
        written_to = handle.name
    return Path(written_to)
59
+
60
+
61
def build_load_command(
    csv_path: Path,
    *,
    project: str | None,
    dataset: str,
    table: str,
    schema_path: Path,
    replace: bool = False,
    skip_header: bool = True,
    field_delimiter: str = DEFAULT_FIELD_DELIMITER,
) -> list[str]:
    """Assemble the ``bq load`` argument list.

    Args:
        csv_path: CSV file to load.
        project: Optional project ID; adds ``--project_id`` and prefixes the
            destination.
        dataset: Destination dataset.
        table: Destination table.
        schema_path: JSON schema file passed as the last argument.
        replace: Adds ``--replace``.
        skip_header: Adds ``--skip_leading_rows=1``.
        field_delimiter: Value for ``--field_delimiter``.

    Returns:
        The command as a list: flags, then destination, CSV path, schema path.
    """
    destination = f"{dataset}.{table}"
    if project:
        destination = f"{project}:{destination}"

    flags = ["--source_format=CSV", "--noautodetect"]
    if project:
        flags += ["--project_id", project]
    flags.append(f"--field_delimiter={field_delimiter}")
    if skip_header:
        flags.append("--skip_leading_rows=1")
    if replace:
        flags.append("--replace")

    return ["bq", "load", *flags, destination, str(csv_path), str(schema_path)]
85
+
86
+
87
def _log(on_log: Callable[[str], None] | None, message: str) -> None:
    """Pass ``message`` to ``on_log``; do nothing when no callback is set."""
    if on_log is None:
        return
    on_log(message)
90
+
91
+
92
def upload_csv(
    csv_path: Path,
    *,
    project: str | None,
    dataset: str,
    table: str,
    replace: bool = False,
    skip_header: bool = True,
    schema_path: Path | None = None,
    on_log: Callable[[str], None] | None = None,
) -> None:
    """Upload a CSV file to a BigQuery table with ``bq load``.

    Steps: detect the delimiter, determine the schema (explicit file or
    inferred), make sure the destination table exists, write a normalised
    temporary CSV plus schema file, and run ``bq load`` on them. Temporary
    files are removed afterwards.

    Args:
        csv_path: CSV file to upload.
        project: Optional GCP project ID.
        dataset: Destination dataset.
        table: Destination table.
        replace: Replace the table instead of appending.
        skip_header: ``True`` when the first CSV row is a header row.
        schema_path: Optional JSON schema file; disables inference.
        on_log: Optional callback receiving progress messages. When set, the
            output of ``bq load`` is captured and forwarded to it, including
            when the load fails; otherwise ``bq`` writes to the terminal.

    Raises:
        UploadError: When ``bq`` or an input file is missing, the schema or
            table check fails, or ``bq load`` exits with a non-zero status.
    """
    bq_path = ensure_bq_available()
    if not csv_path.is_file():
        raise UploadError(f"CSV file not found: {csv_path}")
    if schema_path is not None and not schema_path.is_file():
        raise UploadError(f"Schema file not found: {schema_path}")

    try:
        field_delimiter = detect_field_delimiter(csv_path)
        _log(on_log, f"Detected field delimiter: {field_delimiter!r}")
        destination_table_id = table_id(project=project, dataset=dataset, table=table)
        client = get_bq_client(project)

        explicit_schema = (
            load_schema_from_json(schema_path) if schema_path is not None else None
        )

        # Without a header row the column names must come from the explicit
        # schema or from the existing destination table.
        column_names: list[str] | None = None
        if not skip_header:
            if explicit_schema is not None:
                column_names = [field.name for field in explicit_schema]
            else:
                try:
                    existing_table = client.get_table(destination_table_id)
                except NotFound as exc:
                    raise UploadError(
                        "CSV has no header row and destination table does not exist. "
                        "Upload a CSV with a header or create the table first."
                    ) from exc
                column_names = [field.name for field in existing_table.schema]

        if explicit_schema is not None:
            csv_schema = explicit_schema
            _log(on_log, f"Using schema from {schema_path}")
        else:
            csv_schema = infer_bq_schema_from_csv(
                csv_path,
                field_delimiter=field_delimiter,
                skip_header=skip_header,
                column_names=column_names,
            )
            _log(on_log, f"Inferred schema: [{format_schema(csv_schema)}]")

        load_schema = ensure_table_exists(
            client,
            destination_table_id,
            csv_schema,
            replace=replace,
        )
        _log(on_log, f"Destination table ready: {destination_table_id}")

        dataframe = read_csv_dataframe(
            csv_path,
            field_delimiter=field_delimiter,
            skip_header=skip_header,
            column_names=column_names,
        )
        prepared = format_dataframe_for_bq_load(dataframe, load_schema)

        # Track temp files as they are created so that all of them are removed
        # even if a later step (e.g. writing the schema file) fails.
        temp_paths: list[Path] = []
        try:
            prepared_csv_path = _write_prepared_csv(
                prepared,
                field_delimiter=field_delimiter,
                skip_header=skip_header,
            )
            temp_paths.append(prepared_csv_path)
            temp_schema_path = write_schema_file(load_schema)
            temp_paths.append(temp_schema_path)

            cmd = build_load_command(
                prepared_csv_path,
                project=project,
                dataset=dataset,
                table=table,
                schema_path=temp_schema_path,
                replace=replace,
                skip_header=skip_header,
                field_delimiter=field_delimiter,
            )
            _log(on_log, f"Running: {' '.join(cmd)}")
            # Execute the exact executable located above rather than
            # re-resolving the bare name "bq".
            run_cmd = [bq_path, *cmd[1:]]
            try:
                if on_log is not None:
                    result = subprocess.run(
                        run_cmd, check=True, capture_output=True, text=True
                    )
                    for captured in (result.stdout, result.stderr):
                        if captured.strip():
                            _log(on_log, captured.rstrip())
                else:
                    subprocess.run(run_cmd, check=True)
            except subprocess.CalledProcessError as exc:
                # Forward whatever bq printed, so the reason for the failure is
                # not lost when output was captured (None when not captured).
                for captured in (exc.stdout, exc.stderr):
                    if captured and captured.strip():
                        _log(on_log, captured.rstrip())
                raise UploadError(
                    f"`bq load` failed with exit code {exc.returncode}"
                ) from exc
        finally:
            for temp_path in temp_paths:
                temp_path.unlink(missing_ok=True)
    except (SchemaError, TableError) as exc:
        raise UploadError(str(exc)) from exc
@@ -0,0 +1,137 @@
1
+ Metadata-Version: 2.4
2
+ Name: bqcsv
3
+ Version: 1.0.0
4
+ Summary: Upload a local CSV file to a BigQuery table via the bq CLI
5
+ Requires-Python: >=3.10
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: google-cloud-bigquery>=3.0
8
+ Requires-Dist: pandas>=2.0
9
+
10
+ # bqcsv
11
+
12
+ Upload a local CSV file to BigQuery using the `bq` CLI and your existing `gcloud` authentication.
13
+
14
+ ## Why a dedicated CLI tool?
15
+
16
+ Out of the box, Google's `bq` CLI cannot create a table with column names inferred from a CSV file.
17
+
18
+ `bqcsv` fixes that:
19
+
20
+ * detects the schema from the CSV file
21
+ * creates a table with proper column names and types
22
+ * loads the CSV file using `bq load`
23
+
24
+ ## Authentication
25
+
26
+ No additional authentication is needed.
27
+
28
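+ `bqcsv` uses your existing authentication via `gcloud auth login` for the `bq load` step. Table lookup and creation go through the `google-cloud-bigquery` client library, which reads Application Default Credentials; if those are not set up yet, also run `gcloud auth application-default login`.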
29
+
30
+ ## Requirements
31
+
32
+ - Python 3.11+ (`bqcsv/config.py` imports the standard-library `tomllib` module, which was added in Python 3.11)
33
+ - [Google Cloud SDK](https://cloud.google.com/sdk) with `bq` on your `PATH`
34
+
35
+ ## How to use `bqcsv`
36
+
37
+ ### Upload a CSV file to a table
38
+
39
+ To upload a CSV file, specify your project ID, dataset ID, and table name:
40
+
41
+ ```bash
42
+ bqcsv data.csv --project my-gcp-project --dataset staging --table events_raw
43
+ ```
44
+
45
+ The `--table` argument is optional. By default, `bqcsv` derives the table name from the CSV file:
46
+
47
+ ```bash
48
+ bqcsv data.csv --project my-gcp-project --dataset staging
49
+
50
+ # is identical to
51
+
52
+ bqcsv data.csv --project my-gcp-project --dataset staging --table data
53
+ ```
54
+
55
+ ### Saving your configuration
56
+
57
+ To avoid passing `--project`, `--dataset`, or `--table` on every run, save them to your local config:
58
+
59
+ ```bash
60
+ bqcsv config set --project my-gcp-project --dataset analytics --table events
61
+ bqcsv config show
62
+ ```
63
+
64
+ Defaults are stored in `~/.config/bqcsv/config.toml`, or in `$XDG_CONFIG_HOME/bqcsv/config.toml` when the `XDG_CONFIG_HOME` environment variable is set.
65
+
66
+ After you set your defaults, you can call `bqcsv` with only the CSV path:
67
+
68
+ ```bash
69
+ bqcsv data.csv
70
+ ```
71
+
72
+ If you have not set a default `--table` value, the table name is derived from the CSV file.
73
+
74
+ ## Development
75
+
76
+ ### Install from your local repo
77
+
78
+ ```bash
79
+ pip install -e .
80
+ ```
81
+
82
+ ### Testing
83
+
84
+ To delete a test table, use `bq`:
85
+
86
+ ```bash
87
+ bq rm -f -t PROJECT_ID:DATASET_ID.TABLE_NAME
88
+ ```
89
+
90
+ You can run the module directly when working on a new feature or fixing a bug:
91
+
92
+ ```sh
93
+ python -m bqcsv.cli config set --project PROJECT_ID --dataset DATASET_ID --table TEST_TABLE_NAME
94
+ ```
95
+
96
+ ## Releasing to PyPI
97
+
98
+ 1. **Bump the version** in both places (they must match):
99
+ - `pyproject.toml` → `[project].version`
100
+ - `bqcsv/__init__.py` → `__version__`
101
+
102
+ 2. **Install build tools** (one-time):
103
+
104
+ ```bash
105
+ pip install build twine
106
+ ```
107
+
108
+ 3. **Run tests** and commit the version bump.
109
+
110
+ 4. **Build the package**:
111
+
112
+ ```bash
113
+ python -m build
114
+ ```
115
+
116
+ This creates `dist/bqcsv-<version>.tar.gz` and `dist/bqcsv-<version>-py3-none-any.whl`.
117
+
118
+ 5. **Upload to PyPI**:
119
+
120
+ ```bash
121
+ twine upload dist/*
122
+ ```
123
+
124
+ On first upload, create an account at [pypi.org](https://pypi.org) and use an [API token](https://pypi.org/help/#apitoken) as the password (`__token__` as the username).
125
+
126
+ 6. **Tag the release** (optional but recommended):
127
+
128
+ ```bash
129
+ git tag v1.0.0
129
+ git push origin v1.0.0
131
+ ```
132
+
133
+ After publishing, users can install the new version with:
134
+
135
+ ```bash
136
+ pip install --upgrade bqcsv
137
+ ```
@@ -0,0 +1,11 @@
1
+ bqcsv/__init__.py,sha256=J-j-u0itpEFT6irdmWmixQqYMadNl1X91TxUmoiLHMI,22
2
+ bqcsv/cli.py,sha256=drFb5yv_2R-C2Pkzz5hu7goN72EQCQ8MWgg8DrdycMg,6425
3
+ bqcsv/config.py,sha256=TA2k1eeVa041S777RTM9fbyBW-QP_sjgt4IApVOPf8Y,1569
4
+ bqcsv/schema.py,sha256=TdNeatvKCONZmOd8PSYNE4UPCjz6fCFE-VuS5cI9G4Q,6602
5
+ bqcsv/table.py,sha256=rfoA0v3tyUzCKhgsN0BQMBWdLDyZVH-eUpfxSRajM7s,1338
6
+ bqcsv/uploader.py,sha256=RxAPwWwAunh7HAGrPU0lHcmklznIJ0gPuhWRV0P3Nhk,6111
7
+ bqcsv-1.0.0.dist-info/METADATA,sha256=eMXIhk6WpbFYYb8HyzmnCLCCuQ_4cNRdvtesBXtv69k,3139
8
+ bqcsv-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
9
+ bqcsv-1.0.0.dist-info/entry_points.txt,sha256=3p_cZqU8eHNBnnkGSKkdbDXdXUEVL19gPOJeZGQ6hE8,41
10
+ bqcsv-1.0.0.dist-info/top_level.txt,sha256=iVTsmLuF2tHvlgDaGBA-NOFo8UwVRgUETjKPVYlWfHU,6
11
+ bqcsv-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ bqcsv = bqcsv.cli:main
@@ -0,0 +1 @@
1
+ bqcsv