pgsalesgen 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Rio Fujita
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,157 @@
1
+ Metadata-Version: 2.4
2
+ Name: pgsalesgen
3
+ Version: 0.1.0
4
+ Summary: Generate sales data for PostgreSQL
5
+ Project-URL: Homepage, https://github.com/rioriost/homebrew-pgsalesgen
6
+ Project-URL: Issues, https://github.com/rioriost/homebrew-pgsalesgen/issues
7
+ Author-email: Rio Fujita <rifujita@microsoft.com>
8
+ License: MIT License
9
+
10
+ Copyright (c) 2026 Rio Fujita
11
+
12
+ Permission is hereby granted, free of charge, to any person obtaining a copy
13
+ of this software and associated documentation files (the "Software"), to deal
14
+ in the Software without restriction, including without limitation the rights
15
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16
+ copies of the Software, and to permit persons to whom the Software is
17
+ furnished to do so, subject to the following conditions:
18
+
19
+ The above copyright notice and this permission notice shall be included in all
20
+ copies or substantial portions of the Software.
21
+
22
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28
+ SOFTWARE.
29
+ License-File: LICENSE
30
+ Requires-Python: >=3.14
31
+ Requires-Dist: numpy>=2.4.2
32
+ Requires-Dist: psycopg>=3.3.2
33
+ Description-Content-Type: text/markdown
34
+
35
+ # PostgreSQL Sales Generator
36
+
37
+ ![License](https://img.shields.io/badge/license-MIT-blue.svg)
38
+ ![Python](https://img.shields.io/badge/Python-3.14%2B-blue)
39
+
40
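+ Generates large volumes of sales-like sample data (customers, products, orders, and order items) in an empty PostgreSQL database as fast as possible.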
41
+
42
+ ## Table of Contents
43
+
44
+ - [Prerequisites](#prerequisites)
45
+ - [Install](#install)
46
+ - [Usage](#usage)
47
+ - [Release Notes](#release-notes)
48
+ - [License](#license)
52
+
53
+ ## Prerequisites
54
+
55
+ - Python 3.14 and above
56
+ - This module runs on [psycopg](https://www.psycopg.org/)
57
+
58
+ ## Install
59
+
60
+ - with brew
61
+
62
+ ```bash
63
+ brew tap rioriost/pg-salesgen
64
+ brew install pg-salesgen
65
+ ```
66
+
67
+ ## Usage
68
+
69
+ ```
70
+ usage: pg-salesgen [--help] [--dsn DSN] [-h HOST] [-p PORT] [-U USER]
71
+ [-d DBNAME] [--password PASSWORD] [--sslmode SSLMODE]
72
+ [--options OPTIONS] [--print-psql] [--logged] [--with-fk]
73
+ [--create-indexes] [--customers CUSTOMERS]
74
+ [--products PRODUCTS] [--target-gb TARGET_GB]
75
+ [--workers WORKERS] [--batch-orders BATCH_ORDERS]
76
+ [--avg-items AVG_ITEMS] [--max-items MAX_ITEMS]
77
+ [--order-note-len ORDER_NOTE_LEN]
78
+ [--item-note-len ITEM_NOTE_LEN] [--start-date START_DATE]
79
+ [--end-date END_DATE] [--unit-price-min UNIT_PRICE_MIN]
80
+ [--unit-price-max UNIT_PRICE_MAX] [--tax-rate TAX_RATE]
81
+ [--shipping-threshold SHIPPING_THRESHOLD]
82
+ [--shipping-fee SHIPPING_FEE]
83
+ [--progress-interval PROGRESS_INTERVAL] [--seed SEED]
84
+ [--order-id-stride ORDER_ID_STRIDE]
85
+ [--copy-orders-buf-mb COPY_ORDERS_BUF_MB]
86
+ [--copy-items-buf-mb COPY_ITEMS_BUF_MB]
87
+ [--join-timeout-sec JOIN_TIMEOUT_SEC]
88
+
89
+ Empty DB -> create schema/tables -> fill masters -> generate sales-like data FAST.
90
+
91
+ options:
92
+ --help, -? show this help message and exit
93
+ --dsn DSN libpq DSN. Overrides -h/-p/-U/-d.
94
+ -h, --host HOST database server host or socket directory (psql
95
+ compatible).
96
+ -p, --port PORT database server port (psql compatible).
97
+ -U, --user USER database user name (psql compatible).
98
+ -d, --dbname DBNAME database name (psql compatible).
99
+ --password PASSWORD database password (or use PGPASSWORD env / .pgpass).
100
+ --sslmode SSLMODE sslmode (require, verify-full, etc.).
101
+ --options OPTIONS libpq options string (e.g., "-c statement_timeout=0").
102
+ --print-psql Print equivalent psql command and exit.
103
+ --logged Create LOGGED tables (default UNLOGGED for speed).
104
+ --with-fk Create foreign keys (slower).
105
+ --create-indexes Create typical indexes + ANALYZE after load.
106
+ --customers CUSTOMERS
107
+ Number of customers to generate.
108
+ --products PRODUCTS Number of products to generate.
109
+ --target-gb TARGET_GB
110
+ Target size (orders+items) in GB.
111
+ --workers WORKERS Number of worker processes.
112
+ --batch-orders BATCH_ORDERS
113
+ Orders per batch per worker.
114
+ --avg-items AVG_ITEMS
115
+ Average number of items per order.
116
+ --max-items MAX_ITEMS
117
+ Max items per order.
118
+ --order-note-len ORDER_NOTE_LEN
119
+ Order note length.
120
+ --item-note-len ITEM_NOTE_LEN
121
+ Item note length.
122
+ --start-date START_DATE
123
+ Start date YYYY-MM-DD (UTC).
124
+ --end-date END_DATE End date YYYY-MM-DD (UTC).
125
+ --unit-price-min UNIT_PRICE_MIN
126
+ Min unit price (cents).
127
+ --unit-price-max UNIT_PRICE_MAX
128
+ Max unit price (cents).
129
+ --tax-rate TAX_RATE Tax rate (e.g., 0.10).
130
+ --shipping-threshold SHIPPING_THRESHOLD
131
+ Free shipping threshold (cents).
132
+ --shipping-fee SHIPPING_FEE
133
+ Shipping fee under threshold (cents).
134
+ --progress-interval PROGRESS_INTERVAL
135
+ Seconds between progress prints.
136
+ --seed SEED Base RNG seed.
137
+ --order-id-stride ORDER_ID_STRIDE
138
+ Per-worker order_id stride (must exceed total orders
139
+ per worker).
140
+ --copy-orders-buf-mb COPY_ORDERS_BUF_MB
141
+ COPY buffer for orders (MB).
142
+ --copy-items-buf-mb COPY_ITEMS_BUF_MB
143
+ COPY buffer for items (MB).
144
+ --join-timeout-sec JOIN_TIMEOUT_SEC
145
+ If >0, timeout seconds for joining each worker. 0
146
+ means wait indefinitely.
147
+
148
+ ```
149
+
150
+ ## Release Notes
151
+
152
+ ### 0.1.0 Release
153
+ - Initial release
154
+
155
+ ## License
156
+
157
+ MIT License
@@ -0,0 +1,123 @@
1
+ # PostgreSQL Sales Generator
2
+
3
+ ![License](https://img.shields.io/badge/license-MIT-blue.svg)
4
+ ![Python](https://img.shields.io/badge/Python-3.14%2B-blue)
5
+
6
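+ Generates large volumes of sales-like sample data (customers, products, orders, and order items) in an empty PostgreSQL database as fast as possible.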
7
+
8
+ ## Table of Contents
9
+
10
+ - [Prerequisites](#prerequisites)
11
+ - [Install](#install)
12
+ - [Usage](#usage)
13
+ - [Release Notes](#release-notes)
14
+ - [License](#license)
18
+
19
+ ## Prerequisites
20
+
21
+ - Python 3.14 and above
22
+ - This module runs on [psycopg](https://www.psycopg.org/)
23
+
24
+ ## Install
25
+
26
+ - with brew
27
+
28
+ ```bash
29
+ brew tap rioriost/pg-salesgen
30
+ brew install pg-salesgen
31
+ ```
32
+
33
+ ## Usage
34
+
35
+ ```
36
+ usage: pg-salesgen [--help] [--dsn DSN] [-h HOST] [-p PORT] [-U USER]
37
+ [-d DBNAME] [--password PASSWORD] [--sslmode SSLMODE]
38
+ [--options OPTIONS] [--print-psql] [--logged] [--with-fk]
39
+ [--create-indexes] [--customers CUSTOMERS]
40
+ [--products PRODUCTS] [--target-gb TARGET_GB]
41
+ [--workers WORKERS] [--batch-orders BATCH_ORDERS]
42
+ [--avg-items AVG_ITEMS] [--max-items MAX_ITEMS]
43
+ [--order-note-len ORDER_NOTE_LEN]
44
+ [--item-note-len ITEM_NOTE_LEN] [--start-date START_DATE]
45
+ [--end-date END_DATE] [--unit-price-min UNIT_PRICE_MIN]
46
+ [--unit-price-max UNIT_PRICE_MAX] [--tax-rate TAX_RATE]
47
+ [--shipping-threshold SHIPPING_THRESHOLD]
48
+ [--shipping-fee SHIPPING_FEE]
49
+ [--progress-interval PROGRESS_INTERVAL] [--seed SEED]
50
+ [--order-id-stride ORDER_ID_STRIDE]
51
+ [--copy-orders-buf-mb COPY_ORDERS_BUF_MB]
52
+ [--copy-items-buf-mb COPY_ITEMS_BUF_MB]
53
+ [--join-timeout-sec JOIN_TIMEOUT_SEC]
54
+
55
+ Empty DB -> create schema/tables -> fill masters -> generate sales-like data FAST.
56
+
57
+ options:
58
+ --help, -? show this help message and exit
59
+ --dsn DSN libpq DSN. Overrides -h/-p/-U/-d.
60
+ -h, --host HOST database server host or socket directory (psql
61
+ compatible).
62
+ -p, --port PORT database server port (psql compatible).
63
+ -U, --user USER database user name (psql compatible).
64
+ -d, --dbname DBNAME database name (psql compatible).
65
+ --password PASSWORD database password (or use PGPASSWORD env / .pgpass).
66
+ --sslmode SSLMODE sslmode (require, verify-full, etc.).
67
+ --options OPTIONS libpq options string (e.g., "-c statement_timeout=0").
68
+ --print-psql Print equivalent psql command and exit.
69
+ --logged Create LOGGED tables (default UNLOGGED for speed).
70
+ --with-fk Create foreign keys (slower).
71
+ --create-indexes Create typical indexes + ANALYZE after load.
72
+ --customers CUSTOMERS
73
+ Number of customers to generate.
74
+ --products PRODUCTS Number of products to generate.
75
+ --target-gb TARGET_GB
76
+ Target size (orders+items) in GB.
77
+ --workers WORKERS Number of worker processes.
78
+ --batch-orders BATCH_ORDERS
79
+ Orders per batch per worker.
80
+ --avg-items AVG_ITEMS
81
+ Average number of items per order.
82
+ --max-items MAX_ITEMS
83
+ Max items per order.
84
+ --order-note-len ORDER_NOTE_LEN
85
+ Order note length.
86
+ --item-note-len ITEM_NOTE_LEN
87
+ Item note length.
88
+ --start-date START_DATE
89
+ Start date YYYY-MM-DD (UTC).
90
+ --end-date END_DATE End date YYYY-MM-DD (UTC).
91
+ --unit-price-min UNIT_PRICE_MIN
92
+ Min unit price (cents).
93
+ --unit-price-max UNIT_PRICE_MAX
94
+ Max unit price (cents).
95
+ --tax-rate TAX_RATE Tax rate (e.g., 0.10).
96
+ --shipping-threshold SHIPPING_THRESHOLD
97
+ Free shipping threshold (cents).
98
+ --shipping-fee SHIPPING_FEE
99
+ Shipping fee under threshold (cents).
100
+ --progress-interval PROGRESS_INTERVAL
101
+ Seconds between progress prints.
102
+ --seed SEED Base RNG seed.
103
+ --order-id-stride ORDER_ID_STRIDE
104
+ Per-worker order_id stride (must exceed total orders
105
+ per worker).
106
+ --copy-orders-buf-mb COPY_ORDERS_BUF_MB
107
+ COPY buffer for orders (MB).
108
+ --copy-items-buf-mb COPY_ITEMS_BUF_MB
109
+ COPY buffer for items (MB).
110
+ --join-timeout-sec JOIN_TIMEOUT_SEC
111
+ If >0, timeout seconds for joining each worker. 0
112
+ means wait indefinitely.
113
+
114
+ ```
115
+
116
+ ## Release Notes
117
+
118
+ ### 0.1.0 Release
119
+ - Initial release
120
+
121
+ ## License
122
+
123
+ MIT License
@@ -0,0 +1,41 @@
1
+ [project]
2
+ name = "pgsalesgen"
3
+ authors = [
4
+ { name = "Rio Fujita", email = "rifujita@microsoft.com" }
5
+ ]
6
+ version = "0.1.0"
7
+ license = {file = "LICENSE"}
8
+ description = "Generate sales data for PostgreSQL"
9
+ readme = "README.md"
10
+
11
+ requires-python = ">=3.14"
12
+ dependencies = [
13
+ "numpy>=2.4.2",
14
+ "psycopg>=3.3.2",
15
+ ]
16
+
17
+ [project.urls]
18
+ Homepage = "https://github.com/rioriost/homebrew-pgsalesgen"
19
+ Issues = "https://github.com/rioriost/homebrew-pgsalesgen/issues"
20
+
21
+ [project.scripts]
22
+ pg-salesgen = "pgsalesgen.main:main"
23
+
24
+ [build-system]
25
+ requires = ["hatchling"]
26
+ build-backend = "hatchling.build"
27
+
28
+ [tool.hatch.build.targets.wheel]
29
+ packages = ["src/pgsalesgen"]
30
+
31
+ [tool.hatch.build.targets.sdist]
32
+ include = [
33
+ "src/pgsalesgen/*.py",
34
+ "*.py",
35
+ ]
36
+ exclude = [
37
+ "pgsalesgen.rb",
38
+ "uv.lock",
39
+ "dist/.DS_Store",
40
+ ".envrc",
41
+ ]
File without changes
@@ -0,0 +1,918 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ pgsalesgen.py
4
+
5
+ Empty DB -> create schema/tables -> fill masters -> generate ~target GB of sales-like data FAST.
6
+
7
+ Speed upgrades in this version:
8
+ - COPY BINARY payload is built by struct.pack_into() into a big bytearray (min allocations)
9
+ - channel/status are kept as uint8 indices (no massive Python list[str] creation)
10
+ - cp.write() is called with large memoryview chunks (flush by bytes, not rows)
11
+
12
+ Also fixed:
13
+ - Previous version could hang after printing:
14
+ [done] target reached; stopping workers...
15
+ because workers might be blocked on q.get() and never see the stop condition,
16
+ while the coordinator only sent one None per worker after setting stop_evt.
17
+ This version:
18
+ - uses sentinel None to stop workers reliably
19
+ - does not rely on stop_evt for the normal stop path
20
+ - closes/join_thread() on the Queue to avoid lingering feeder threads
21
+ - joins workers without a short timeout (or uses a generous one)
22
+
23
+ psql-compatible flags:
24
+ - -h host, -p port, -U user, -d dbname
25
+ (argparse help is remapped to --help / -?)
26
+
27
+ Usage:
28
+ pg-salesgen -h localhost -p 5432 -U postgres -d emptydb --target-gb 10 --workers 8
29
+
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import argparse
35
+ import os
36
+ import struct
37
+ import sys
38
+ import time
39
+ from dataclasses import dataclass
40
+ from datetime import datetime, timezone
41
+ from multiprocessing import Event, Process, Queue
42
+
43
+ import numpy as np
44
+ import psycopg
45
+
46
+ # -----------------------------
47
+ # PostgreSQL COPY BINARY helpers
48
+ # -----------------------------
49
# 11-byte magic signature written at the start of each COPY ... (FORMAT BINARY) stream.
PGCOPY_SIGNATURE = b"PGCOPY\n\xff\r\n\0"
# Timezone-aware reference instants used to convert Unix seconds into
# offsets counted from 2000-01-01 UTC.
EPOCH_2000 = datetime(2000, 1, 1, tzinfo=timezone.utc)
UNIX_EPOCH = datetime(1970, 1, 1, tzinfo=timezone.utc)

# Pre-compiled network-byte-order ("!") packers for signed 16/32/64-bit integers.
S_I16 = struct.Struct("!h")
S_I32 = struct.Struct("!i")
S_I64 = struct.Struct("!q")
56
+
57
+
58
+ # -----------------------------
59
+ # Connection (psql-compatible)
60
+ # -----------------------------
61
def build_libpq_dsn(args) -> str:
    """Build a libpq keyword/value connection string from parsed CLI arguments.

    Args:
        args: Namespace-like object exposing ``dsn``, ``host``, ``port``,
            ``user``, ``dbname``, ``password``, ``sslmode`` and ``options``.

    Returns:
        ``args.dsn`` unchanged when it is set; otherwise a space-separated
        ``key=value`` string containing only the options that are truthy.
        An empty string is returned when nothing is set.
    """
    if args.dsn:
        return args.dsn

    def quote(value) -> str:
        # libpq requires single quotes around empty values and values with
        # whitespace; quotes and backslashes inside must be backslash-escaped.
        # Plain values are emitted untouched so typical DSNs look as before.
        text = str(value)
        if text and not any(ch.isspace() or ch in "'\\" for ch in text):
            return text
        escaped = text.replace("\\", "\\\\").replace("'", "\\'")
        return f"'{escaped}'"

    keywords = ("host", "port", "user", "dbname", "password", "sslmode", "options")
    return " ".join(
        f"{key}={quote(value)}"
        for key in keywords
        if (value := getattr(args, key))
    )
82
+
83
+
84
def psql_equivalent_cmd(args) -> str:
    """Render a copy-pasteable ``psql`` command line matching the CLI arguments.

    Args:
        args: Namespace-like object exposing ``host``, ``port``, ``user``,
            ``dbname``, ``password``, ``sslmode`` and ``options``.

    Returns:
        A shell command string. The password is never echoed: it is shown as
        ``PGPASSWORD='***'``. ``sslmode`` and ``options`` are passed through
        the ``PGSSLMODE`` / ``PGOPTIONS`` environment-variable prefix.
    """
    import shlex  # local import: only this diagnostic helper needs it

    def env_quote(value: str) -> str:
        # Always single-quote (same look as before for ordinary values) and
        # close/reopen the quotes around any embedded single quote.
        return "'" + str(value).replace("'", "'\"'\"'") + "'"

    cmd = ["psql"]
    flag_values = (
        ("-h", args.host),
        ("-p", args.port),
        ("-U", args.user),
        ("-d", args.dbname),
    )
    for flag, value in flag_values:
        if value:
            # shlex.quote leaves shell-safe words untouched and quotes the rest.
            cmd += [flag, shlex.quote(str(value))]

    env = []
    if args.password:
        env.append("PGPASSWORD='***'")
    if args.sslmode:
        env.append(f"PGSSLMODE={env_quote(args.sslmode)}")
    if args.options:
        env.append(f"PGOPTIONS={env_quote(args.options)}")

    return " ".join(env + cmd)
103
+
104
+
105
+ # -----------------------------
106
+ # Schema / DDL
107
+ # -----------------------------
108
# SQL script that (re)creates the `sales` schema: the four tables are dropped
# if present and created again. Two placeholders are filled via str.format():
#   {create_table} - the CREATE TABLE / CREATE UNLOGGED TABLE keyword
#   {fk}           - optional ALTER TABLE ... FOREIGN KEY statements (may be "")
DDL_TEMPLATE = """
CREATE SCHEMA IF NOT EXISTS sales;

-- Drop in dependency order
DROP TABLE IF EXISTS sales.order_items;
DROP TABLE IF EXISTS sales.orders;
DROP TABLE IF EXISTS sales.products;
DROP TABLE IF EXISTS sales.customers;

{create_table} sales.customers (
customer_id bigserial PRIMARY KEY,
created_at timestamptz NOT NULL DEFAULT now(),
name text NOT NULL,
email text NOT NULL,
region text NOT NULL
);

{create_table} sales.products (
product_id bigserial PRIMARY KEY,
sku text NOT NULL,
name text NOT NULL,
category text NOT NULL,
price_cents int NOT NULL
);

-- order_id is BIGINT and provided by generator (worker range), not serial.
{create_table} sales.orders (
order_id bigint PRIMARY KEY,
ordered_at timestamptz NOT NULL,
customer_id bigint NOT NULL,
channel text NOT NULL,
status text NOT NULL,
subtotal_cents int NOT NULL,
tax_cents int NOT NULL,
shipping_cents int NOT NULL,
total_cents int NOT NULL,
note text NOT NULL
);

{create_table} sales.order_items (
order_item_id bigserial PRIMARY KEY,
order_id bigint NOT NULL,
product_id bigint NOT NULL,
qty int NOT NULL,
unit_price_cents int NOT NULL,
line_total_cents int NOT NULL,
note text NOT NULL
);

{fk}
"""
159
+
160
+
161
def create_schema_and_tables(conn, logged: bool, with_fk: bool) -> None:
    """Run the schema DDL on ``conn`` and commit.

    Args:
        conn: Open database connection offering ``execute()`` and ``commit()``.
        logged: When true the tables are created with ``CREATE TABLE``;
            otherwise ``CREATE UNLOGGED TABLE`` is used.
        with_fk: When true, foreign-key constraints from orders to customers
            and from order_items to orders/products are appended to the DDL.
    """
    if with_fk:
        foreign_keys = """
ALTER TABLE sales.orders
ADD CONSTRAINT orders_customer_fk
FOREIGN KEY (customer_id) REFERENCES sales.customers(customer_id);

ALTER TABLE sales.order_items
ADD CONSTRAINT items_order_fk
FOREIGN KEY (order_id) REFERENCES sales.orders(order_id);

ALTER TABLE sales.order_items
ADD CONSTRAINT items_product_fk
FOREIGN KEY (product_id) REFERENCES sales.products(product_id);
"""
    else:
        foreign_keys = ""

    table_keyword = "CREATE TABLE" if logged else "CREATE UNLOGGED TABLE"
    script = DDL_TEMPLATE.format(create_table=table_keyword, fk=foreign_keys)

    # The whole script is sent in a single execute() call, then committed.
    conn.execute(script)
    conn.commit()
181
+
182
+
183
def create_indexes(conn) -> None:
    """Create the secondary indexes, refresh planner statistics, and commit.

    Args:
        conn: Open database connection offering ``execute()`` and ``commit()``.
    """
    index_statements = (
        "CREATE INDEX IF NOT EXISTS orders_ordered_at_brin ON sales.orders USING brin (ordered_at);",
        "CREATE INDEX IF NOT EXISTS orders_customer_id_idx ON sales.orders (customer_id);",
        "CREATE INDEX IF NOT EXISTS items_order_id_idx ON sales.order_items (order_id);",
        "CREATE INDEX IF NOT EXISTS items_product_id_idx ON sales.order_items (product_id);",
    )
    for statement in index_statements:
        conn.execute(statement)

    # ANALYZE every table of the schema, masters first, in a fixed order.
    for table in ("customers", "products", "orders", "order_items"):
        conn.execute(f"ANALYZE sales.{table};")

    conn.commit()
201
+
202
+
203
+ # -----------------------------
204
+ # Generation utilities (NumPy)
205
+ # -----------------------------
206
def parse_ymd(s: str) -> datetime:
    """Parse a ``YYYY-MM-DD`` string into a midnight UTC ``datetime``.

    Args:
        s: Date text made of three dash-separated integers (year, month, day).

    Returns:
        Timezone-aware ``datetime`` at 00:00:00 UTC on that date.

    Raises:
        ValueError: If the text does not have exactly three integer parts or
            does not name a valid calendar date. The message names the
            offending input and the expected format.
    """
    try:
        year, month, day = (int(part) for part in s.split("-"))
        return datetime(year, month, day, tzinfo=timezone.utc)
    except ValueError as err:
        # Wrong part count, non-numeric parts and out-of-range dates all
        # surface as ValueError; re-raise with a message the user can act on.
        raise ValueError(f"invalid date {s!r}: expected YYYY-MM-DD") from err
209
+
210
+
211
def us_since_2000_from_unix_seconds(unix_seconds: np.ndarray) -> np.ndarray:
    """Convert Unix-epoch seconds to microseconds counted from 2000-01-01 UTC.

    Args:
        unix_seconds: Array of seconds since 1970-01-01 UTC.

    Returns:
        ``int64`` array of microseconds since 2000-01-01 UTC.
    """
    epoch_gap = EPOCH_2000 - UNIX_EPOCH
    seconds_at_2000 = int(epoch_gap.total_seconds())
    seconds_since_2000 = unix_seconds - seconds_at_2000
    return seconds_since_2000.astype(np.int64) * 1_000_000
214
+
215
+
216
def fixed_note(length: int, tag: str) -> str:
    """Return ``tag`` repeated and cut to exactly ``length`` characters.

    A non-positive ``length`` or an empty ``tag`` yields the empty string.
    """
    if length <= 0 or not tag:
        return ""
    # One repetition more than the integer quotient always covers `length`.
    repeats = length // len(tag) + 1
    padded = tag * repeats
    return padded[:length]
220
+
221
+
222
@dataclass(frozen=True)
class BatchConfig:
    """Immutable generation parameters shared by every worker batch."""

    # Mean of the Poisson draw for items per order (floored at 0.1 when used).
    avg_items: float
    # Upper clip for the number of items in one order.
    max_items: int
    # Lengths passed to fixed_note() for the per-worker order / item note text.
    order_note_len: int
    item_note_len: int
    # Order timestamps are start_unix plus a random offset in [0, span_seconds).
    start_unix: int
    span_seconds: int
    # customer_id / product_id values are drawn from 1..cust_max and 1..prod_max.
    cust_max: int
    prod_max: int
    # Inclusive bounds for the random unit price of an item.
    unit_price_min: int
    unit_price_max: int
    # Orders whose subtotal is >= shipping_threshold get shipping 0,
    # all others are charged shipping_fee.
    shipping_threshold: int
    shipping_fee: int
    # Tax is subtotal * tax_rate, truncated to an integer.
    tax_rate: float
237
+
238
+
239
def generate_batch_numpy(
    rng: np.random.Generator,
    start_order_id: int,
    batch_orders: int,
    cfg: BatchConfig,
):
    """
    Generate one batch of synthetic orders and their line items as NumPy arrays.

    Args:
        rng: Random generator; all randomness of the batch is drawn from it.
        start_order_id: order_id of the first order in the batch.
        batch_orders: Number of orders to generate (0 yields empty arrays).
        cfg: Generation parameters (see BatchConfig).

    Returns:
        orders: dict of per-order arrays with keys "order_id",
            "ordered_us2000", "customer_id", "channel_idx", "status_idx",
            "subtotal", "tax", "shipping", "total".
        items: dict of per-item arrays with keys "order_id", "product_id",
            "qty", "unit", "line_total".
        next_order_id: int, ``start_order_id + batch_orders``.
    """
    n = batch_orders
    order_ids = np.arange(start_order_id, start_order_id + n, dtype=np.int64)

    # Random timestamps inside [start_unix, start_unix + span_seconds).
    offsets = rng.integers(0, cfg.span_seconds, size=n, dtype=np.int64)
    ordered_unix = (cfg.start_unix + offsets).astype(np.int64)
    ordered_us2000 = us_since_2000_from_unix_seconds(ordered_unix)

    customer_ids = rng.integers(1, cfg.cust_max + 1, size=n, dtype=np.int64)

    # store indices only (uint8); they are mapped to text when written
    channel_idx = rng.integers(0, 3, size=n, dtype=np.uint8)  # 0..2
    status_idx = rng.integers(0, 3, size=n, dtype=np.uint8)  # 0..2

    # Items per order: Poisson draw clipped to [1, max_items]. The upper bound
    # is floored at 1 because np.clip with max < min would produce counts
    # below 1 (orders without items, or a negative repeat count).
    lam = max(0.1, float(cfg.avg_items))
    k = rng.poisson(lam=lam, size=n).astype(np.int64)
    k = np.clip(k, 1, max(1, cfg.max_items))
    total_items = int(k.sum())

    item_order_ids = np.repeat(order_ids, k).astype(np.int64)
    product_ids = rng.integers(1, cfg.prod_max + 1, size=total_items, dtype=np.int64)
    qty = rng.integers(1, 6, size=total_items, dtype=np.int32)
    unit = rng.integers(
        cfg.unit_price_min, cfg.unit_price_max + 1, size=total_items, dtype=np.int32
    )
    line_total = (unit.astype(np.int64) * qty.astype(np.int64)).astype(np.int64)

    if total_items:
        # Each order's items are contiguous; sum every run starting at its
        # first item index.
        boundaries = np.concatenate(([0], np.cumsum(k)[:-1]))
        subtotal = np.add.reduceat(line_total, boundaries).astype(np.int64)
    else:
        # reduceat rejects index 0 on an empty array, so an empty batch is
        # handled explicitly.
        subtotal = np.zeros(n, dtype=np.int64)

    tax = (subtotal.astype(np.float64) * cfg.tax_rate).astype(np.int64)
    shipping = np.where(subtotal >= cfg.shipping_threshold, 0, cfg.shipping_fee).astype(
        np.int64
    )
    total = (subtotal + tax + shipping).astype(np.int64)

    orders = {
        "order_id": order_ids,
        "ordered_us2000": ordered_us2000,
        "customer_id": customer_ids,
        "channel_idx": channel_idx,
        "status_idx": status_idx,
        "subtotal": subtotal,
        "tax": tax,
        "shipping": shipping,
        "total": total,
    }
    items = {
        "order_id": item_order_ids,
        "product_id": product_ids,
        "qty": qty,
        "unit": unit,
        "line_total": line_total,
    }

    return orders, items, int(start_order_id + n)
306
+
307
+
308
+ # -----------------------------
309
+ # COPY BINARY writers (FAST)
310
+ # -----------------------------
311
def copy_orders_binary_fast(
    cur,
    orders,
    note_bytes: bytes,
    buffer_mb: int = 16,
):
    """
    Fast COPY BINARY writer for sales.orders.

    Rows are serialized into one reusable bytearray and handed to the COPY
    stream in large chunks (flushed by byte size, not by row count).

    Args:
        cur: Cursor whose ``copy()`` context manager accepts raw ``write()``.
        orders: Mapping of equally long arrays with keys "order_id",
            "ordered_us2000", "customer_id", "channel_idx", "status_idx",
            "subtotal", "tax", "shipping", "total".
        note_bytes: Already-encoded note text stored on every row.
        buffer_mb: Staging buffer size in MiB (values below 1 are treated as
            1). The buffer is enlarged when a single row would not fit.

    Raises:
        ValueError: If the column arrays differ in length.
        struct.error: If an amount does not fit the table's int4 columns.
    """
    channel_vals = (b"web", b"store", b"marketplace")
    status_vals = (b"paid", b"shipped", b"canceled")

    # A text field is a 4-byte length followed by the bytes; prebuild them once.
    channel_field = [S_I32.pack(len(v)) + v for v in channel_vals]
    status_field = [S_I32.pack(len(v)) + v for v in status_vals]
    note_field = S_I32.pack(len(note_bytes)) + note_bytes

    ncols = 10
    # Column count (int16) + three int8 columns, each preceded by its length (8).
    head = struct.Struct("!hiqiqiq")
    # Four int4 columns, each preceded by its length (4).
    money = struct.Struct("!iiiiiiii")

    max_row = (
        head.size
        + max(len(f) for f in channel_field)
        + max(len(f) for f in status_field)
        + money.size
        + len(note_field)
    )
    # The buffer must hold at least one full row, otherwise pack_into would
    # run past its end when the note is very long.
    buffer_bytes = max(max(1, buffer_mb) * 1024 * 1024, max_row + 1)

    # Convert to plain Python ints once; per-element NumPy indexing inside the
    # loop is far slower than iterating lists.
    columns = (
        orders["order_id"],
        orders["ordered_us2000"],
        orders["customer_id"],
        orders["channel_idx"],
        orders["status_idx"],
        orders["subtotal"],
        orders["tax"],
        orders["shipping"],
        orders["total"],
    )
    rows = zip(*(np.asarray(col).tolist() for col in columns), strict=True)

    with cur.copy(
        "COPY sales.orders("
        "order_id, ordered_at, customer_id, channel, status, "
        "subtotal_cents, tax_cents, shipping_cents, total_cents, note"
        ") FROM STDIN WITH (FORMAT BINARY)"
    ) as cp:
        # File header: signature, flags field (0), header-extension length (0).
        cp.write(PGCOPY_SIGNATURE + S_I32.pack(0) + S_I32.pack(0))

        buf = bytearray(buffer_bytes)
        pos = 0

        for oid, ous, cid, chx, stx, sub, tax, shp, tot in rows:
            if pos + max_row >= buffer_bytes:
                cp.write(memoryview(buf)[:pos])
                pos = 0

            # order_id, ordered_at (timestamptz as int8), customer_id
            head.pack_into(buf, pos, ncols, 8, oid, 8, ous, 8, cid)
            pos += head.size

            # channel and status text (prebuilt)
            for field in (channel_field[chx], status_field[stx]):
                end = pos + len(field)
                buf[pos:end] = field
                pos = end

            # subtotal, tax, shipping, total
            money.pack_into(buf, pos, 4, sub, 4, tax, 4, shp, 4, tot)
            pos += money.size

            # note text (prebuilt)
            end = pos + len(note_field)
            buf[pos:end] = note_field
            pos = end

        if pos:
            cp.write(memoryview(buf)[:pos])

        # File trailer: a 16-bit word containing -1.
        cp.write(S_I16.pack(-1))
428
+
429
+
430
def copy_items_binary_fast(
    cur,
    items,
    note_bytes: bytes,
    buffer_mb: int = 32,
):
    """
    Fast COPY BINARY writer for sales.order_items.

    Rows are serialized into one reusable bytearray and handed to the COPY
    stream in large chunks (flushed by byte size, not by row count).

    Args:
        cur: Cursor whose ``copy()`` context manager accepts raw ``write()``.
        items: Mapping of equally long arrays with keys "order_id",
            "product_id", "qty", "unit", "line_total".
        note_bytes: Already-encoded note text stored on every row.
        buffer_mb: Staging buffer size in MiB (values below 1 are treated as
            1). The buffer is enlarged when a single row would not fit.

    Raises:
        ValueError: If the column arrays differ in length.
        struct.error: If a value does not fit the table's int4 columns.
    """
    # A text field is a 4-byte length followed by the bytes; prebuild it once.
    note_field = S_I32.pack(len(note_bytes)) + note_bytes

    ncols = 6
    # Column count (int16), two int8 columns and three int4 columns, each
    # value preceded by its 4-byte length word.
    fixed = struct.Struct("!hiqiqiiiiii")

    max_row = fixed.size + len(note_field)
    # The buffer must hold at least one full row, otherwise pack_into would
    # run past its end when the note is very long.
    buffer_bytes = max(max(1, buffer_mb) * 1024 * 1024, max_row + 1)

    # Convert to plain Python ints once; per-element NumPy indexing inside the
    # loop is far slower than iterating lists.
    columns = (
        items["order_id"],
        items["product_id"],
        items["qty"],
        items["unit"],
        items["line_total"],
    )
    rows = zip(*(np.asarray(col).tolist() for col in columns), strict=True)

    with cur.copy(
        "COPY sales.order_items("
        "order_id, product_id, qty, unit_price_cents, line_total_cents, note"
        ") FROM STDIN WITH (FORMAT BINARY)"
    ) as cp:
        # File header: signature, flags field (0), header-extension length (0).
        cp.write(PGCOPY_SIGNATURE + S_I32.pack(0) + S_I32.pack(0))

        buf = bytearray(buffer_bytes)
        pos = 0

        for oid, pid, qty, unit, line in rows:
            if pos + max_row >= buffer_bytes:
                cp.write(memoryview(buf)[:pos])
                pos = 0

            # order_id, product_id (int8); qty, unit_price, line_total (int4)
            fixed.pack_into(buf, pos, ncols, 8, oid, 8, pid, 4, qty, 4, unit, 4, line)
            pos += fixed.size

            # note text (prebuilt)
            end = pos + len(note_field)
            buf[pos:end] = note_field
            pos = end

        if pos:
            cp.write(memoryview(buf)[:pos])

        # File trailer: a 16-bit word containing -1.
        cp.write(S_I16.pack(-1))
507
+
508
+
509
+ # -----------------------------
510
+ # Master fillers
511
+ # -----------------------------
512
def fill_masters(conn, customers: int, products: int) -> None:
    """Populate the master tables server-side with generate_series, then commit.

    Args:
        conn: Open database connection offering ``execute()`` and ``commit()``.
        customers: Number of rows to insert into sales.customers.
        products: Number of rows to insert into sales.products.
    """
    # floor(random()*N) picks each of the N array slots with equal probability.
    # Casting random()*(N-1) to int rounds to nearest, which gives the first
    # and last slot only half the weight of the others.
    conn.execute(
        """
INSERT INTO sales.customers(name, email, region)
SELECT
'Customer-' || gs::text,
'user' || gs::text || '@example.com',
(ARRAY['JP','US','EU','APAC','LATAM'])[1 + floor(random()*5)::int]
FROM generate_series(1, %s) gs;
""",
        (customers,),
    )

    conn.execute(
        """
INSERT INTO sales.products(sku, name, category, price_cents)
SELECT
'SKU-' || gs::text,
'Product-' || gs::text,
(ARRAY['food','apparel','home','electronics','book','beauty'])[1 + floor(random()*6)::int],
(500 + (random()*20000)::int)
FROM generate_series(1, %s) gs;
""",
        (products,),
    )
    conn.commit()
538
+
539
+
540
+ # -----------------------------
541
+ # Monitoring
542
+ # -----------------------------
543
def current_total_gb(conn) -> float:
    """Return the combined on-disk size of sales.orders and sales.order_items in GiB.

    The size is the sum of ``pg_total_relation_size`` for both tables,
    divided by 1024**3.
    """
    size_query = (
        "SELECT pg_total_relation_size('sales.orders'::regclass) + "
        "pg_total_relation_size('sales.order_items'::regclass)"
    )
    row = conn.execute(size_query).fetchone()
    total_bytes = row[0]
    return float(total_bytes) / (1024.0**3)
549
+
550
+
551
+ # -----------------------------
552
+ # Worker
553
+ # -----------------------------
554
def worker_proc(
    worker_id: int,
    dsn: str,
    q: Queue,
    stop_evt: Event,
    cfg: BatchConfig,
    base_seed: int,
    order_id_stride: int,
    batch_orders: int,
    copy_orders_buf_mb: int,
    copy_items_buf_mb: int,
):
    """Worker loop: load one generated batch per queue message until stopped.

    Each message taken from ``q`` triggers one batch of ``batch_orders``
    orders (plus items), written with COPY BINARY and committed. The loop
    ends when ``None`` is received or ``stop_evt`` is set after a message
    arrives. Order ids start at ``1 + worker_id * order_id_stride``.

    Errors are reported on stderr, the transaction is rolled back on a
    best-effort basis, and the exception is re-raised. The connection is
    always closed.
    """
    rng = np.random.default_rng(base_seed + worker_id)
    conn = psycopg.connect(dsn)

    # note bytes: built and encoded once per worker, reused for every row
    worker_tag = f"w{worker_id:02d}"
    ord_note_bytes = fixed_note(cfg.order_note_len, f"{worker_tag}-ORDER-").encode(
        "utf-8"
    )
    itm_note_bytes = fixed_note(cfg.item_note_len, f"{worker_tag}-ITEM-").encode(
        "utf-8"
    )

    session_settings = (
        "SET synchronous_commit=off",
        "SET client_min_messages=warning",
        "SET work_mem='256MB'",
    )

    try:
        with conn.cursor() as cur:
            for statement in session_settings:
                cur.execute(statement)

        next_order_id = 1 + worker_id * order_id_stride

        while True:
            msg = q.get()  # blocks until the coordinator sends something
            # None is the stop sentinel; the event is only checked after a
            # real message has arrived.
            if msg is None or stop_evt.is_set():
                break

            orders, items, next_order_id = generate_batch_numpy(
                rng=rng,
                start_order_id=next_order_id,
                batch_orders=batch_orders,
                cfg=cfg,
            )

            with conn.cursor() as cur:
                copy_orders_binary_fast(
                    cur, orders, note_bytes=ord_note_bytes, buffer_mb=copy_orders_buf_mb
                )
                copy_items_binary_fast(
                    cur, items, note_bytes=itm_note_bytes, buffer_mb=copy_items_buf_mb
                )

            conn.commit()

    except Exception as exc:
        print(f"[worker {worker_id}] ERROR: {exc}", file=sys.stderr)
        try:
            conn.rollback()
        except Exception:
            # Best effort only: the original error is what gets re-raised.
            pass
        raise
    finally:
        conn.close()
618
+
619
+
620
+ # -----------------------------
621
+ # Main
622
+ # -----------------------------
623
def _put_unless_workers_gone(q, item, procs, poll_sec: float = 1.0) -> bool:
    """Put *item* on *q*, giving up once the queue is full and no worker is alive.

    A plain blocking ``q.put`` never returns when the bounded queue is full
    and every consumer has already exited (for example after all workers
    crashed), so the put is retried with a timeout and worker liveness is
    checked between attempts.

    Returns:
        True if the item was enqueued, False if it was abandoned because
        nobody is left to read it.
    """
    # Local import: a timed-out multiprocessing.Queue.put raises queue.Full.
    from queue import Full

    while True:
        try:
            q.put(item, timeout=poll_sec)
            return True
        except Full:
            if not any(p.is_alive() for p in procs):
                return False


def _shutdown_workers(q, stop_evt, procs, join_timeout_sec: float) -> int:
    """Signal the workers to stop, join them, and return how many failed.

    Args:
        q: Work queue shared with the workers.
        stop_evt: Event the workers check after each dequeued message.
        procs: Worker processes that were started.
        join_timeout_sec: If > 0, per-worker join timeout; otherwise wait
            indefinitely.

    Returns:
        Number of workers that exited with a non-zero exit code.
    """
    stop_evt.set()

    # One sentinel per worker so that workers blocked in q.get() wake up.
    # Stop early if the queue is full and nobody is left to drain it.
    for _ in procs:
        if not _put_unless_workers_gone(q, None, procs):
            break

    # Clean up queue feeder threads (best effort, but do not hide problems).
    try:
        q.close()
        q.join_thread()
    except Exception as e:
        print(f"[warn] queue cleanup failed: {e}", file=sys.stderr)

    timeout = join_timeout_sec if join_timeout_sec and join_timeout_sec > 0 else None
    failed = 0
    for p in procs:
        p.join(timeout=timeout)
        if p.exitcode is None:
            # Only reachable when a join timeout was given and it expired.
            print("[warn] worker still running after join timeout", file=sys.stderr)
        elif p.exitcode != 0:
            failed += 1
            print(f"[warn] worker exited with code {p.exitcode}", file=sys.stderr)
    return failed


def main() -> int:
    """Command-line entry point: set up the schema and load generated sales data.

    Parses the command line, creates the schema/tables and master data over a
    coordinator connection, starts the worker processes and keeps feeding them
    batch requests until the total size reported by ``current_total_gb``
    reaches ``--target-gb``. Optionally creates indexes afterwards.

    Returns:
        0 on success, 1 if any worker failed (or all workers exited before the
        target was reached), 2 for invalid arguments.
    """
    # argparse default -h conflicts with psql's -h(host).
    ap = argparse.ArgumentParser(
        description="Empty DB -> create schema/tables -> fill masters -> generate sales-like data FAST.",
        add_help=False,
    )
    ap.add_argument(
        "--help", "-?", action="help", help="show this help message and exit"
    )

    # Connection (psql-compatible)
    ap.add_argument(
        "--dsn",
        default=os.environ.get("PG_DSN"),
        help="libpq DSN. Overrides -h/-p/-U/-d.",
    )
    ap.add_argument(
        "-h",
        "--host",
        default=None,
        help="database server host or socket directory (psql compatible).",
    )
    ap.add_argument(
        "-p",
        "--port",
        type=int,
        default=None,
        help="database server port (psql compatible).",
    )
    ap.add_argument(
        "-U", "--user", default=None, help="database user name (psql compatible)."
    )
    ap.add_argument(
        "-d", "--dbname", default=None, help="database name (psql compatible)."
    )
    ap.add_argument(
        "--password",
        default=None,
        help="database password (or use PGPASSWORD env / .pgpass).",
    )
    ap.add_argument(
        "--sslmode", default=None, help="sslmode (require, verify-full, etc.)."
    )
    ap.add_argument(
        "--options",
        default=None,
        help='libpq options string (e.g., "-c statement_timeout=0").',
    )
    ap.add_argument(
        "--print-psql",
        action="store_true",
        help="Print equivalent psql command and exit.",
    )

    # DDL options
    ap.add_argument(
        "--logged",
        action="store_true",
        help="Create LOGGED tables (default UNLOGGED for speed).",
    )
    ap.add_argument(
        "--with-fk", action="store_true", help="Create foreign keys (slower)."
    )
    ap.add_argument(
        "--create-indexes",
        action="store_true",
        help="Create typical indexes + ANALYZE after load.",
    )

    # Masters
    ap.add_argument(
        "--customers",
        type=int,
        default=2_000_000,
        help="Number of customers to generate.",
    )
    ap.add_argument(
        "--products", type=int, default=200_000, help="Number of products to generate."
    )

    # Generation
    ap.add_argument(
        "--target-gb",
        type=float,
        default=100.0,
        help="Target size (orders+items) in GB.",
    )
    ap.add_argument(
        "--workers",
        type=int,
        default=max(1, os.cpu_count() or 1),
        help="Number of worker processes.",
    )
    ap.add_argument(
        "--batch-orders", type=int, default=200_000, help="Orders per batch per worker."
    )
    ap.add_argument(
        "--avg-items",
        type=float,
        default=3.2,
        help="Average number of items per order.",
    )
    ap.add_argument("--max-items", type=int, default=12, help="Max items per order.")
    ap.add_argument("--order-note-len", type=int, default=80, help="Order note length.")
    ap.add_argument("--item-note-len", type=int, default=120, help="Item note length.")
    ap.add_argument(
        "--start-date", default="2022-01-01", help="Start date YYYY-MM-DD (UTC)."
    )
    ap.add_argument(
        "--end-date", default="2026-01-01", help="End date YYYY-MM-DD (UTC)."
    )
    ap.add_argument(
        "--unit-price-min", type=int, default=500, help="Min unit price (cents)."
    )
    ap.add_argument(
        "--unit-price-max", type=int, default=20_500, help="Max unit price (cents)."
    )
    ap.add_argument(
        "--tax-rate", type=float, default=0.10, help="Tax rate (e.g., 0.10)."
    )
    ap.add_argument(
        "--shipping-threshold",
        type=int,
        default=5000,
        help="Free shipping threshold (cents).",
    )
    ap.add_argument(
        "--shipping-fee",
        type=int,
        default=500,
        help="Shipping fee under threshold (cents).",
    )
    ap.add_argument(
        "--progress-interval",
        type=float,
        default=2.0,
        help="Seconds between progress prints.",
    )
    ap.add_argument("--seed", type=int, default=12345, help="Base RNG seed.")
    ap.add_argument(
        "--order-id-stride",
        type=int,
        default=10_000_000_000,
        help="Per-worker order_id stride (must exceed total orders per worker).",
    )

    # COPY buffer sizes (MB)
    ap.add_argument(
        "--copy-orders-buf-mb",
        type=int,
        default=16,
        help="COPY buffer for orders (MB).",
    )
    ap.add_argument(
        "--copy-items-buf-mb", type=int, default=32, help="COPY buffer for items (MB)."
    )

    # shutdown behavior
    ap.add_argument(
        "--join-timeout-sec",
        type=float,
        default=0.0,
        help="If >0, timeout seconds for joining each worker. 0 means wait indefinitely.",
    )

    args = ap.parse_args()

    # Fail fast on values that would otherwise make the feed loop spin
    # forever: with no workers nothing consumes the queue, and with empty
    # batches the target size can never be reached.
    if args.workers < 1:
        print("workers must be >= 1", file=sys.stderr)
        return 2
    if args.batch_orders < 1:
        print("batch-orders must be >= 1", file=sys.stderr)
        return 2

    dsn = build_libpq_dsn(args)

    if args.print_psql:
        print(psql_equivalent_cmd(args))
        return 0

    start_dt = parse_ymd(args.start_date)
    end_dt = parse_ymd(args.end_date)
    span = int((end_dt - start_dt).total_seconds())
    if span <= 0:
        print("end-date must be after start-date", file=sys.stderr)
        return 2

    # Work queue (bounded so the coordinator cannot run far ahead of workers)
    q: Queue = Queue(maxsize=args.workers * 4)
    stop_evt = Event()
    procs: list[Process] = []
    target_reached = False
    failed_workers = 0

    # Coordinator connection
    coord = psycopg.connect(dsn)
    try:
        coord.execute("SET client_min_messages=warning")
        coord.execute("SET synchronous_commit=off")

        print("[setup] creating schema/tables...")
        create_schema_and_tables(coord, logged=args.logged, with_fk=args.with_fk)

        print(
            f"[setup] inserting masters: customers={args.customers:,} products={args.products:,} ..."
        )
        fill_masters(coord, customers=args.customers, products=args.products)

        cfg = BatchConfig(
            avg_items=args.avg_items,
            max_items=args.max_items,
            order_note_len=args.order_note_len,
            item_note_len=args.item_note_len,
            start_unix=int(start_dt.timestamp()),
            span_seconds=span,
            cust_max=args.customers,
            prod_max=args.products,
            unit_price_min=args.unit_price_min,
            unit_price_max=args.unit_price_max,
            shipping_threshold=args.shipping_threshold,
            shipping_fee=args.shipping_fee,
            tax_rate=args.tax_rate,
        )

        # Start workers (NOT daemon: allow clean join). Done inside the try
        # so that a failure while starting a later worker still shuts down
        # the ones that are already running.
        for wid in range(args.workers):
            p = Process(
                target=worker_proc,
                args=(
                    wid,
                    dsn,
                    q,
                    stop_evt,
                    cfg,
                    args.seed,
                    args.order_id_stride,
                    args.batch_orders,
                    args.copy_orders_buf_mb,
                    args.copy_items_buf_mb,
                ),
                daemon=False,
            )
            p.start()
            procs.append(p)

        # -inf guarantees the first progress line is printed immediately.
        last_print = float("-inf")

        while True:
            gb = current_total_gb(coord)
            now = time.monotonic()
            if now - last_print >= args.progress_interval:
                print(f"[progress] {gb:.2f} GB / {args.target_gb:.2f} GB")
                last_print = now

            if gb >= args.target_gb:
                target_reached = True
                break

            # enqueue one batch per worker; bail out instead of blocking
            # forever if the queue is full and every worker has exited
            if not all(
                _put_unless_workers_gone(q, 1, procs) for _ in range(args.workers)
            ):
                print(
                    "[error] all workers exited before the target was reached",
                    file=sys.stderr,
                )
                break

        if target_reached:
            print("[done] target reached; stopping workers...")

    finally:
        try:
            failed_workers = _shutdown_workers(
                q, stop_evt, procs, args.join_timeout_sec
            )
        finally:
            coord.close()

    if not target_reached:
        # Only reachable when the workers died early; nothing left to index.
        return 1

    # Optional indexes after load
    if args.create_indexes:
        # need a new connection because coord is closed above
        coord2 = psycopg.connect(dsn)
        try:
            print("[post] creating indexes + analyze...")
            create_indexes(coord2)
        finally:
            coord2.close()

    return 1 if failed_workers else 0
915
+
916
+
917
if __name__ == "__main__":
    # Script entry: run the CLI and use main()'s return value as the
    # process exit status.
    exit_status = main()
    raise SystemExit(exit_status)