pgsalesgen 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pgsalesgen-0.1.0/.gitignore +10 -0
- pgsalesgen-0.1.0/LICENSE +21 -0
- pgsalesgen-0.1.0/PKG-INFO +157 -0
- pgsalesgen-0.1.0/README.md +123 -0
- pgsalesgen-0.1.0/pyproject.toml +41 -0
- pgsalesgen-0.1.0/src/pgsalesgen/__init__.py +0 -0
- pgsalesgen-0.1.0/src/pgsalesgen/main.py +918 -0
pgsalesgen-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Rio Fujita
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pgsalesgen
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Generate sales data for PostgreSQL
|
|
5
|
+
Project-URL: Homepage, https://github.com/rioriost/homebrew-pgsalesgen
|
|
6
|
+
Project-URL: Issues, https://github.com/rioriost/homebrew-pgsalesgen/issues
|
|
7
|
+
Author-email: Rio Fujita <rifujita@microsoft.com>
|
|
8
|
+
License: MIT License
|
|
9
|
+
|
|
10
|
+
Copyright (c) 2026 Rio Fujita
|
|
11
|
+
|
|
12
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
13
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
14
|
+
in the Software without restriction, including without limitation the rights
|
|
15
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
16
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
17
|
+
furnished to do so, subject to the following conditions:
|
|
18
|
+
|
|
19
|
+
The above copyright notice and this permission notice shall be included in all
|
|
20
|
+
copies or substantial portions of the Software.
|
|
21
|
+
|
|
22
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
23
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
24
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
25
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
26
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
27
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
28
|
+
SOFTWARE.
|
|
29
|
+
License-File: LICENSE
|
|
30
|
+
Requires-Python: >=3.14
|
|
31
|
+
Requires-Dist: numpy>=2.4.2
|
|
32
|
+
Requires-Dist: psycopg>=3.3.2
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
|
|
35
|
+
# PostgreSQL Sales Generator
|
|
36
|
+
|
|
37
|
+

|
|
38
|
+

|
|
39
|
+
|
|
40
|
+
PostgreSQL Sales Generator
|
|
41
|
+
|
|
42
|
+
## Table of Contents
|
|
43
|
+
|
|
44
|
+
- [Prerequisites](#prerequisites)
|
|
45
|
+
- [Install](#install)
|
|
46
|
+
- [Usage](#usage)
|
|
49
|
+
- [Release Notes](#release-notes)
|
|
50
|
+
- [For More Information](#for-more-information)
|
|
51
|
+
- [License](#license)
|
|
52
|
+
|
|
53
|
+
## Prerequisites
|
|
54
|
+
|
|
55
|
+
- Python 3.14 and above
|
|
56
|
+
- This module runs on [psycopg](https://www.psycopg.org/)
|
|
57
|
+
|
|
58
|
+
## Install
|
|
59
|
+
|
|
60
|
+
- with brew
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
brew tap rioriost/pg-salesgen
|
|
64
|
+
brew install pg-salesgen
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Usage
|
|
68
|
+
|
|
69
|
+
```
|
|
70
|
+
[pg-salesgen --help](usage: pg-salesgen [--help] [--dsn DSN] [-h HOST] [-p PORT] [-U USER]
|
|
71
|
+
[-d DBNAME] [--password PASSWORD] [--sslmode SSLMODE]
|
|
72
|
+
[--options OPTIONS] [--print-psql] [--logged] [--with-fk]
|
|
73
|
+
[--create-indexes] [--customers CUSTOMERS]
|
|
74
|
+
[--products PRODUCTS] [--target-gb TARGET_GB]
|
|
75
|
+
[--workers WORKERS] [--batch-orders BATCH_ORDERS]
|
|
76
|
+
[--avg-items AVG_ITEMS] [--max-items MAX_ITEMS]
|
|
77
|
+
[--order-note-len ORDER_NOTE_LEN]
|
|
78
|
+
[--item-note-len ITEM_NOTE_LEN] [--start-date START_DATE]
|
|
79
|
+
[--end-date END_DATE] [--unit-price-min UNIT_PRICE_MIN]
|
|
80
|
+
[--unit-price-max UNIT_PRICE_MAX] [--tax-rate TAX_RATE]
|
|
81
|
+
[--shipping-threshold SHIPPING_THRESHOLD]
|
|
82
|
+
[--shipping-fee SHIPPING_FEE]
|
|
83
|
+
[--progress-interval PROGRESS_INTERVAL] [--seed SEED]
|
|
84
|
+
[--order-id-stride ORDER_ID_STRIDE]
|
|
85
|
+
[--copy-orders-buf-mb COPY_ORDERS_BUF_MB]
|
|
86
|
+
[--copy-items-buf-mb COPY_ITEMS_BUF_MB]
|
|
87
|
+
[--join-timeout-sec JOIN_TIMEOUT_SEC]
|
|
88
|
+
|
|
89
|
+
Empty DB -> create schema/tables -> fill masters -> generate sales-like data FAST.
|
|
90
|
+
|
|
91
|
+
options:
|
|
92
|
+
--help, -? show this help message and exit
|
|
93
|
+
--dsn DSN libpq DSN. Overrides -h/-p/-U/-d.
|
|
94
|
+
-h, --host HOST database server host or socket directory (psql
|
|
95
|
+
compatible).
|
|
96
|
+
-p, --port PORT database server port (psql compatible).
|
|
97
|
+
-U, --user USER database user name (psql compatible).
|
|
98
|
+
-d, --dbname DBNAME database name (psql compatible).
|
|
99
|
+
--password PASSWORD database password (or use PGPASSWORD env / .pgpass).
|
|
100
|
+
--sslmode SSLMODE sslmode (require, verify-full, etc.).
|
|
101
|
+
--options OPTIONS libpq options string (e.g., "-c statement_timeout=0").
|
|
102
|
+
--print-psql Print equivalent psql command and exit.
|
|
103
|
+
--logged Create LOGGED tables (default UNLOGGED for speed).
|
|
104
|
+
--with-fk Create foreign keys (slower).
|
|
105
|
+
--create-indexes Create typical indexes + ANALYZE after load.
|
|
106
|
+
--customers CUSTOMERS
|
|
107
|
+
Number of customers to generate.
|
|
108
|
+
--products PRODUCTS Number of products to generate.
|
|
109
|
+
--target-gb TARGET_GB
|
|
110
|
+
Target size (orders+items) in GB.
|
|
111
|
+
--workers WORKERS Number of worker processes.
|
|
112
|
+
--batch-orders BATCH_ORDERS
|
|
113
|
+
Orders per batch per worker.
|
|
114
|
+
--avg-items AVG_ITEMS
|
|
115
|
+
Average number of items per order.
|
|
116
|
+
--max-items MAX_ITEMS
|
|
117
|
+
Max items per order.
|
|
118
|
+
--order-note-len ORDER_NOTE_LEN
|
|
119
|
+
Order note length.
|
|
120
|
+
--item-note-len ITEM_NOTE_LEN
|
|
121
|
+
Item note length.
|
|
122
|
+
--start-date START_DATE
|
|
123
|
+
Start date YYYY-MM-DD (UTC).
|
|
124
|
+
--end-date END_DATE End date YYYY-MM-DD (UTC).
|
|
125
|
+
--unit-price-min UNIT_PRICE_MIN
|
|
126
|
+
Min unit price (cents).
|
|
127
|
+
--unit-price-max UNIT_PRICE_MAX
|
|
128
|
+
Max unit price (cents).
|
|
129
|
+
--tax-rate TAX_RATE Tax rate (e.g., 0.10).
|
|
130
|
+
--shipping-threshold SHIPPING_THRESHOLD
|
|
131
|
+
Free shipping threshold (cents).
|
|
132
|
+
--shipping-fee SHIPPING_FEE
|
|
133
|
+
Shipping fee under threshold (cents).
|
|
134
|
+
--progress-interval PROGRESS_INTERVAL
|
|
135
|
+
Seconds between progress prints.
|
|
136
|
+
--seed SEED Base RNG seed.
|
|
137
|
+
--order-id-stride ORDER_ID_STRIDE
|
|
138
|
+
Per-worker order_id stride (must exceed total orders
|
|
139
|
+
per worker).
|
|
140
|
+
--copy-orders-buf-mb COPY_ORDERS_BUF_MB
|
|
141
|
+
COPY buffer for orders (MB).
|
|
142
|
+
--copy-items-buf-mb COPY_ITEMS_BUF_MB
|
|
143
|
+
COPY buffer for items (MB).
|
|
144
|
+
--join-timeout-sec JOIN_TIMEOUT_SEC
|
|
145
|
+
If >0, timeout seconds for joining each worker. 0
|
|
146
|
+
means wait indefinitely.
|
|
147
|
+
)
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
## Release Notes
|
|
151
|
+
|
|
152
|
+
### 0.1.0 Release
|
|
153
|
+
- Initial release
|
|
154
|
+
|
|
155
|
+
## License
|
|
156
|
+
|
|
157
|
+
MIT License
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# PostgreSQL Sales Generator
|
|
2
|
+
|
|
3
|
+

|
|
4
|
+

|
|
5
|
+
|
|
6
|
+
PostgreSQL Sales Generator
|
|
7
|
+
|
|
8
|
+
## Table of Contents
|
|
9
|
+
|
|
10
|
+
- [Prerequisites](#prerequisites)
|
|
11
|
+
- [Install](#install)
|
|
12
|
+
- [Usage](#usage)
|
|
15
|
+
- [Release Notes](#release-notes)
|
|
16
|
+
- [For More Information](#for-more-information)
|
|
17
|
+
- [License](#license)
|
|
18
|
+
|
|
19
|
+
## Prerequisites
|
|
20
|
+
|
|
21
|
+
- Python 3.14 and above
|
|
22
|
+
- This module runs on [psycopg](https://www.psycopg.org/)
|
|
23
|
+
|
|
24
|
+
## Install
|
|
25
|
+
|
|
26
|
+
- with brew
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
brew tap rioriost/pg-salesgen
|
|
30
|
+
brew install pg-salesgen
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Usage
|
|
34
|
+
|
|
35
|
+
```
|
|
36
|
+
[pg-salesgen --help](usage: pg-salesgen [--help] [--dsn DSN] [-h HOST] [-p PORT] [-U USER]
|
|
37
|
+
[-d DBNAME] [--password PASSWORD] [--sslmode SSLMODE]
|
|
38
|
+
[--options OPTIONS] [--print-psql] [--logged] [--with-fk]
|
|
39
|
+
[--create-indexes] [--customers CUSTOMERS]
|
|
40
|
+
[--products PRODUCTS] [--target-gb TARGET_GB]
|
|
41
|
+
[--workers WORKERS] [--batch-orders BATCH_ORDERS]
|
|
42
|
+
[--avg-items AVG_ITEMS] [--max-items MAX_ITEMS]
|
|
43
|
+
[--order-note-len ORDER_NOTE_LEN]
|
|
44
|
+
[--item-note-len ITEM_NOTE_LEN] [--start-date START_DATE]
|
|
45
|
+
[--end-date END_DATE] [--unit-price-min UNIT_PRICE_MIN]
|
|
46
|
+
[--unit-price-max UNIT_PRICE_MAX] [--tax-rate TAX_RATE]
|
|
47
|
+
[--shipping-threshold SHIPPING_THRESHOLD]
|
|
48
|
+
[--shipping-fee SHIPPING_FEE]
|
|
49
|
+
[--progress-interval PROGRESS_INTERVAL] [--seed SEED]
|
|
50
|
+
[--order-id-stride ORDER_ID_STRIDE]
|
|
51
|
+
[--copy-orders-buf-mb COPY_ORDERS_BUF_MB]
|
|
52
|
+
[--copy-items-buf-mb COPY_ITEMS_BUF_MB]
|
|
53
|
+
[--join-timeout-sec JOIN_TIMEOUT_SEC]
|
|
54
|
+
|
|
55
|
+
Empty DB -> create schema/tables -> fill masters -> generate sales-like data FAST.
|
|
56
|
+
|
|
57
|
+
options:
|
|
58
|
+
--help, -? show this help message and exit
|
|
59
|
+
--dsn DSN libpq DSN. Overrides -h/-p/-U/-d.
|
|
60
|
+
-h, --host HOST database server host or socket directory (psql
|
|
61
|
+
compatible).
|
|
62
|
+
-p, --port PORT database server port (psql compatible).
|
|
63
|
+
-U, --user USER database user name (psql compatible).
|
|
64
|
+
-d, --dbname DBNAME database name (psql compatible).
|
|
65
|
+
--password PASSWORD database password (or use PGPASSWORD env / .pgpass).
|
|
66
|
+
--sslmode SSLMODE sslmode (require, verify-full, etc.).
|
|
67
|
+
--options OPTIONS libpq options string (e.g., "-c statement_timeout=0").
|
|
68
|
+
--print-psql Print equivalent psql command and exit.
|
|
69
|
+
--logged Create LOGGED tables (default UNLOGGED for speed).
|
|
70
|
+
--with-fk Create foreign keys (slower).
|
|
71
|
+
--create-indexes Create typical indexes + ANALYZE after load.
|
|
72
|
+
--customers CUSTOMERS
|
|
73
|
+
Number of customers to generate.
|
|
74
|
+
--products PRODUCTS Number of products to generate.
|
|
75
|
+
--target-gb TARGET_GB
|
|
76
|
+
Target size (orders+items) in GB.
|
|
77
|
+
--workers WORKERS Number of worker processes.
|
|
78
|
+
--batch-orders BATCH_ORDERS
|
|
79
|
+
Orders per batch per worker.
|
|
80
|
+
--avg-items AVG_ITEMS
|
|
81
|
+
Average number of items per order.
|
|
82
|
+
--max-items MAX_ITEMS
|
|
83
|
+
Max items per order.
|
|
84
|
+
--order-note-len ORDER_NOTE_LEN
|
|
85
|
+
Order note length.
|
|
86
|
+
--item-note-len ITEM_NOTE_LEN
|
|
87
|
+
Item note length.
|
|
88
|
+
--start-date START_DATE
|
|
89
|
+
Start date YYYY-MM-DD (UTC).
|
|
90
|
+
--end-date END_DATE End date YYYY-MM-DD (UTC).
|
|
91
|
+
--unit-price-min UNIT_PRICE_MIN
|
|
92
|
+
Min unit price (cents).
|
|
93
|
+
--unit-price-max UNIT_PRICE_MAX
|
|
94
|
+
Max unit price (cents).
|
|
95
|
+
--tax-rate TAX_RATE Tax rate (e.g., 0.10).
|
|
96
|
+
--shipping-threshold SHIPPING_THRESHOLD
|
|
97
|
+
Free shipping threshold (cents).
|
|
98
|
+
--shipping-fee SHIPPING_FEE
|
|
99
|
+
Shipping fee under threshold (cents).
|
|
100
|
+
--progress-interval PROGRESS_INTERVAL
|
|
101
|
+
Seconds between progress prints.
|
|
102
|
+
--seed SEED Base RNG seed.
|
|
103
|
+
--order-id-stride ORDER_ID_STRIDE
|
|
104
|
+
Per-worker order_id stride (must exceed total orders
|
|
105
|
+
per worker).
|
|
106
|
+
--copy-orders-buf-mb COPY_ORDERS_BUF_MB
|
|
107
|
+
COPY buffer for orders (MB).
|
|
108
|
+
--copy-items-buf-mb COPY_ITEMS_BUF_MB
|
|
109
|
+
COPY buffer for items (MB).
|
|
110
|
+
--join-timeout-sec JOIN_TIMEOUT_SEC
|
|
111
|
+
If >0, timeout seconds for joining each worker. 0
|
|
112
|
+
means wait indefinitely.
|
|
113
|
+
)
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## Release Notes
|
|
117
|
+
|
|
118
|
+
### 0.1.0 Release
|
|
119
|
+
- Initial release
|
|
120
|
+
|
|
121
|
+
## License
|
|
122
|
+
|
|
123
|
+
MIT License
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "pgsalesgen"
|
|
3
|
+
authors = [
|
|
4
|
+
{ name = "Rio Fujita", email = "rifujita@microsoft.com" }
|
|
5
|
+
]
|
|
6
|
+
version = "0.1.0"
|
|
7
|
+
license = {file = "LICENSE"}
|
|
8
|
+
description = "Generate sales data for PostgreSQL"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
|
|
11
|
+
requires-python = ">=3.14"
|
|
12
|
+
dependencies = [
|
|
13
|
+
"numpy>=2.4.2",
|
|
14
|
+
"psycopg>=3.3.2",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[project.urls]
|
|
18
|
+
Homepage = "https://github.com/rioriost/homebrew-pgsalesgen"
|
|
19
|
+
Issues = "https://github.com/rioriost/homebrew-pgsalesgen/issues"
|
|
20
|
+
|
|
21
|
+
[project.scripts]
|
|
22
|
+
pg-salesgen = "pgsalesgen.main:main"
|
|
23
|
+
|
|
24
|
+
[build-system]
|
|
25
|
+
requires = ["hatchling"]
|
|
26
|
+
build-backend = "hatchling.build"
|
|
27
|
+
|
|
28
|
+
[tool.hatch.build.targets.wheel]
|
|
29
|
+
packages = ["src/pgsalesgen"]
|
|
30
|
+
|
|
31
|
+
[tool.hatch.build.targets.sdist]
|
|
32
|
+
include = [
|
|
33
|
+
"src/pgsalesgen/*.py",
|
|
34
|
+
"*.py",
|
|
35
|
+
]
|
|
36
|
+
exclude = [
|
|
37
|
+
"pgsalesgen.rb",
|
|
38
|
+
"uv.lock",
|
|
39
|
+
"dist/.DS_Store",
|
|
40
|
+
".envrc",
|
|
41
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1,918 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
pgsalesgen.py
|
|
4
|
+
|
|
5
|
+
Empty DB -> create schema/tables -> fill masters -> generate ~target GB of sales-like data FAST.
|
|
6
|
+
|
|
7
|
+
Speed upgrades in this version:
|
|
8
|
+
- COPY BINARY payload is built by struct.pack_into() into a big bytearray (min allocations)
|
|
9
|
+
- channel/status are kept as uint8 indices (no massive Python list[str] creation)
|
|
10
|
+
- cp.write() is called with large memoryview chunks (flush by bytes, not rows)
|
|
11
|
+
|
|
12
|
+
Also fixed:
|
|
13
|
+
- Previous version could hang after printing:
|
|
14
|
+
[done] target reached; stopping workers...
|
|
15
|
+
because workers might be blocked on q.get() and never see the stop condition,
|
|
16
|
+
while the coordinator only sent one None per worker after setting stop_evt.
|
|
17
|
+
This version:
|
|
18
|
+
- uses sentinel None to stop workers reliably
|
|
19
|
+
- does not rely on stop_evt for the normal stop path
|
|
20
|
+
- closes/join_thread() on the Queue to avoid lingering feeder threads
|
|
21
|
+
- joins workers without a short timeout (or uses a generous one)
|
|
22
|
+
|
|
23
|
+
psql-compatible flags:
|
|
24
|
+
- -h host, -p port, -U user, -d dbname
|
|
25
|
+
(argparse help is remapped to --help / -?)
|
|
26
|
+
|
|
27
|
+
Usage:
|
|
28
|
+
pgsalesgen -h localhost -p 5432 -U postgres -d emptydb --target-gb 10 --workers 8
|
|
29
|
+
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
import argparse
|
|
35
|
+
import os
|
|
36
|
+
import struct
|
|
37
|
+
import sys
|
|
38
|
+
import time
|
|
39
|
+
from dataclasses import dataclass
|
|
40
|
+
from datetime import datetime, timezone
|
|
41
|
+
from multiprocessing import Event, Process, Queue
|
|
42
|
+
|
|
43
|
+
import numpy as np
|
|
44
|
+
import psycopg
|
|
45
|
+
|
|
46
|
+
# -----------------------------
# PostgreSQL COPY BINARY helpers
# -----------------------------
# 11-byte header that must open every COPY ... (FORMAT BINARY) stream.
PGCOPY_SIGNATURE = b"PGCOPY\n\xff\r\n\0"
# PostgreSQL binary timestamps count microseconds since 2000-01-01 UTC;
# UNIX_EPOCH is kept alongside to compute the 1970->2000 shift.
EPOCH_2000 = datetime(2000, 1, 1, tzinfo=timezone.utc)
UNIX_EPOCH = datetime(1970, 1, 1, tzinfo=timezone.utc)

# Pre-compiled big-endian (network byte order) packers used by the COPY
# BINARY writers: int16 field counts, int32 lengths/int4s, int64 int8s.
S_I16 = struct.Struct("!h")
S_I32 = struct.Struct("!i")
S_I64 = struct.Struct("!q")
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# -----------------------------
|
|
59
|
+
# Connection (psql-compatible)
|
|
60
|
+
# -----------------------------
|
|
61
|
+
def build_libpq_dsn(args) -> str:
    """Assemble a libpq key=value DSN from parsed CLI arguments.

    An explicit ``--dsn`` takes precedence over the individual
    psql-style flags.  Returns an empty string when nothing was given,
    letting libpq fall back to its environment defaults.
    """
    if args.dsn:
        return args.dsn

    # (keyword, value) pairs in libpq's conventional order; falsy values
    # (None / empty) are simply omitted.
    candidates = (
        ("host", args.host),
        ("port", args.port),
        ("user", args.user),
        ("dbname", args.dbname),
        ("password", args.password),
        ("sslmode", args.sslmode),
        ("options", args.options),
    )
    return " ".join(f"{key}={value}" for key, value in candidates if value)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def psql_equivalent_cmd(args) -> str:
    """Render a psql command line equivalent to the given connection args.

    The password is masked as ``***`` so it is never echoed; sslmode and
    options are shown via the environment variables psql reads them from.
    """
    cmd = ["psql"]
    for flag, value in (
        ("-h", args.host),
        ("-p", args.port),
        ("-U", args.user),
        ("-d", args.dbname),
    ):
        if value:
            cmd.extend([flag, str(value)])

    env_parts = []
    if args.password:
        env_parts.append("PGPASSWORD='***' ")  # mask the real password
    if args.sslmode:
        env_parts.append(f"PGSSLMODE='{args.sslmode}' ")
    if args.options:
        env_parts.append(f"PGOPTIONS='{args.options}' ")
    return "".join(env_parts) + " ".join(cmd)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# -----------------------------
# Schema / DDL
# -----------------------------
# Template placeholders:
#   {create_table} -> "CREATE TABLE" or "CREATE UNLOGGED TABLE"
#   {fk}           -> optional foreign-key ALTER statements (or "")
DDL_TEMPLATE = """
CREATE SCHEMA IF NOT EXISTS sales;

-- Drop in dependency order
DROP TABLE IF EXISTS sales.order_items;
DROP TABLE IF EXISTS sales.orders;
DROP TABLE IF EXISTS sales.products;
DROP TABLE IF EXISTS sales.customers;

{create_table} sales.customers (
    customer_id bigserial PRIMARY KEY,
    created_at timestamptz NOT NULL DEFAULT now(),
    name text NOT NULL,
    email text NOT NULL,
    region text NOT NULL
);

{create_table} sales.products (
    product_id bigserial PRIMARY KEY,
    sku text NOT NULL,
    name text NOT NULL,
    category text NOT NULL,
    price_cents int NOT NULL
);

-- order_id is BIGINT and provided by generator (worker range), not serial.
{create_table} sales.orders (
    order_id bigint PRIMARY KEY,
    ordered_at timestamptz NOT NULL,
    customer_id bigint NOT NULL,
    channel text NOT NULL,
    status text NOT NULL,
    subtotal_cents int NOT NULL,
    tax_cents int NOT NULL,
    shipping_cents int NOT NULL,
    total_cents int NOT NULL,
    note text NOT NULL
);

{create_table} sales.order_items (
    order_item_id bigserial PRIMARY KEY,
    order_id bigint NOT NULL,
    product_id bigint NOT NULL,
    qty int NOT NULL,
    unit_price_cents int NOT NULL,
    line_total_cents int NOT NULL,
    note text NOT NULL
);

{fk}
"""
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def create_schema_and_tables(conn, logged: bool, with_fk: bool) -> None:
    """Create the sales schema and (re)create all tables.

    Tables default to UNLOGGED for bulk-load speed; pass ``logged=True``
    for crash-safe tables.  Foreign keys are only added when ``with_fk``
    is True because they slow the load down.
    """
    table_keyword = "CREATE TABLE" if logged else "CREATE UNLOGGED TABLE"
    constraints = (
        """
ALTER TABLE sales.orders
  ADD CONSTRAINT orders_customer_fk
  FOREIGN KEY (customer_id) REFERENCES sales.customers(customer_id);

ALTER TABLE sales.order_items
  ADD CONSTRAINT items_order_fk
  FOREIGN KEY (order_id) REFERENCES sales.orders(order_id);

ALTER TABLE sales.order_items
  ADD CONSTRAINT items_product_fk
  FOREIGN KEY (product_id) REFERENCES sales.products(product_id);
"""
        if with_fk
        else ""
    )
    conn.execute(DDL_TEMPLATE.format(create_table=table_keyword, fk=constraints))
    conn.commit()
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def create_indexes(conn) -> None:
    """Create the typical query indexes, then ANALYZE every table."""
    # Index creation first, then statistics refresh, single commit at the end.
    statements = (
        "CREATE INDEX IF NOT EXISTS orders_ordered_at_brin ON sales.orders USING brin (ordered_at);",
        "CREATE INDEX IF NOT EXISTS orders_customer_id_idx ON sales.orders (customer_id);",
        "CREATE INDEX IF NOT EXISTS items_order_id_idx ON sales.order_items (order_id);",
        "CREATE INDEX IF NOT EXISTS items_product_id_idx ON sales.order_items (product_id);",
        "ANALYZE sales.customers;",
        "ANALYZE sales.products;",
        "ANALYZE sales.orders;",
        "ANALYZE sales.order_items;",
    )
    for sql in statements:
        conn.execute(sql)
    conn.commit()
|
201
|
+
|
|
202
|
+
|
|
203
|
+
# -----------------------------
|
|
204
|
+
# Generation utilities (NumPy)
|
|
205
|
+
# -----------------------------
|
|
206
|
+
def parse_ymd(s: str) -> datetime:
    """Parse a ``YYYY-MM-DD`` string into a UTC-midnight datetime."""
    year, month, day = (int(part) for part in s.split("-"))
    return datetime(year, month, day, tzinfo=timezone.utc)
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def us_since_2000_from_unix_seconds(unix_seconds: np.ndarray) -> np.ndarray:
    """Convert Unix-epoch seconds to PostgreSQL timestamp microseconds.

    PostgreSQL binary timestamps count microseconds since 2000-01-01 UTC,
    so shift the epoch first, then scale seconds to microseconds.
    """
    epoch_shift = int((EPOCH_2000 - UNIX_EPOCH).total_seconds())
    shifted = (unix_seconds - epoch_shift).astype(np.int64)
    return shifted * 1_000_000
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def fixed_note(length: int, tag: str) -> str:
    """Build a deterministic filler string of exactly *length* characters.

    The tag is tiled until it covers the requested length and then
    truncated; non-positive lengths (and an empty tag) yield "".
    """
    if length <= 0:
        return ""
    # max(1, ...) guards against division by zero for an empty tag.
    repeats = length // max(1, len(tag)) + 1
    return (tag * repeats)[:length]
|
220
|
+
|
|
221
|
+
|
|
222
|
+
@dataclass(frozen=True)
class BatchConfig:
    """Immutable per-batch generation parameters shared by all workers."""

    avg_items: float  # Poisson mean for items per order (floored at 0.1)
    max_items: int  # hard cap on items per order
    order_note_len: int  # filler-note length for orders
    item_note_len: int  # filler-note length for order items
    start_unix: int  # start of the order-timestamp window (Unix seconds)
    span_seconds: int  # width of the timestamp window in seconds
    cust_max: int  # customer ids are drawn uniformly from 1..cust_max
    prod_max: int  # product ids are drawn uniformly from 1..prod_max
    unit_price_min: int  # minimum unit price (cents)
    unit_price_max: int  # maximum unit price (cents, inclusive)
    shipping_threshold: int  # subtotal (cents) at/above which shipping is free
    shipping_fee: int  # flat shipping fee (cents) below the threshold
    tax_rate: float  # e.g. 0.10 for 10% tax
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def generate_batch_numpy(
    rng: np.random.Generator,
    start_order_id: int,
    batch_orders: int,
    cfg: BatchConfig,
):
    """
    Vectorized generation of one batch of orders plus their line items.

    Returns:
        orders: dict of numpy arrays (no Python list[str])
        items: dict of numpy arrays
        next_order_id: int
    """
    n = batch_orders
    # Sequential ids within this worker's range; cross-worker uniqueness is
    # the caller's responsibility (order-id stride).
    order_ids = np.arange(start_order_id, start_order_id + n, dtype=np.int64)

    # Uniform timestamps inside the configured window, converted to
    # PostgreSQL's microseconds-since-2000 representation for COPY BINARY.
    offsets = rng.integers(0, cfg.span_seconds, size=n, dtype=np.int64)
    ordered_unix = (cfg.start_unix + offsets).astype(np.int64)
    ordered_us2000 = us_since_2000_from_unix_seconds(ordered_unix)

    customer_ids = rng.integers(1, cfg.cust_max + 1, size=n, dtype=np.int64)

    # store indices only (uint8); the COPY writer maps them to text values
    channel_idx = rng.integers(0, 3, size=n, dtype=np.uint8)  # 0..2
    status_idx = rng.integers(0, 3, size=n, dtype=np.uint8)  # 0..2

    # Items per order: Poisson around avg_items, clamped to [1, max_items].
    lam = max(0.1, float(cfg.avg_items))
    k = rng.poisson(lam=lam, size=n).astype(np.int64)
    k = np.clip(k, 1, cfg.max_items)
    total_items = int(k.sum())

    # Expand order ids so row i of the item arrays belongs to its order.
    item_order_ids = np.repeat(order_ids, k).astype(np.int64)
    product_ids = rng.integers(1, cfg.prod_max + 1, size=total_items, dtype=np.int64)
    qty = rng.integers(1, 6, size=total_items, dtype=np.int32)
    unit = rng.integers(
        cfg.unit_price_min, cfg.unit_price_max + 1, size=total_items, dtype=np.int32
    )
    line_total = (unit.astype(np.int64) * qty.astype(np.int64)).astype(np.int64)

    # Per-order subtotals: reduceat sums each order's contiguous slice of
    # line_total; boundaries holds each order's starting offset.
    boundaries = np.concatenate(([0], np.cumsum(k)[:-1]))
    subtotal = np.add.reduceat(line_total, boundaries).astype(np.int64)

    # Tax is truncated toward zero by the int64 cast (no rounding).
    tax = (subtotal.astype(np.float64) * cfg.tax_rate).astype(np.int64)
    shipping = np.where(subtotal >= cfg.shipping_threshold, 0, cfg.shipping_fee).astype(
        np.int64
    )
    total = (subtotal + tax + shipping).astype(np.int64)

    orders = {
        "order_id": order_ids,
        "ordered_us2000": ordered_us2000,
        "customer_id": customer_ids,
        "channel_idx": channel_idx,
        "status_idx": status_idx,
        "subtotal": subtotal,
        "tax": tax,
        "shipping": shipping,
        "total": total,
    }
    items = {
        "order_id": item_order_ids,
        "product_id": product_ids,
        "qty": qty,
        "unit": unit,
        "line_total": line_total,
    }

    return orders, items, int(start_order_id + n)
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
# -----------------------------
|
|
309
|
+
# COPY BINARY writers (FAST)
|
|
310
|
+
# -----------------------------
|
|
311
|
+
def copy_orders_binary_fast(
    cur,
    orders,
    note_bytes: bytes,
    buffer_mb: int = 16,
):
    """
    Fast COPY BINARY writer for sales.orders.
    - writes into a large bytearray via pack_into
    - flushes by byte size
    """
    # Fixed vocabularies; orders carry uint8 indices into these tuples.
    channel_vals = (b"web", b"store", b"marketplace")
    status_vals = (b"paid", b"shipped", b"canceled")

    # Pre-built COPY text fields (int32 length prefix + payload) so the hot
    # loop only slice-assigns bytes instead of re-packing per row.
    channel_field = [S_I32.pack(len(v)) + v for v in channel_vals]
    status_field = [S_I32.pack(len(v)) + v for v in status_vals]
    note_field = S_I32.pack(len(note_bytes)) + note_bytes

    ncols = 10
    max_text = (
        max(len(x) for x in channel_vals)
        + max(len(x) for x in status_vals)
        + len(note_bytes)
    )
    # Worst-case encoded row: field-count word + per-field length words +
    # three int8 columns + four int4 columns + all text payloads.
    max_row = 2 + (ncols * 4) + (8 + 8 + 8) + (4 * 4) + max_text  # conservative

    buffer_bytes = max(1, buffer_mb) * 1024 * 1024

    with cur.copy(
        "COPY sales.orders("
        "order_id, ordered_at, customer_id, channel, status, "
        "subtotal_cents, tax_cents, shipping_cents, total_cents, note"
        ") FROM STDIN WITH (FORMAT BINARY)"
    ) as cp:
        # COPY BINARY stream header: signature + flags + header-extension len.
        cp.write(PGCOPY_SIGNATURE + S_I32.pack(0) + S_I32.pack(0))

        oid = orders["order_id"]
        ous = orders["ordered_us2000"]
        cid = orders["customer_id"]
        chx = orders["channel_idx"]
        stx = orders["status_idx"]
        sub = orders["subtotal"]
        tax = orders["tax"]
        shp = orders["shipping"]
        tot = orders["total"]

        buf = bytearray(buffer_bytes)
        pos = 0

        for i in range(len(oid)):
            # Flush before a row could overflow the buffer.
            if pos + max_row >= buffer_bytes:
                cp.write(memoryview(buf)[:pos])
                pos = 0

            # Per-row field count.
            S_I16.pack_into(buf, pos, ncols)
            pos += 2

            # order_id int8
            S_I32.pack_into(buf, pos, 8)
            pos += 4
            S_I64.pack_into(buf, pos, int(oid[i]))
            pos += 8

            # ordered_at timestamptz int8
            S_I32.pack_into(buf, pos, 8)
            pos += 4
            S_I64.pack_into(buf, pos, int(ous[i]))
            pos += 8

            # customer_id int8
            S_I32.pack_into(buf, pos, 8)
            pos += 4
            S_I64.pack_into(buf, pos, int(cid[i]))
            pos += 8

            # channel text (prebuilt)
            cf = channel_field[int(chx[i])]
            buf[pos : pos + len(cf)] = cf
            pos += len(cf)

            # status text (prebuilt)
            sf = status_field[int(stx[i])]
            buf[pos : pos + len(sf)] = sf
            pos += len(sf)

            # subtotal int4
            S_I32.pack_into(buf, pos, 4)
            pos += 4
            S_I32.pack_into(buf, pos, int(sub[i]))
            pos += 4

            # tax int4
            S_I32.pack_into(buf, pos, 4)
            pos += 4
            S_I32.pack_into(buf, pos, int(tax[i]))
            pos += 4

            # shipping int4
            S_I32.pack_into(buf, pos, 4)
            pos += 4
            S_I32.pack_into(buf, pos, int(shp[i]))
            pos += 4

            # total int4
            S_I32.pack_into(buf, pos, 4)
            pos += 4
            S_I32.pack_into(buf, pos, int(tot[i]))
            pos += 4

            # note text (prebuilt)
            buf[pos : pos + len(note_field)] = note_field
            pos += len(note_field)

        # Flush the final partial buffer.
        if pos:
            cp.write(memoryview(buf)[:pos])

        # COPY BINARY trailer: -1 field count marks end-of-data.
        cp.write(S_I16.pack(-1))
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def copy_items_binary_fast(
    cur,
    items,
    note_bytes: bytes,
    buffer_mb: int = 32,
):
    """Stream sales.order_items rows over COPY ... WITH (FORMAT BINARY).

    Rows are packed into one reusable bytearray and flushed to the COPY
    stream only when the next row might not fit, keeping the number of
    write() round-trips small.
    """
    # The note column is identical for every row, so its wire form
    # (int32 length prefix + payload) is built exactly once.
    note_field = S_I32.pack(len(note_bytes)) + note_bytes

    col_count = 6
    # Worst-case encoded row: field-count word, one int32 length prefix
    # per column, the fixed-width payloads (2x int8 + 3x int4), and note.
    row_cap = 2 + (col_count * 4) + (8 + 8 + 4 + 4 + 4) + len(note_bytes)
    cap = max(1, buffer_mb) * 1024 * 1024

    with cur.copy(
        "COPY sales.order_items("
        "order_id, product_id, qty, unit_price_cents, line_total_cents, note"
        ") FROM STDIN WITH (FORMAT BINARY)"
    ) as cp:
        # COPY BINARY file header: signature, flags, header-extension length.
        cp.write(PGCOPY_SIGNATURE + S_I32.pack(0) + S_I32.pack(0))

        order_ids = items["order_id"]
        product_ids = items["product_id"]
        quantities = items["qty"]
        unit_prices = items["unit"]
        line_totals = items["line_total"]

        out = bytearray(cap)
        end = 0

        # Hoist hot-loop attribute lookups into locals.
        put16 = S_I16.pack_into
        put32 = S_I32.pack_into
        put64 = S_I64.pack_into
        note_len = len(note_field)

        for i in range(len(order_ids)):
            if end + row_cap >= cap:
                cp.write(memoryview(out)[:end])
                end = 0

            put16(out, end, col_count)
            end += 2

            # order_id int8
            put32(out, end, 8)
            put64(out, end + 4, int(order_ids[i]))
            end += 12

            # product_id int8
            put32(out, end, 8)
            put64(out, end + 4, int(product_ids[i]))
            end += 12

            # qty int4
            put32(out, end, 4)
            put32(out, end + 4, int(quantities[i]))
            end += 8

            # unit_price_cents int4
            put32(out, end, 4)
            put32(out, end + 4, int(unit_prices[i]))
            end += 8

            # line_total_cents int4
            put32(out, end, 4)
            put32(out, end + 4, int(line_totals[i]))
            end += 8

            # note text (prebuilt length-prefixed field)
            out[end : end + note_len] = note_field
            end += note_len

        if end:
            cp.write(memoryview(out)[:end])

        # COPY BINARY trailer: a field count of -1.
        cp.write(S_I16.pack(-1))
|
|
507
|
+
|
|
508
|
+
|
|
509
|
+
# -----------------------------
|
|
510
|
+
# Master fillers
|
|
511
|
+
# -----------------------------
|
|
512
|
+
def fill_masters(conn, customers: int, products: int) -> None:
    """Populate the master tables sales.customers and sales.products.

    Rows are synthesized entirely server-side with generate_series, so no
    bulk data crosses the wire; both inserts are committed together.

    Args:
        conn: open psycopg connection.
        customers: number of customer rows to insert.
        products: number of product rows to insert.
    """
    # NOTE(review): PostgreSQL's ::int presumably rounds to nearest, so the
    # first and last array elements would be picked about half as often as
    # interior ones — confirm if a uniform region/category mix matters.
    conn.execute(
        """
        INSERT INTO sales.customers(name, email, region)
        SELECT
            'Customer-' || gs::text,
            'user' || gs::text || '@example.com',
            (ARRAY['JP','US','EU','APAC','LATAM'])[1 + (random()*4)::int]
        FROM generate_series(1, %s) gs;
        """,
        (customers,),
    )

    conn.execute(
        """
        INSERT INTO sales.products(sku, name, category, price_cents)
        SELECT
            'SKU-' || gs::text,
            'Product-' || gs::text,
            (ARRAY['food','apparel','home','electronics','book','beauty'])[1 + (random()*5)::int],
            (500 + (random()*20000)::int)
        FROM generate_series(1, %s) gs;
        """,
        (products,),
    )
    # Single commit covering both master inserts.
    conn.commit()
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
# -----------------------------
|
|
541
|
+
# Monitoring
|
|
542
|
+
# -----------------------------
|
|
543
|
+
def current_total_gb(conn) -> float:
    """Return the combined on-disk size of orders + order_items, in GB."""
    row = conn.execute(
        "SELECT pg_total_relation_size('sales.orders'::regclass) + "
        "pg_total_relation_size('sales.order_items'::regclass)"
    ).fetchone()
    total_bytes = row[0]
    return float(total_bytes) / (1024.0**3)
|
|
549
|
+
|
|
550
|
+
|
|
551
|
+
# -----------------------------
|
|
552
|
+
# Worker
|
|
553
|
+
# -----------------------------
|
|
554
|
+
def worker_proc(
    worker_id: int,
    dsn: str,
    q: Queue,
    stop_evt: Event,
    cfg: BatchConfig,
    base_seed: int,
    order_id_stride: int,
    batch_orders: int,
    copy_orders_buf_mb: int,
    copy_items_buf_mb: int,
):
    """Worker process body: pull batch tokens from the queue and bulk-load.

    Each non-None queue message requests one generated batch of orders and
    items, written via COPY BINARY and committed as one transaction. A None
    message is the shutdown sentinel. order_id ranges stay disjoint across
    workers because each starts at 1 + worker_id * order_id_stride.
    """
    # Per-worker seed so every process draws an independent random stream.
    rng = np.random.default_rng(base_seed + worker_id)
    conn = psycopg.connect(dsn)

    # note bytes: pre-encoded once per worker (worker id baked into prefix)
    ord_note_bytes = fixed_note(cfg.order_note_len, f"w{worker_id:02d}-ORDER-").encode(
        "utf-8"
    )
    itm_note_bytes = fixed_note(cfg.item_note_len, f"w{worker_id:02d}-ITEM-").encode(
        "utf-8"
    )

    try:
        with conn.cursor() as cur:
            # Bulk-load session tuning: async commit trades crash
            # durability for throughput on this connection only.
            cur.execute("SET synchronous_commit=off")
            cur.execute("SET client_min_messages=warning")
            cur.execute("SET work_mem='256MB'")

        next_order_id = 1 + worker_id * order_id_stride

        while True:
            msg = q.get()  # blocks
            if msg is None:
                break
            # Also honor the stop event in case a real token arrives
            # after shutdown has begun.
            if stop_evt.is_set():
                break

            orders, items, next_order_id = generate_batch_numpy(
                rng=rng,
                start_order_id=next_order_id,
                batch_orders=batch_orders,
                cfg=cfg,
            )

            with conn.cursor() as cur:
                copy_orders_binary_fast(
                    cur, orders, note_bytes=ord_note_bytes, buffer_mb=copy_orders_buf_mb
                )
                copy_items_binary_fast(
                    cur, items, note_bytes=itm_note_bytes, buffer_mb=copy_items_buf_mb
                )

            # One transaction per batch covers both tables together.
            conn.commit()

    except Exception as e:
        print(f"[worker {worker_id}] ERROR: {e}", file=sys.stderr)
        try:
            # Best-effort rollback; the connection may already be broken.
            conn.rollback()
        except Exception:
            pass
        # Re-raise so the process exits nonzero and the parent can notice.
        raise
    finally:
        conn.close()
|
|
618
|
+
|
|
619
|
+
|
|
620
|
+
# -----------------------------
|
|
621
|
+
# Main
|
|
622
|
+
# -----------------------------
|
|
623
|
+
def main() -> int:
|
|
624
|
+
# argparse default -h conflicts with psql's -h(host).
|
|
625
|
+
ap = argparse.ArgumentParser(
|
|
626
|
+
description="Empty DB -> create schema/tables -> fill masters -> generate sales-like data FAST.",
|
|
627
|
+
add_help=False,
|
|
628
|
+
)
|
|
629
|
+
ap.add_argument(
|
|
630
|
+
"--help", "-?", action="help", help="show this help message and exit"
|
|
631
|
+
)
|
|
632
|
+
|
|
633
|
+
# Connection (psql-compatible)
|
|
634
|
+
ap.add_argument(
|
|
635
|
+
"--dsn",
|
|
636
|
+
default=os.environ.get("PG_DSN"),
|
|
637
|
+
help="libpq DSN. Overrides -h/-p/-U/-d.",
|
|
638
|
+
)
|
|
639
|
+
ap.add_argument(
|
|
640
|
+
"-h",
|
|
641
|
+
"--host",
|
|
642
|
+
default=None,
|
|
643
|
+
help="database server host or socket directory (psql compatible).",
|
|
644
|
+
)
|
|
645
|
+
ap.add_argument(
|
|
646
|
+
"-p",
|
|
647
|
+
"--port",
|
|
648
|
+
type=int,
|
|
649
|
+
default=None,
|
|
650
|
+
help="database server port (psql compatible).",
|
|
651
|
+
)
|
|
652
|
+
ap.add_argument(
|
|
653
|
+
"-U", "--user", default=None, help="database user name (psql compatible)."
|
|
654
|
+
)
|
|
655
|
+
ap.add_argument(
|
|
656
|
+
"-d", "--dbname", default=None, help="database name (psql compatible)."
|
|
657
|
+
)
|
|
658
|
+
ap.add_argument(
|
|
659
|
+
"--password",
|
|
660
|
+
default=None,
|
|
661
|
+
help="database password (or use PGPASSWORD env / .pgpass).",
|
|
662
|
+
)
|
|
663
|
+
ap.add_argument(
|
|
664
|
+
"--sslmode", default=None, help="sslmode (require, verify-full, etc.)."
|
|
665
|
+
)
|
|
666
|
+
ap.add_argument(
|
|
667
|
+
"--options",
|
|
668
|
+
default=None,
|
|
669
|
+
help='libpq options string (e.g., "-c statement_timeout=0").',
|
|
670
|
+
)
|
|
671
|
+
ap.add_argument(
|
|
672
|
+
"--print-psql",
|
|
673
|
+
action="store_true",
|
|
674
|
+
help="Print equivalent psql command and exit.",
|
|
675
|
+
)
|
|
676
|
+
|
|
677
|
+
# DDL options
|
|
678
|
+
ap.add_argument(
|
|
679
|
+
"--logged",
|
|
680
|
+
action="store_true",
|
|
681
|
+
help="Create LOGGED tables (default UNLOGGED for speed).",
|
|
682
|
+
)
|
|
683
|
+
ap.add_argument(
|
|
684
|
+
"--with-fk", action="store_true", help="Create foreign keys (slower)."
|
|
685
|
+
)
|
|
686
|
+
ap.add_argument(
|
|
687
|
+
"--create-indexes",
|
|
688
|
+
action="store_true",
|
|
689
|
+
help="Create typical indexes + ANALYZE after load.",
|
|
690
|
+
)
|
|
691
|
+
|
|
692
|
+
# Masters
|
|
693
|
+
ap.add_argument(
|
|
694
|
+
"--customers",
|
|
695
|
+
type=int,
|
|
696
|
+
default=2_000_000,
|
|
697
|
+
help="Number of customers to generate.",
|
|
698
|
+
)
|
|
699
|
+
ap.add_argument(
|
|
700
|
+
"--products", type=int, default=200_000, help="Number of products to generate."
|
|
701
|
+
)
|
|
702
|
+
|
|
703
|
+
# Generation
|
|
704
|
+
ap.add_argument(
|
|
705
|
+
"--target-gb",
|
|
706
|
+
type=float,
|
|
707
|
+
default=100.0,
|
|
708
|
+
help="Target size (orders+items) in GB.",
|
|
709
|
+
)
|
|
710
|
+
ap.add_argument(
|
|
711
|
+
"--workers",
|
|
712
|
+
type=int,
|
|
713
|
+
default=max(1, os.cpu_count() or 1),
|
|
714
|
+
help="Number of worker processes.",
|
|
715
|
+
)
|
|
716
|
+
ap.add_argument(
|
|
717
|
+
"--batch-orders", type=int, default=200_000, help="Orders per batch per worker."
|
|
718
|
+
)
|
|
719
|
+
ap.add_argument(
|
|
720
|
+
"--avg-items",
|
|
721
|
+
type=float,
|
|
722
|
+
default=3.2,
|
|
723
|
+
help="Average number of items per order.",
|
|
724
|
+
)
|
|
725
|
+
ap.add_argument("--max-items", type=int, default=12, help="Max items per order.")
|
|
726
|
+
ap.add_argument("--order-note-len", type=int, default=80, help="Order note length.")
|
|
727
|
+
ap.add_argument("--item-note-len", type=int, default=120, help="Item note length.")
|
|
728
|
+
ap.add_argument(
|
|
729
|
+
"--start-date", default="2022-01-01", help="Start date YYYY-MM-DD (UTC)."
|
|
730
|
+
)
|
|
731
|
+
ap.add_argument(
|
|
732
|
+
"--end-date", default="2026-01-01", help="End date YYYY-MM-DD (UTC)."
|
|
733
|
+
)
|
|
734
|
+
ap.add_argument(
|
|
735
|
+
"--unit-price-min", type=int, default=500, help="Min unit price (cents)."
|
|
736
|
+
)
|
|
737
|
+
ap.add_argument(
|
|
738
|
+
"--unit-price-max", type=int, default=20_500, help="Max unit price (cents)."
|
|
739
|
+
)
|
|
740
|
+
ap.add_argument(
|
|
741
|
+
"--tax-rate", type=float, default=0.10, help="Tax rate (e.g., 0.10)."
|
|
742
|
+
)
|
|
743
|
+
ap.add_argument(
|
|
744
|
+
"--shipping-threshold",
|
|
745
|
+
type=int,
|
|
746
|
+
default=5000,
|
|
747
|
+
help="Free shipping threshold (cents).",
|
|
748
|
+
)
|
|
749
|
+
ap.add_argument(
|
|
750
|
+
"--shipping-fee",
|
|
751
|
+
type=int,
|
|
752
|
+
default=500,
|
|
753
|
+
help="Shipping fee under threshold (cents).",
|
|
754
|
+
)
|
|
755
|
+
ap.add_argument(
|
|
756
|
+
"--progress-interval",
|
|
757
|
+
type=float,
|
|
758
|
+
default=2.0,
|
|
759
|
+
help="Seconds between progress prints.",
|
|
760
|
+
)
|
|
761
|
+
ap.add_argument("--seed", type=int, default=12345, help="Base RNG seed.")
|
|
762
|
+
ap.add_argument(
|
|
763
|
+
"--order-id-stride",
|
|
764
|
+
type=int,
|
|
765
|
+
default=10_000_000_000,
|
|
766
|
+
help="Per-worker order_id stride (must exceed total orders per worker).",
|
|
767
|
+
)
|
|
768
|
+
|
|
769
|
+
# COPY buffer sizes (MB)
|
|
770
|
+
ap.add_argument(
|
|
771
|
+
"--copy-orders-buf-mb",
|
|
772
|
+
type=int,
|
|
773
|
+
default=16,
|
|
774
|
+
help="COPY buffer for orders (MB).",
|
|
775
|
+
)
|
|
776
|
+
ap.add_argument(
|
|
777
|
+
"--copy-items-buf-mb", type=int, default=32, help="COPY buffer for items (MB)."
|
|
778
|
+
)
|
|
779
|
+
|
|
780
|
+
# shutdown behavior
|
|
781
|
+
ap.add_argument(
|
|
782
|
+
"--join-timeout-sec",
|
|
783
|
+
type=float,
|
|
784
|
+
default=0.0,
|
|
785
|
+
help="If >0, timeout seconds for joining each worker. 0 means wait indefinitely.",
|
|
786
|
+
)
|
|
787
|
+
|
|
788
|
+
args = ap.parse_args()
|
|
789
|
+
dsn = build_libpq_dsn(args)
|
|
790
|
+
|
|
791
|
+
if args.print_psql:
|
|
792
|
+
print(psql_equivalent_cmd(args))
|
|
793
|
+
return 0
|
|
794
|
+
|
|
795
|
+
start_dt = parse_ymd(args.start_date)
|
|
796
|
+
end_dt = parse_ymd(args.end_date)
|
|
797
|
+
span = int((end_dt - start_dt).total_seconds())
|
|
798
|
+
if span <= 0:
|
|
799
|
+
print("end-date must be after start-date", file=sys.stderr)
|
|
800
|
+
return 2
|
|
801
|
+
|
|
802
|
+
# Coordinator connection
|
|
803
|
+
coord = psycopg.connect(dsn)
|
|
804
|
+
coord.execute("SET client_min_messages=warning")
|
|
805
|
+
coord.execute("SET synchronous_commit=off")
|
|
806
|
+
|
|
807
|
+
print("[setup] creating schema/tables...")
|
|
808
|
+
create_schema_and_tables(coord, logged=args.logged, with_fk=args.with_fk)
|
|
809
|
+
|
|
810
|
+
print(
|
|
811
|
+
f"[setup] inserting masters: customers={args.customers:,} products={args.products:,} ..."
|
|
812
|
+
)
|
|
813
|
+
fill_masters(coord, customers=args.customers, products=args.products)
|
|
814
|
+
|
|
815
|
+
cfg = BatchConfig(
|
|
816
|
+
avg_items=args.avg_items,
|
|
817
|
+
max_items=args.max_items,
|
|
818
|
+
order_note_len=args.order_note_len,
|
|
819
|
+
item_note_len=args.item_note_len,
|
|
820
|
+
start_unix=int(start_dt.timestamp()),
|
|
821
|
+
span_seconds=span,
|
|
822
|
+
cust_max=args.customers,
|
|
823
|
+
prod_max=args.products,
|
|
824
|
+
unit_price_min=args.unit_price_min,
|
|
825
|
+
unit_price_max=args.unit_price_max,
|
|
826
|
+
shipping_threshold=args.shipping_threshold,
|
|
827
|
+
shipping_fee=args.shipping_fee,
|
|
828
|
+
tax_rate=args.tax_rate,
|
|
829
|
+
)
|
|
830
|
+
|
|
831
|
+
# Work queue
|
|
832
|
+
q: Queue = Queue(maxsize=args.workers * 4)
|
|
833
|
+
stop_evt = Event()
|
|
834
|
+
|
|
835
|
+
# Start workers (NOT daemon: allow clean join)
|
|
836
|
+
procs: list[Process] = []
|
|
837
|
+
for wid in range(args.workers):
|
|
838
|
+
p = Process(
|
|
839
|
+
target=worker_proc,
|
|
840
|
+
args=(
|
|
841
|
+
wid,
|
|
842
|
+
dsn,
|
|
843
|
+
q,
|
|
844
|
+
stop_evt,
|
|
845
|
+
cfg,
|
|
846
|
+
args.seed,
|
|
847
|
+
args.order_id_stride,
|
|
848
|
+
args.batch_orders,
|
|
849
|
+
args.copy_orders_buf_mb,
|
|
850
|
+
args.copy_items_buf_mb,
|
|
851
|
+
),
|
|
852
|
+
daemon=False,
|
|
853
|
+
)
|
|
854
|
+
p.start()
|
|
855
|
+
procs.append(p)
|
|
856
|
+
|
|
857
|
+
try:
|
|
858
|
+
last_print = 0.0
|
|
859
|
+
|
|
860
|
+
while True:
|
|
861
|
+
gb = current_total_gb(coord)
|
|
862
|
+
now = time.time()
|
|
863
|
+
if now - last_print >= args.progress_interval:
|
|
864
|
+
print(f"[progress] {gb:.2f} GB / {args.target_gb:.2f} GB")
|
|
865
|
+
last_print = now
|
|
866
|
+
|
|
867
|
+
if gb >= args.target_gb:
|
|
868
|
+
break
|
|
869
|
+
|
|
870
|
+
# enqueue one batch per worker
|
|
871
|
+
for _ in range(args.workers):
|
|
872
|
+
q.put(1)
|
|
873
|
+
|
|
874
|
+
print("[done] target reached; stopping workers...")
|
|
875
|
+
|
|
876
|
+
finally:
|
|
877
|
+
# Normal stop path: send sentinels so workers break out of q.get()
|
|
878
|
+
stop_evt.set()
|
|
879
|
+
|
|
880
|
+
# Make sure we enqueue enough sentinels even if queue is partly full.
|
|
881
|
+
# Block until they're all sent.
|
|
882
|
+
for _ in procs:
|
|
883
|
+
q.put(None)
|
|
884
|
+
|
|
885
|
+
# Clean up queue feeder threads
|
|
886
|
+
try:
|
|
887
|
+
q.close()
|
|
888
|
+
q.join_thread()
|
|
889
|
+
except Exception:
|
|
890
|
+
pass
|
|
891
|
+
|
|
892
|
+
# Join workers reliably
|
|
893
|
+
for p in procs:
|
|
894
|
+
if args.join_timeout_sec and args.join_timeout_sec > 0:
|
|
895
|
+
p.join(timeout=args.join_timeout_sec)
|
|
896
|
+
else:
|
|
897
|
+
p.join()
|
|
898
|
+
|
|
899
|
+
if p.exitcode not in (0, None):
|
|
900
|
+
print(f"[warn] worker exited with code {p.exitcode}", file=sys.stderr)
|
|
901
|
+
|
|
902
|
+
coord.close()
|
|
903
|
+
|
|
904
|
+
# Optional indexes after load
|
|
905
|
+
if args.create_indexes:
|
|
906
|
+
# need a new connection because coord is closed above
|
|
907
|
+
coord2 = psycopg.connect(dsn)
|
|
908
|
+
try:
|
|
909
|
+
print("[post] creating indexes + analyze...")
|
|
910
|
+
create_indexes(coord2)
|
|
911
|
+
finally:
|
|
912
|
+
coord2.close()
|
|
913
|
+
|
|
914
|
+
return 0
|
|
915
|
+
|
|
916
|
+
|
|
917
|
+
if __name__ == "__main__":
    # SystemExit carries main()'s integer result as the process exit code.
    raise SystemExit(main())
|