dbworkload 0.6.2__tar.gz → 0.6.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dbworkload-0.6.2 → dbworkload-0.6.4}/PKG-INFO +1 -1
- {dbworkload-0.6.2 → dbworkload-0.6.4}/dbworkload/cli/main.py +1 -0
- {dbworkload-0.6.2 → dbworkload-0.6.4}/dbworkload/cli/util.py +10 -4
- {dbworkload-0.6.2 → dbworkload-0.6.4}/dbworkload/models/util.py +61 -33
- {dbworkload-0.6.2 → dbworkload-0.6.4}/dbworkload/utils/common.py +1 -1
- {dbworkload-0.6.2 → dbworkload-0.6.4}/dbworkload/utils/simplefaker.py +50 -45
- {dbworkload-0.6.2 → dbworkload-0.6.4}/pyproject.toml +1 -1
- {dbworkload-0.6.2 → dbworkload-0.6.4}/LICENSE +0 -0
- {dbworkload-0.6.2 → dbworkload-0.6.4}/README.md +0 -0
- {dbworkload-0.6.2 → dbworkload-0.6.4}/dbworkload/__init__.py +0 -0
- {dbworkload-0.6.2 → dbworkload-0.6.4}/dbworkload/cli/dep.py +0 -0
- {dbworkload-0.6.2 → dbworkload-0.6.4}/dbworkload/models/run.py +0 -0
- {dbworkload-0.6.2 → dbworkload-0.6.4}/dbworkload/templates/stub.j2 +0 -0
|
@@ -134,12 +134,12 @@ def util_yaml(
|
|
|
134
134
|
|
|
135
135
|
|
|
136
136
|
@app.command(
|
|
137
|
-
"
|
|
137
|
+
"merge_sort",
|
|
138
138
|
epilog=EPILOG,
|
|
139
139
|
no_args_is_help=True,
|
|
140
|
-
help="Merge multiple sorted CSV files into 1+ files.",
|
|
140
|
+
help="Merge-Sort multiple sorted CSV files into 1+ files.",
|
|
141
141
|
)
|
|
142
|
-
def
|
|
142
|
+
def util_sort_merge(
|
|
143
143
|
input: Optional[Path] = typer.Option(
|
|
144
144
|
...,
|
|
145
145
|
"--input",
|
|
@@ -166,8 +166,14 @@ def util_merge(
|
|
|
166
166
|
resolve_path=True,
|
|
167
167
|
),
|
|
168
168
|
csv_max_rows: int = Param.CSVMaxRows,
|
|
169
|
+
compress: bool = typer.Option(
|
|
170
|
+
True,
|
|
171
|
+
"--no-compress",
|
|
172
|
+
show_default=False,
|
|
173
|
+
help="Do not gzip output files.",
|
|
174
|
+
),
|
|
169
175
|
):
|
|
170
|
-
dbworkload.models.util.
|
|
176
|
+
dbworkload.models.util.util_merge_sort(input, output, csv_max_rows, compress)
|
|
171
177
|
|
|
172
178
|
|
|
173
179
|
@app.command(
|
|
@@ -5,23 +5,25 @@ from jinja2 import Environment, PackageLoader
|
|
|
5
5
|
from pathlib import PosixPath
|
|
6
6
|
from plotly.subplots import make_subplots
|
|
7
7
|
from pytdigest import TDigest
|
|
8
|
-
import dbworkload
|
|
9
8
|
import datetime as dt
|
|
9
|
+
import dbworkload
|
|
10
10
|
import dbworkload.utils.common
|
|
11
11
|
import dbworkload.utils.simplefaker
|
|
12
|
+
import gzip
|
|
12
13
|
import itertools
|
|
13
14
|
import logging
|
|
14
15
|
import numpy as np
|
|
15
16
|
import os
|
|
16
17
|
import pandas as pd
|
|
17
|
-
import pandas as pd
|
|
18
18
|
import plotext as plt
|
|
19
19
|
import plotly.graph_objects as go
|
|
20
20
|
import plotly.io as pio
|
|
21
|
+
import shutil
|
|
21
22
|
import sqlparse
|
|
22
23
|
import sys
|
|
23
24
|
import yaml
|
|
24
25
|
|
|
26
|
+
|
|
25
27
|
logger = logging.getLogger("dbworkload")
|
|
26
28
|
logger.setLevel(logging.INFO)
|
|
27
29
|
|
|
@@ -111,9 +113,13 @@ def util_yaml(input: str, output: str):
|
|
|
111
113
|
f.write(dbworkload.utils.common.ddl_to_yaml(ddl))
|
|
112
114
|
|
|
113
115
|
|
|
114
|
-
def
|
|
115
|
-
|
|
116
|
-
|
|
116
|
+
def util_merge_sort(input_dir: str, output_dir: str, csv_max_rows: int, compress: bool):
|
|
117
|
+
from operator import itemgetter
|
|
118
|
+
|
|
119
|
+
class MergeSort:
|
|
120
|
+
def __init__(
|
|
121
|
+
self, input_dir: str, output_dir: str, csv_max_rows: int, compress: bool
|
|
122
|
+
):
|
|
117
123
|
# input CSV files - it assumes files are already sorted
|
|
118
124
|
files = os.listdir(input_dir)
|
|
119
125
|
# Filtering only the files.
|
|
@@ -123,12 +129,18 @@ def util_merge(input_dir: str, output_dir: str, csv_max_rows: int):
|
|
|
123
129
|
if os.path.isfile(os.path.join(input_dir, f))
|
|
124
130
|
]
|
|
125
131
|
|
|
132
|
+
self.compress = ".gz" if compress else ""
|
|
133
|
+
self.file_extension = self.CSVs[0][-3:]
|
|
134
|
+
|
|
126
135
|
self.CSV_MAX_ROWS = csv_max_rows
|
|
127
136
|
self.COUNTER = 0
|
|
128
137
|
self.C = 0
|
|
129
138
|
|
|
139
|
+
# source holds the list of lines in each CSV file, marked by the idx number
|
|
140
|
+
# file_handlers holds the open file handler for each CSV file, marked by the idx number
|
|
130
141
|
self.source: dict[int, list] = {}
|
|
131
142
|
self.file_handlers: dict[int, TextIOWrapper] = {}
|
|
143
|
+
|
|
132
144
|
self.output: TextIOWrapper
|
|
133
145
|
if not output_dir:
|
|
134
146
|
self.output_dir = str(input_dir) + ".merged"
|
|
@@ -160,7 +172,7 @@ def util_merge(input_dir: str, output_dir: str, csv_max_rows: int):
|
|
|
160
172
|
self.source[idx].append(line)
|
|
161
173
|
else:
|
|
162
174
|
# reached end of file
|
|
163
|
-
logger.
|
|
175
|
+
logger.debug(
|
|
164
176
|
f"initial_fill: CSV file '{csv}' at source index {idx} reached EOF."
|
|
165
177
|
)
|
|
166
178
|
f.close()
|
|
@@ -179,23 +191,41 @@ def util_merge(input_dir: str, output_dir: str, csv_max_rows: int):
|
|
|
179
191
|
self.source[idx].append(line)
|
|
180
192
|
else:
|
|
181
193
|
# reached end of file
|
|
182
|
-
logger.
|
|
194
|
+
logger.debug(f"index {idx} reached EOF.")
|
|
183
195
|
f.close()
|
|
184
196
|
del self.file_handlers[idx]
|
|
185
197
|
except Exception as e:
|
|
186
198
|
logger.error("Excepton in replenish_queue: ", e)
|
|
187
199
|
|
|
200
|
+
def close_output(self):
|
|
201
|
+
self.output.close()
|
|
202
|
+
|
|
203
|
+
if self.compress:
|
|
204
|
+
with open(self.output.name, "rb") as f_in:
|
|
205
|
+
with gzip.open(f"{self.output.name}{self.compress}", "wb") as f_out:
|
|
206
|
+
shutil.copyfileobj(f_in, f_out)
|
|
207
|
+
os.remove(self.output.name)
|
|
208
|
+
|
|
209
|
+
logger.info(f"Saved {self.output_filename}{self.compress}")
|
|
210
|
+
|
|
211
|
+
def open_new_output(self):
|
|
212
|
+
self.output_filename = (
|
|
213
|
+
f"out_{str.zfill(str(self.COUNTER), 6)}.{self.file_extension}"
|
|
214
|
+
)
|
|
215
|
+
self.output = open(
|
|
216
|
+
os.path.join(self.output_dir, self.output_filename),
|
|
217
|
+
"+w",
|
|
218
|
+
)
|
|
219
|
+
|
|
188
220
|
def write_to_csv(self, v: str):
|
|
221
|
+
# create a new output file if the limit is reached
|
|
189
222
|
if self.C >= self.CSV_MAX_ROWS:
|
|
190
|
-
self.
|
|
223
|
+
self.close_output()
|
|
224
|
+
|
|
191
225
|
self.COUNTER += 1
|
|
192
226
|
self.C = 0
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
self.output_dir, f"out_{str.zfill(str(self.COUNTER), 3)}.csv"
|
|
196
|
-
),
|
|
197
|
-
"+w",
|
|
198
|
-
)
|
|
227
|
+
|
|
228
|
+
self.open_new_output()
|
|
199
229
|
|
|
200
230
|
self.output.write(v)
|
|
201
231
|
self.C += 1
|
|
@@ -209,56 +239,54 @@ def util_merge(input_dir: str, output_dir: str, csv_max_rows: int):
|
|
|
209
239
|
self.initial_fill(csv, idx)
|
|
210
240
|
|
|
211
241
|
# the source dict now has a key for every file and a list of the first values read
|
|
242
|
+
# the file_handler dict has a key for every file and a pointer to the open file handler
|
|
212
243
|
|
|
213
|
-
|
|
214
|
-
# pop the first value in each source to
|
|
215
|
-
# `
|
|
244
|
+
staging = []
|
|
245
|
+
# pop the first value in each source list to list `staging`
|
|
246
|
+
# `staging` will have the first values of all source CSV files
|
|
216
247
|
for k, v in self.source.items():
|
|
217
248
|
try:
|
|
218
|
-
|
|
249
|
+
staging.append((v.pop(0), k))
|
|
219
250
|
except IndexError as e:
|
|
220
251
|
pass
|
|
252
|
+
from pprint import pprint
|
|
221
253
|
|
|
222
254
|
first_k = None
|
|
223
255
|
first_v = None
|
|
224
|
-
self.
|
|
225
|
-
os.path.join(
|
|
226
|
-
self.output_dir, f"out_{str.zfill(str(self.COUNTER), 3)}.csv"
|
|
227
|
-
),
|
|
228
|
-
"+w",
|
|
229
|
-
)
|
|
256
|
+
self.open_new_output()
|
|
230
257
|
|
|
231
|
-
# sort list `
|
|
258
|
+
# sort list `staging`
|
|
232
259
|
# pop the first value (the smallest) in `first_v`
|
|
233
260
|
# make a note of the source of that value in `first_k`
|
|
234
261
|
# replenish the corresponding source
|
|
262
|
+
|
|
235
263
|
while True:
|
|
236
264
|
if first_k is not None:
|
|
237
265
|
try:
|
|
238
266
|
self.replenish_source_list(first_k)
|
|
239
|
-
|
|
267
|
+
staging.append((self.source[first_k].pop(0), first_k))
|
|
240
268
|
|
|
241
269
|
except IndexError as e:
|
|
242
270
|
# the source list is empty
|
|
243
|
-
logger.
|
|
271
|
+
logger.debug(f"source list {first_k} is now empty")
|
|
244
272
|
first_k = None
|
|
245
273
|
|
|
246
|
-
if
|
|
247
|
-
|
|
274
|
+
if staging:
|
|
275
|
+
staging.sort(key=itemgetter(0))
|
|
248
276
|
try:
|
|
249
|
-
first_v, first_k =
|
|
277
|
+
first_v, first_k = staging.pop(0)
|
|
250
278
|
self.write_to_csv(first_v)
|
|
251
279
|
except IndexError as e:
|
|
252
|
-
logger.
|
|
280
|
+
logger.warning("Exception in main: ", e)
|
|
253
281
|
self.output.close()
|
|
254
282
|
else:
|
|
255
283
|
break
|
|
256
284
|
|
|
257
|
-
self.
|
|
285
|
+
self.close_output()
|
|
258
286
|
|
|
259
287
|
logger.info("Completed")
|
|
260
288
|
|
|
261
|
-
|
|
289
|
+
MergeSort(input_dir, output_dir, csv_max_rows, compress).run()
|
|
262
290
|
|
|
263
291
|
|
|
264
292
|
def util_plot(input: PosixPath):
|
|
@@ -157,7 +157,7 @@ class Stats:
|
|
|
157
157
|
int(self.cumulative_counts[id].weight),
|
|
158
158
|
int(self.cumulative_counts[id].weight // elapsed),
|
|
159
159
|
int(td.weight),
|
|
160
|
-
int(td.weight //
|
|
160
|
+
int(td.weight // window_elapsed),
|
|
161
161
|
round(td.mean * 1000, 2),
|
|
162
162
|
] + [round(x * 1000, 2) for x in td.inverse_cdf(self.quantiles)]
|
|
163
163
|
|
|
@@ -7,6 +7,7 @@ import pandas as pd
|
|
|
7
7
|
import uuid
|
|
8
8
|
import random
|
|
9
9
|
import builtins
|
|
10
|
+
from .common import import_class_at_runtime
|
|
10
11
|
|
|
11
12
|
logger = logging.getLogger("dbworkload")
|
|
12
13
|
|
|
@@ -584,6 +585,9 @@ class SimpleFaker:
|
|
|
584
585
|
return [SimpleFaker.Bit(seed=s, **args) for s in seeds]
|
|
585
586
|
elif obj_type == "bytes":
|
|
586
587
|
return [SimpleFaker.Bytes(seed=s, **args) for s in seeds]
|
|
588
|
+
elif obj_type == "custom":
|
|
589
|
+
custom_gen = import_class_at_runtime(args.pop("path"))
|
|
590
|
+
return [custom_gen(seed=s, **args) for s in seeds]
|
|
587
591
|
else:
|
|
588
592
|
raise ValueError(
|
|
589
593
|
f"SimpleFaker type not implemented or recognized: '{obj_type}'"
|
|
@@ -610,47 +614,57 @@ class SimpleFaker:
|
|
|
610
614
|
separator (str): the field delimiter in the CSV file
|
|
611
615
|
compression (str): the compression format (gzip, zip, None..)
|
|
612
616
|
"""
|
|
617
|
+
|
|
618
|
+
def gen_to_csv(iters: int):
|
|
619
|
+
# create individual Series and then concat them together
|
|
620
|
+
df = pd.concat(
|
|
621
|
+
[pd.Series([next(gen) for _ in range(iters)]) for gen in generators],
|
|
622
|
+
axis=1,
|
|
623
|
+
keys=col_names,
|
|
624
|
+
)
|
|
625
|
+
|
|
626
|
+
# get a list of the columns that are not to be sorted by
|
|
627
|
+
remaining = list(set(col_names) - set(sort_by))
|
|
628
|
+
|
|
629
|
+
# create a dataframe by concatenating:
|
|
630
|
+
# 1 - the df subset with the sort_by columns sorted by the sort_by columns
|
|
631
|
+
# 2 - the df subset with the remaining columns
|
|
632
|
+
# finally order the columns by the original col_names
|
|
633
|
+
# then save to csv
|
|
634
|
+
pd.concat(
|
|
635
|
+
[
|
|
636
|
+
df[sort_by].sort_values(sort_by).reset_index(drop=True),
|
|
637
|
+
df[remaining],
|
|
638
|
+
],
|
|
639
|
+
axis=1,
|
|
640
|
+
)[col_names].to_csv(
|
|
641
|
+
basename + "_" + str(counter) + suffix,
|
|
642
|
+
quoting=csv.QUOTE_MINIMAL,
|
|
643
|
+
sep=separator,
|
|
644
|
+
header=False,
|
|
645
|
+
index=False,
|
|
646
|
+
compression=compression,
|
|
647
|
+
)
|
|
648
|
+
|
|
613
649
|
logger.debug("SimpleFaker worker created")
|
|
614
650
|
if iterations > self.csv_max_rows:
|
|
615
|
-
count =
|
|
651
|
+
count = iterations // self.csv_max_rows
|
|
616
652
|
rem = iterations % self.csv_max_rows
|
|
617
653
|
iterations = self.csv_max_rows
|
|
618
654
|
else:
|
|
619
655
|
count = 1
|
|
620
656
|
rem = 0
|
|
621
657
|
|
|
622
|
-
if separator == "\t"
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
suffix
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
suffix += ".zip"
|
|
631
|
-
elif compression == "bz2":
|
|
632
|
-
suffix += ".bz2"
|
|
633
|
-
elif compression == "xz":
|
|
634
|
-
suffix += ".xz"
|
|
635
|
-
|
|
636
|
-
for x in range(count):
|
|
658
|
+
suffix = ".tsv" if separator == "\t" else ".csv"
|
|
659
|
+
|
|
660
|
+
if compression:
|
|
661
|
+
suffix += "." + {
|
|
662
|
+
"gzip": "gz",
|
|
663
|
+
}.get(compression, compression)
|
|
664
|
+
|
|
665
|
+
for counter in range(count):
|
|
637
666
|
try:
|
|
638
|
-
|
|
639
|
-
[
|
|
640
|
-
row
|
|
641
|
-
for row in [
|
|
642
|
-
[next(x) for x in generators] for _ in range(iterations)
|
|
643
|
-
]
|
|
644
|
-
],
|
|
645
|
-
columns=col_names,
|
|
646
|
-
).sort_values(by=sort_by).to_csv(
|
|
647
|
-
basename + "_" + str(x) + suffix,
|
|
648
|
-
quoting=csv.QUOTE_MINIMAL,
|
|
649
|
-
sep=separator,
|
|
650
|
-
header=False,
|
|
651
|
-
index=False,
|
|
652
|
-
compression=compression,
|
|
653
|
-
)
|
|
667
|
+
gen_to_csv(iterations)
|
|
654
668
|
except csv.Error as e:
|
|
655
669
|
logger.error(e)
|
|
656
670
|
if e.args[0] == "need to escape, but no escapechar set":
|
|
@@ -658,20 +672,11 @@ class SimpleFaker:
|
|
|
658
672
|
f"You cannot use the selected delimiter '{separator}'. Consider using another char or the the tab key."
|
|
659
673
|
)
|
|
660
674
|
|
|
661
|
-
logger.debug(f"Saved file '{basename + '_' + str(
|
|
675
|
+
logger.debug(f"Saved file '{basename + '_' + str(counter) + suffix}'")
|
|
662
676
|
|
|
663
677
|
# remaining rows, if any
|
|
664
678
|
if rem > 0:
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
columns=col_names,
|
|
668
|
-
).sort_values(by=sort_by).to_csv(
|
|
669
|
-
basename + "_" + str(count) + suffix,
|
|
670
|
-
quoting=csv.QUOTE_MINIMAL,
|
|
671
|
-
sep=separator,
|
|
672
|
-
header=False,
|
|
673
|
-
index=False,
|
|
674
|
-
compression=compression,
|
|
675
|
-
)
|
|
679
|
+
counter = count
|
|
680
|
+
gen_to_csv(rem)
|
|
676
681
|
|
|
677
|
-
logger.debug(f"Saved file '{basename + '_' + str(
|
|
682
|
+
logger.debug(f"Saved file '{basename + '_' + str(counter) + suffix}'")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|