dbworkload 0.6.2__tar.gz → 0.6.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dbworkload
3
- Version: 0.6.2
3
+ Version: 0.6.4
4
4
  Summary: Workload framework
5
5
  Home-page: https://dbworkload.github.io/dbworkload/
6
6
  License: GPLv3+
@@ -297,5 +297,6 @@ def version_option(
297
297
  ) -> None:
298
298
  pass
299
299
 
300
+
300
301
  # this is only needed for mkdocs-click
301
302
  click_app = typer.main.get_command(app)
@@ -134,12 +134,12 @@ def util_yaml(
134
134
 
135
135
 
136
136
  @app.command(
137
- "merge",
137
+ "merge_sort",
138
138
  epilog=EPILOG,
139
139
  no_args_is_help=True,
140
- help="Merge multiple sorted CSV files into 1+ files.",
140
+ help="Merge-Sort multiple sorted CSV files into 1+ files.",
141
141
  )
142
- def util_merge(
142
+ def util_sort_merge(
143
143
  input: Optional[Path] = typer.Option(
144
144
  ...,
145
145
  "--input",
@@ -166,8 +166,14 @@ def util_merge(
166
166
  resolve_path=True,
167
167
  ),
168
168
  csv_max_rows: int = Param.CSVMaxRows,
169
+ compress: bool = typer.Option(
170
+ True,
171
+ "--no-compress",
172
+ show_default=False,
173
+ help="Do not gzip output files.",
174
+ ),
169
175
  ):
170
- dbworkload.models.util.util_merge(input, output, csv_max_rows)
176
+ dbworkload.models.util.util_merge_sort(input, output, csv_max_rows, compress)
171
177
 
172
178
 
173
179
  @app.command(
@@ -5,23 +5,25 @@ from jinja2 import Environment, PackageLoader
5
5
  from pathlib import PosixPath
6
6
  from plotly.subplots import make_subplots
7
7
  from pytdigest import TDigest
8
- import dbworkload
9
8
  import datetime as dt
9
+ import dbworkload
10
10
  import dbworkload.utils.common
11
11
  import dbworkload.utils.simplefaker
12
+ import gzip
12
13
  import itertools
13
14
  import logging
14
15
  import numpy as np
15
16
  import os
16
17
  import pandas as pd
17
- import pandas as pd
18
18
  import plotext as plt
19
19
  import plotly.graph_objects as go
20
20
  import plotly.io as pio
21
+ import shutil
21
22
  import sqlparse
22
23
  import sys
23
24
  import yaml
24
25
 
26
+
25
27
  logger = logging.getLogger("dbworkload")
26
28
  logger.setLevel(logging.INFO)
27
29
 
@@ -111,9 +113,13 @@ def util_yaml(input: str, output: str):
111
113
  f.write(dbworkload.utils.common.ddl_to_yaml(ddl))
112
114
 
113
115
 
114
- def util_merge(input_dir: str, output_dir: str, csv_max_rows: int):
115
- class Merge:
116
- def __init__(self, input_dir: str, output_dir: str, csv_max_rows: int):
116
+ def util_merge_sort(input_dir: str, output_dir: str, csv_max_rows: int, compress: bool):
117
+ from operator import itemgetter
118
+
119
+ class MergeSort:
120
+ def __init__(
121
+ self, input_dir: str, output_dir: str, csv_max_rows: int, compress: bool
122
+ ):
117
123
  # input CSV files - it assumes files are already sorted
118
124
  files = os.listdir(input_dir)
119
125
  # Filtering only the files.
@@ -123,12 +129,18 @@ def util_merge(input_dir: str, output_dir: str, csv_max_rows: int):
123
129
  if os.path.isfile(os.path.join(input_dir, f))
124
130
  ]
125
131
 
132
+ self.compress = ".gz" if compress else ""
133
+ self.file_extension = self.CSVs[0][-3:]
134
+
126
135
  self.CSV_MAX_ROWS = csv_max_rows
127
136
  self.COUNTER = 0
128
137
  self.C = 0
129
138
 
139
+ # source holds the list of lines in each CSV file, marked by the idx number
140
+ # file_handlers holds the open file handler for each CSV file, marked by the idx number
130
141
  self.source: dict[int, list] = {}
131
142
  self.file_handlers: dict[int, TextIOWrapper] = {}
143
+
132
144
  self.output: TextIOWrapper
133
145
  if not output_dir:
134
146
  self.output_dir = str(input_dir) + ".merged"
@@ -160,7 +172,7 @@ def util_merge(input_dir: str, output_dir: str, csv_max_rows: int):
160
172
  self.source[idx].append(line)
161
173
  else:
162
174
  # reached end of file
163
- logger.info(
175
+ logger.debug(
164
176
  f"initial_fill: CSV file '{csv}' at source index {idx} reached EOF."
165
177
  )
166
178
  f.close()
@@ -179,23 +191,41 @@ def util_merge(input_dir: str, output_dir: str, csv_max_rows: int):
179
191
  self.source[idx].append(line)
180
192
  else:
181
193
  # reached end of file
182
- logger.info(f"index {idx} reached EOF.")
194
+ logger.debug(f"index {idx} reached EOF.")
183
195
  f.close()
184
196
  del self.file_handlers[idx]
185
197
  except Exception as e:
186
198
  logger.error("Excepton in replenish_queue: ", e)
187
199
 
200
+ def close_output(self):
201
+ self.output.close()
202
+
203
+ if self.compress:
204
+ with open(self.output.name, "rb") as f_in:
205
+ with gzip.open(f"{self.output.name}{self.compress}", "wb") as f_out:
206
+ shutil.copyfileobj(f_in, f_out)
207
+ os.remove(self.output.name)
208
+
209
+ logger.info(f"Saved {self.output_filename}{self.compress}")
210
+
211
+ def open_new_output(self):
212
+ self.output_filename = (
213
+ f"out_{str.zfill(str(self.COUNTER), 6)}.{self.file_extension}"
214
+ )
215
+ self.output = open(
216
+ os.path.join(self.output_dir, self.output_filename),
217
+ "+w",
218
+ )
219
+
188
220
  def write_to_csv(self, v: str):
221
+ # create a new output file if the limit is reached
189
222
  if self.C >= self.CSV_MAX_ROWS:
190
- self.output.close()
223
+ self.close_output()
224
+
191
225
  self.COUNTER += 1
192
226
  self.C = 0
193
- self.output = open(
194
- os.path.join(
195
- self.output_dir, f"out_{str.zfill(str(self.COUNTER), 3)}.csv"
196
- ),
197
- "+w",
198
- )
227
+
228
+ self.open_new_output()
199
229
 
200
230
  self.output.write(v)
201
231
  self.C += 1
@@ -209,56 +239,54 @@ def util_merge(input_dir: str, output_dir: str, csv_max_rows: int):
209
239
  self.initial_fill(csv, idx)
210
240
 
211
241
  # the source dict now has a key for every file and a list of the first values read
242
+ # the file_handler dict has a key for every file and a pointer to the open file handler
212
243
 
213
- l = []
214
- # pop the first value in each source to a list `l`
215
- # `l` will have the first values of all source CSV files
244
+ staging = []
245
+ # pop the first value in each source list to list `staging`
246
+ # `staging` will have the first values of all source CSV files
216
247
  for k, v in self.source.items():
217
248
  try:
218
- l.append((v.pop(0), k))
249
+ staging.append((v.pop(0), k))
219
250
  except IndexError as e:
220
251
  pass
252
+ from pprint import pprint
221
253
 
222
254
  first_k = None
223
255
  first_v = None
224
- self.output = open(
225
- os.path.join(
226
- self.output_dir, f"out_{str.zfill(str(self.COUNTER), 3)}.csv"
227
- ),
228
- "+w",
229
- )
256
+ self.open_new_output()
230
257
 
231
- # sort list `l`
258
+ # sort list `staging`
232
259
  # pop the first value (the smallest) in `first_v`
233
260
  # make a note of the source of that value in `first_k`
234
261
  # replenish the corresponding source
262
+
235
263
  while True:
236
264
  if first_k is not None:
237
265
  try:
238
266
  self.replenish_source_list(first_k)
239
- l.append((self.source[first_k].pop(0), first_k))
267
+ staging.append((self.source[first_k].pop(0), first_k))
240
268
 
241
269
  except IndexError as e:
242
270
  # the source list is empty
243
- logger.info(f"source list {first_k} is now empty")
271
+ logger.debug(f"source list {first_k} is now empty")
244
272
  first_k = None
245
273
 
246
- if l:
247
- l.sort(key=lambda x: x[0])
274
+ if staging:
275
+ staging.sort(key=itemgetter(0))
248
276
  try:
249
- first_v, first_k = l.pop(0)
277
+ first_v, first_k = staging.pop(0)
250
278
  self.write_to_csv(first_v)
251
279
  except IndexError as e:
252
- logger.info("Exception in main: ", e)
280
+ logger.warning("Exception in main: ", e)
253
281
  self.output.close()
254
282
  else:
255
283
  break
256
284
 
257
- self.output.close()
285
+ self.close_output()
258
286
 
259
287
  logger.info("Completed")
260
288
 
261
- Merge(input_dir, output_dir, csv_max_rows).run()
289
+ MergeSort(input_dir, output_dir, csv_max_rows, compress).run()
262
290
 
263
291
 
264
292
  def util_plot(input: PosixPath):
@@ -157,7 +157,7 @@ class Stats:
157
157
  int(self.cumulative_counts[id].weight),
158
158
  int(self.cumulative_counts[id].weight // elapsed),
159
159
  int(td.weight),
160
- int(td.weight // (endtime - window_elapsed)),
160
+ int(td.weight // window_elapsed),
161
161
  round(td.mean * 1000, 2),
162
162
  ] + [round(x * 1000, 2) for x in td.inverse_cdf(self.quantiles)]
163
163
 
@@ -7,6 +7,7 @@ import pandas as pd
7
7
  import uuid
8
8
  import random
9
9
  import builtins
10
+ from .common import import_class_at_runtime
10
11
 
11
12
  logger = logging.getLogger("dbworkload")
12
13
 
@@ -584,6 +585,9 @@ class SimpleFaker:
584
585
  return [SimpleFaker.Bit(seed=s, **args) for s in seeds]
585
586
  elif obj_type == "bytes":
586
587
  return [SimpleFaker.Bytes(seed=s, **args) for s in seeds]
588
+ elif obj_type == "custom":
589
+ custom_gen = import_class_at_runtime(args.pop("path"))
590
+ return [custom_gen(seed=s, **args) for s in seeds]
587
591
  else:
588
592
  raise ValueError(
589
593
  f"SimpleFaker type not implemented or recognized: '{obj_type}'"
@@ -610,47 +614,57 @@ class SimpleFaker:
610
614
  separator (str): the field delimiter in the CSV file
611
615
  compression (str): the compression format (gzip, zip, None..)
612
616
  """
617
+
618
+ def gen_to_csv(iters: int):
619
+ # create individual Series and then concat them together
620
+ df = pd.concat(
621
+ [pd.Series([next(gen) for _ in range(iters)]) for gen in generators],
622
+ axis=1,
623
+ keys=col_names,
624
+ )
625
+
626
+ # get a list of the columns that are not to be sorted by
627
+ remaining = list(set(col_names) - set(sort_by))
628
+
629
+ # create a dataframe by concatenating:
630
+ # 1 - the df subset with the sort_by columns sorted by the sort_by columns
631
+ # 2 - the df subset with the remaining columns
632
+ # finally order the columns by the original col_names
633
+ # then save to csv
634
+ pd.concat(
635
+ [
636
+ df[sort_by].sort_values(sort_by).reset_index(drop=True),
637
+ df[remaining],
638
+ ],
639
+ axis=1,
640
+ )[col_names].to_csv(
641
+ basename + "_" + str(counter) + suffix,
642
+ quoting=csv.QUOTE_MINIMAL,
643
+ sep=separator,
644
+ header=False,
645
+ index=False,
646
+ compression=compression,
647
+ )
648
+
613
649
  logger.debug("SimpleFaker worker created")
614
650
  if iterations > self.csv_max_rows:
615
- count = int(iterations / self.csv_max_rows)
651
+ count = iterations // self.csv_max_rows
616
652
  rem = iterations % self.csv_max_rows
617
653
  iterations = self.csv_max_rows
618
654
  else:
619
655
  count = 1
620
656
  rem = 0
621
657
 
622
- if separator == "\t":
623
- suffix = ".tsv"
624
- else:
625
- suffix = ".csv"
626
-
627
- if compression == "gzip":
628
- suffix += ".gz"
629
- elif compression == "zip":
630
- suffix += ".zip"
631
- elif compression == "bz2":
632
- suffix += ".bz2"
633
- elif compression == "xz":
634
- suffix += ".xz"
635
-
636
- for x in range(count):
658
+ suffix = ".tsv" if separator == "\t" else ".csv"
659
+
660
+ if compression:
661
+ suffix += "." + {
662
+ "gzip": "gz",
663
+ }.get(compression, compression)
664
+
665
+ for counter in range(count):
637
666
  try:
638
- pd.DataFrame(
639
- [
640
- row
641
- for row in [
642
- [next(x) for x in generators] for _ in range(iterations)
643
- ]
644
- ],
645
- columns=col_names,
646
- ).sort_values(by=sort_by).to_csv(
647
- basename + "_" + str(x) + suffix,
648
- quoting=csv.QUOTE_MINIMAL,
649
- sep=separator,
650
- header=False,
651
- index=False,
652
- compression=compression,
653
- )
667
+ gen_to_csv(iterations)
654
668
  except csv.Error as e:
655
669
  logger.error(e)
656
670
  if e.args[0] == "need to escape, but no escapechar set":
@@ -658,20 +672,11 @@ class SimpleFaker:
658
672
  f"You cannot use the selected delimiter '{separator}'. Consider using another char or the the tab key."
659
673
  )
660
674
 
661
- logger.debug(f"Saved file '{basename + '_' + str(x) + suffix}'")
675
+ logger.debug(f"Saved file '{basename + '_' + str(counter) + suffix}'")
662
676
 
663
677
  # remaining rows, if any
664
678
  if rem > 0:
665
- pd.DataFrame(
666
- [row for row in [[next(x) for x in generators] for _ in range(rem)]],
667
- columns=col_names,
668
- ).sort_values(by=sort_by).to_csv(
669
- basename + "_" + str(count) + suffix,
670
- quoting=csv.QUOTE_MINIMAL,
671
- sep=separator,
672
- header=False,
673
- index=False,
674
- compression=compression,
675
- )
679
+ counter = count
680
+ gen_to_csv(rem)
676
681
 
677
- logger.debug(f"Saved file '{basename + '_' + str(x) + suffix}'")
682
+ logger.debug(f"Saved file '{basename + '_' + str(counter) + suffix}'")
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "dbworkload"
3
- version = "0.6.2"
3
+ version = "0.6.4"
4
4
  description = "Workload framework"
5
5
  authors = ["Fabio Ghirardello"]
6
6
  license = "GPLv3+"
File without changes
File without changes