PyPI - dbworkload - Versions diffs - 0.6.2__tar.gz → 0.6.4__tar.gz - Mend

dbworkload 0.6.2.tar.gz → 0.6.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

{dbworkload-0.6.2 → dbworkload-0.6.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dbworkload
-Version: 0.6.2
+Version: 0.6.4
 Summary: Workload framework
 Home-page: https://dbworkload.github.io/dbworkload/
 License: GPLv3+

{dbworkload-0.6.2 → dbworkload-0.6.4}/dbworkload/cli/main.py RENAMED Viewed

@@ -297,5 +297,6 @@ def version_option(
 ) -> None:
     pass
 # this is only needed for mkdocs-click
 click_app = typer.main.get_command(app)

{dbworkload-0.6.2 → dbworkload-0.6.4}/dbworkload/cli/util.py RENAMED Viewed

@@ -134,12 +134,12 @@ def util_yaml(
 @app.command(
-    "merge",
+    "merge_sort",
     epilog=EPILOG,
     no_args_is_help=True,
-    help="Merge multiple sorted CSV files into 1+ files.",
+    help="Merge-Sort multiple sorted CSV files into 1+ files.",
 )
-def util_merge(
+def util_sort_merge(
     input: Optional[Path] = typer.Option(
         ...,
         "--input",
@@ -166,8 +166,14 @@ def util_merge(
         resolve_path=True,
     ),
     csv_max_rows: int = Param.CSVMaxRows,
+    compress: bool = typer.Option(
+        True,
+        "--no-compress",
+        show_default=False,
+        help="Do not gzip output files.",
+    ),
 ):
-    dbworkload.models.util.util_merge(input, output, csv_max_rows)
+    dbworkload.models.util.util_merge_sort(input, output, csv_max_rows, compress)
 @app.command(

{dbworkload-0.6.2 → dbworkload-0.6.4}/dbworkload/models/util.py RENAMED Viewed

@@ -5,23 +5,25 @@ from jinja2 import Environment, PackageLoader
 from pathlib import PosixPath
 from plotly.subplots import make_subplots
 from pytdigest import TDigest
-import dbworkload
 import datetime as dt
+import dbworkload
 import dbworkload.utils.common
 import dbworkload.utils.simplefaker
+import gzip
 import itertools
 import logging
 import numpy as np
 import os
 import pandas as pd
-import pandas as pd
 import plotext as plt
 import plotly.graph_objects as go
 import plotly.io as pio
+import shutil
 import sqlparse
 import sys
 import yaml
 logger = logging.getLogger("dbworkload")
 logger.setLevel(logging.INFO)
@@ -111,9 +113,13 @@ def util_yaml(input: str, output: str):
         f.write(dbworkload.utils.common.ddl_to_yaml(ddl))
-def util_merge(input_dir: str, output_dir: str, csv_max_rows: int):
-    class Merge:
-        def __init__(self, input_dir: str, output_dir: str, csv_max_rows: int):
+def util_merge_sort(input_dir: str, output_dir: str, csv_max_rows: int, compress: bool):
+    from operator import itemgetter
+    class MergeSort:
+        def __init__(
+            self, input_dir: str, output_dir: str, csv_max_rows: int, compress: bool
+        ):
             # input CSV files - it assumes files are already sorted
             files = os.listdir(input_dir)
             # Filtering only the files.
@@ -123,12 +129,18 @@ def util_merge(input_dir: str, output_dir: str, csv_max_rows: int):
                 if os.path.isfile(os.path.join(input_dir, f))
             ]
+            self.compress = ".gz" if compress else ""
+            self.file_extension = self.CSVs[0][-3:]
             self.CSV_MAX_ROWS = csv_max_rows
             self.COUNTER = 0
             self.C = 0
+            # source holds the list of lines in each CSV file, marked by the idx number
+            # file_handlers holds a the open file handler for each CSV file, marked by the idx number
             self.source: dict[int, list] = {}
             self.file_handlers: dict[int, TextIOWrapper] = {}
             self.output: TextIOWrapper
             if not output_dir:
                 self.output_dir = str(input_dir) + ".merged"
@@ -160,7 +172,7 @@ def util_merge(input_dir: str, output_dir: str, csv_max_rows: int):
                     self.source[idx].append(line)
                 else:
                     # reached end of file
-                    logger.info(
+                    logger.debug(
                         f"initial_fill: CSV file '{csv}' at source index {idx} reached EOF."
                     )
                     f.close()
@@ -179,23 +191,41 @@ def util_merge(input_dir: str, output_dir: str, csv_max_rows: int):
                     self.source[idx].append(line)
                 else:
                     # reached end of file
-                    logger.info(f"index {idx} reached EOF.")
+                    logger.debug(f"index {idx} reached EOF.")
                     f.close()
                     del self.file_handlers[idx]
             except Exception as e:
                 logger.error("Excepton in replenish_queue: ", e)
+        def close_output(self):
+            self.output.close()
+            if self.compress:
+                with open(self.output.name, "rb") as f_in:
+                    with gzip.open(f"{self.output.name}{self.compress}", "wb") as f_out:
+                        shutil.copyfileobj(f_in, f_out)
+                os.remove(self.output.name)
+            logger.info(f"Saved {self.output_filename}{self.compress}")
+        def open_new_output(self):
+            self.output_filename = (
+                f"out_{str.zfill(str(self.COUNTER), 6)}.{self.file_extension}"
+            )
+            self.output = open(
+                os.path.join(self.output_dir, self.output_filename),
+                "+w",
+            )
         def write_to_csv(self, v: str):
+            # create a new output file if the limit is reached
             if self.C >= self.CSV_MAX_ROWS:
-                self.output.close()
+                self.close_output()
                 self.COUNTER += 1
                 self.C = 0
-                self.output = open(
-                    os.path.join(
-                        self.output_dir, f"out_{str.zfill(str(self.COUNTER), 3)}.csv"
-                    ),
-                    "+w",
-                )
+                self.open_new_output()
             self.output.write(v)
             self.C += 1
@@ -209,56 +239,54 @@ def util_merge(input_dir: str, output_dir: str, csv_max_rows: int):
                 self.initial_fill(csv, idx)
             # the source dict now has a key for every file and a list of the first values read
+            # the file_handler dict has a key for every file and a pointer to the open file handler
-            l = []
-            # pop the first value in each source to a list `l`
-            # `l` will have the first values of all source CSV files
+            staging = []
+            # pop the first value in each source list to list `staging`
+            # `staging` will have the first values of all source CSV files
             for k, v in self.source.items():
                 try:
-                    l.append((v.pop(0), k))
+                    staging.append((v.pop(0), k))
                 except IndexError as e:
                     pass
+            from pprint import pprint
             first_k = None
             first_v = None
-            self.output = open(
-                os.path.join(
-                    self.output_dir, f"out_{str.zfill(str(self.COUNTER), 3)}.csv"
-                ),
-                "+w",
-            )
+            self.open_new_output()
-            # sort list `l`
+            # sort list `staging`
             # pop the first value (the smallest) in `first_v`
             # make a note of the source of that value in `first_k`
             # replenish the corrisponding source
             while True:
                 if first_k is not None:
                     try:
                         self.replenish_source_list(first_k)
-                        l.append((self.source[first_k].pop(0), first_k))
+                        staging.append((self.source[first_k].pop(0), first_k))
                     except IndexError as e:
                         # the source list is empty
-                        logger.info(f"source list {first_k} is now empty")
+                        logger.debug(f"source list {first_k} is now empty")
                         first_k = None
-                if l:
-                    l.sort(key=lambda x: x[0])
+                if staging:
+                    staging.sort(key=itemgetter(0))
                     try:
-                        first_v, first_k = l.pop(0)
+                        first_v, first_k = staging.pop(0)
                         self.write_to_csv(first_v)
                     except IndexError as e:
-                        logger.info("Exception in main: ", e)
+                        logger.warning("Exception in main: ", e)
                         self.output.close()
                 else:
                     break
-            self.output.close()
+            self.close_output()
             logger.info("Completed")
-    Merge(input_dir, output_dir, csv_max_rows).run()
+    MergeSort(input_dir, output_dir, csv_max_rows, compress).run()
 def util_plot(input: PosixPath):

{dbworkload-0.6.2 → dbworkload-0.6.4}/dbworkload/utils/common.py RENAMED Viewed

@@ -157,7 +157,7 @@ class Stats:
                 int(self.cumulative_counts[id].weight),
                 int(self.cumulative_counts[id].weight // elapsed),
                 int(td.weight),
-                int(td.weight // (endtime - window_elapsed)),
+                int(td.weight // window_elapsed),
                 round(td.mean * 1000, 2),
             ] + [round(x * 1000, 2) for x in td.inverse_cdf(self.quantiles)]

{dbworkload-0.6.2 → dbworkload-0.6.4}/dbworkload/utils/simplefaker.py RENAMED Viewed

@@ -7,6 +7,7 @@ import pandas as pd
 import uuid
 import random
 import builtins
+from .common import import_class_at_runtime
 logger = logging.getLogger("dbworkload")
@@ -584,6 +585,9 @@ class SimpleFaker:
             return [SimpleFaker.Bit(seed=s, **args) for s in seeds]
         elif obj_type == "bytes":
             return [SimpleFaker.Bytes(seed=s, **args) for s in seeds]
+        elif obj_type == "custom":
+            custom_gen = import_class_at_runtime(args.pop("path"))
+            return [custom_gen(seed=s, **args) for s in seeds]
         else:
             raise ValueError(
                 f"SimpleFaker type not implemented or recognized: '{obj_type}'"
@@ -610,47 +614,57 @@ class SimpleFaker:
             separator (str): the field delimiter in the CSV file
             compression (str): the compression format (gzip, zip, None..)
         """
+        def gen_to_csv(iters: int):
+            # create individual Series and then concat them together
+            df = pd.concat(
+                [pd.Series([next(gen) for _ in range(iters)]) for gen in generators],
+                axis=1,
+                keys=col_names,
+            )
+            # get a list of the colums that are not to be sorted by
+            remaining = list(set(col_names) - set(sort_by))
+            # create a dataframe by concatenating:
+            # 1 - the df subset with the sort_by columns sorted by the sort_by columns
+            # 2 - the df subset with the remaining columns
+            # finally order the columns by the original col_names
+            # then save to csv
+            pd.concat(
+                [
+                    df[sort_by].sort_values(sort_by).reset_index(drop=True),
+                    df[remaining],
+                ],
+                axis=1,
+            )[col_names].to_csv(
+                basename + "_" + str(counter) + suffix,
+                quoting=csv.QUOTE_MINIMAL,
+                sep=separator,
+                header=False,
+                index=False,
+                compression=compression,
+            )
         logger.debug("SimpleFaker worker created")
         if iterations > self.csv_max_rows:
-            count = int(iterations / self.csv_max_rows)
+            count = iterations // self.csv_max_rows
             rem = iterations % self.csv_max_rows
             iterations = self.csv_max_rows
         else:
             count = 1
             rem = 0
-        if separator == "\t":
-            suffix = ".tsv"
-        else:
-            suffix = ".csv"
-        if compression == "gzip":
-            suffix += ".gz"
-        elif compression == "zip":
-            suffix += ".zip"
-        elif compression == "bz2":
-            suffix += ".bz2"
-        elif compression == "xz":
-            suffix += ".xz"
-        for x in range(count):
+        suffix = ".tsv" if separator == "\t" else ".csv"
+        if compression:
+            suffix += "." + {
+                "gzip": "gz",
+            }.get(compression, compression)
+        for counter in range(count):
             try:
-                pd.DataFrame(
-                    [
-                        row
-                        for row in [
-                            [next(x) for x in generators] for _ in range(iterations)
-                        ]
-                    ],
-                    columns=col_names,
-                ).sort_values(by=sort_by).to_csv(
-                    basename + "_" + str(x) + suffix,
-                    quoting=csv.QUOTE_MINIMAL,
-                    sep=separator,
-                    header=False,
-                    index=False,
-                    compression=compression,
-                )
+                gen_to_csv(iterations)
             except csv.Error as e:
                 logger.error(e)
                 if e.args[0] == "need to escape, but no escapechar set":
@@ -658,20 +672,11 @@ class SimpleFaker:
                         f"You cannot use the selected delimiter '{separator}'. Consider using another char or the the tab key."
                     )
-            logger.debug(f"Saved file '{basename + '_' + str(x) + suffix}'")
+            logger.debug(f"Saved file '{basename + '_' + str(counter) + suffix}'")
         # remaining rows, if any
         if rem > 0:
-            pd.DataFrame(
-                [row for row in [[next(x) for x in generators] for _ in range(rem)]],
-                columns=col_names,
-            ).sort_values(by=sort_by).to_csv(
-                basename + "_" + str(count) + suffix,
-                quoting=csv.QUOTE_MINIMAL,
-                sep=separator,
-                header=False,
-                index=False,
-                compression=compression,
-            )
+            counter = count
+            gen_to_csv(rem)
-            logger.debug(f"Saved file '{basename + '_' + str(x) + suffix}'")
+            logger.debug(f"Saved file '{basename + '_' + str(counter) + suffix}'")

{dbworkload-0.6.2 → dbworkload-0.6.4}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dbworkload"
-version = "0.6.2"
+version = "0.6.4"
 description = "Workload framework"
 authors = ["Fabio Ghirardello"]
 license = "GPLv3+"

{dbworkload-0.6.2 → dbworkload-0.6.4}/LICENSE RENAMED Viewed

File without changes

{dbworkload-0.6.2 → dbworkload-0.6.4}/README.md RENAMED Viewed

File without changes

{dbworkload-0.6.2 → dbworkload-0.6.4}/dbworkload/__init__.py RENAMED Viewed

File without changes

{dbworkload-0.6.2 → dbworkload-0.6.4}/dbworkload/cli/dep.py RENAMED Viewed

File without changes

{dbworkload-0.6.2 → dbworkload-0.6.4}/dbworkload/models/run.py RENAMED Viewed

File without changes

{dbworkload-0.6.2 → dbworkload-0.6.4}/dbworkload/templates/stub.j2 RENAMED Viewed

File without changes

dbworkload 0.6.2__tar.gz → 0.6.4__tar.gz

dbworkload 0.6.2.tar.gz → 0.6.4.tar.gz