dbworkload 0.11.0__tar.gz → 0.11.1.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dbworkload
3
- Version: 0.11.0
3
+ Version: 0.11.1.dev1
4
4
  Summary: Workload framework
5
5
  License: GPLv3+
6
6
  License-File: LICENSE
7
7
  Author: Fabio Ghirardello
8
- Requires-Python: >=3.11,<4.0
8
+ Requires-Python: >=3.11,<4
9
9
  Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
10
10
  Classifier: License :: Other/Proprietary License
11
11
  Classifier: Operating System :: OS Independent
@@ -26,6 +26,7 @@ Provides-Extra: pinecone
26
26
  Provides-Extra: postgres
27
27
  Provides-Extra: spanner
28
28
  Requires-Dist: cassandra-driver ; extra == "all" or extra == "cassandra"
29
+ Requires-Dist: fastdigest (>=0.12.0,<0.13.0)
29
30
  Requires-Dist: fastembed (>=0.7.3,<0.8.0) ; extra == "convert"
30
31
  Requires-Dist: google-cloud-spanner ; extra == "all" or extra == "spanner"
31
32
  Requires-Dist: jinja2
@@ -49,7 +50,6 @@ Requires-Dist: psycopg ; extra == "all" or extra == "postgres"
49
50
  Requires-Dist: psycopg-binary ; extra == "all" or extra == "postgres"
50
51
  Requires-Dist: pymongo ; extra == "all" or extra == "mongo"
51
52
  Requires-Dist: pyodbc ; extra == "all" or extra == "odbc"
52
- Requires-Dist: pytdigest
53
53
  Requires-Dist: pyyaml
54
54
  Requires-Dist: sentence-transformers ; extra == "pinecone"
55
55
  Requires-Dist: sqlparse
@@ -19,11 +19,11 @@ import sqlparse
19
19
  import yaml
20
20
  from jinja2 import Environment, PackageLoader
21
21
  from plotly.subplots import make_subplots
22
- from pytdigest import TDigest
23
22
 
24
23
  import dbworkload
25
24
  import dbworkload.utils.common
26
25
  import dbworkload.utils.simplefaker
26
+ import dbworkload.utils.tdigest as tdigest
27
27
 
28
28
  logger = logging.getLogger("dbworkload")
29
29
  logger.setLevel(logging.INFO)
@@ -506,12 +506,10 @@ def util_merge_csvs(input_dir: str):
506
506
  """
507
507
  combine centroids of multiple TDigests together,
508
508
  and return the new aggregated centroids.
509
- Note: compression=1000
509
+ Note: max_centroids=1000
510
510
  """
511
- return (
512
- TDigest(compression=1000)
513
- .combine([TDigest.of_centroids(y, compression=1000) for y in x])
514
- .get_centroids()
511
+ return tdigest.centroids(
512
+ tdigest.combine(tdigest.from_centroids(y) for y in x)
515
513
  )
516
514
 
517
515
  # for each elapsed range bucket, merge the data for all `id` together
@@ -520,9 +518,9 @@ def util_merge_csvs(input_dir: str):
520
518
  {"ts": min, "threads": sum, "centroids": combine_centroids}
521
519
  )
522
520
 
523
- # the weight of the TDigest represents the count of ops
521
+ # the mass of the TDigest represents the count of ops
524
522
  df["period_ops"] = df["centroids"].map(
525
- lambda x: TDigest(compression=1000).of_centroids(x, compression=1000).weight
523
+ lambda x: tdigest.count(tdigest.from_centroids(x))
526
524
  )
527
525
 
528
526
  df["period_ops_s"] = df["period_ops"].apply(lambda x: x // 10)
@@ -536,17 +534,21 @@ def util_merge_csvs(input_dir: str):
536
534
 
537
535
  # calculate mean and quantiles and convert from seconds to millis
538
536
  df["mean_ms"] = df["centroids"].map(
539
- lambda x: TDigest(compression=1000).of_centroids(x, compression=1000).mean
540
- * 1000
537
+ lambda x: tdigest.from_centroids(x).mean() * 1000
541
538
  )
542
- df[["p50_ms", "p90_ms", "p95_ms", "p99_ms", "max_ms"]] = [
543
- x * 1000
544
- for x in df["centroids"].map(
545
- lambda x: TDigest(compression=1000)
546
- .of_centroids(x, compression=1000)
547
- .inverse_cdf([0.50, 0.90, 0.95, 0.99, 1.00])
539
+ df[["p50_ms", "p90_ms", "p95_ms", "p99_ms", "max_ms"]] = (
540
+ pd.DataFrame(
541
+ df["centroids"]
542
+ .map(
543
+ lambda x: tdigest.from_centroids(x).quantile_vec(
544
+ [0.50, 0.90, 0.95, 0.99, 1.00]
545
+ )
546
+ )
547
+ .tolist(),
548
+ index=df.index,
548
549
  )
549
- ]
550
+ * 1000
551
+ )
550
552
 
551
553
  # round all values to 2 decimals
552
554
  df[["mean_ms", "p50_ms", "p90_ms", "p95_ms", "p99_ms", "max_ms"]] = df[
@@ -13,7 +13,9 @@ import prometheus_client as prom
13
13
  import yaml
14
14
  from prometheus_client.core import REGISTRY, HistogramMetricFamily
15
15
  from prometheus_client.registry import Collector
16
- from pytdigest import TDigest
16
+
17
+ import dbworkload.utils.tdigest as tdigest
18
+ from fastdigest import TDigest
17
19
 
18
20
  RESERVED_WORDS = [
19
21
  "unique",
@@ -77,9 +79,7 @@ class Stats:
77
79
  for x in l:
78
80
  self.cumulative_counts.setdefault(x[0], TDigest())
79
81
  self.window_stats.setdefault(x[0], [])
80
- self.window_stats[x[0]].append(
81
- TDigest(compression=1000).of_centroids(x[1], compression=1000)
82
- )
82
+ self.window_stats[x[0]].append(tdigest.from_centroids(x[1]))
83
83
 
84
84
  # calculate the current stats this instance has collected.
85
85
  def calculate_stats(self, active_connections: int, endtime: int) -> list:
@@ -97,23 +97,21 @@ class Stats:
97
97
  )
98
98
 
99
99
  def get_stats_row(id: str):
100
- td = TDigest(compression=1000).combine(self.window_stats[id])
100
+ td = tdigest.combine(self.window_stats[id])
101
101
 
102
- self.window_stats_centroids[id] = td.get_centroids()
102
+ self.window_stats_centroids[id] = tdigest.centroids(td)
103
103
 
104
- self.cumulative_counts[id] = TDigest(compression=1000).combine(
105
- self.cumulative_counts[id], td
106
- )
104
+ self.cumulative_counts[id] = self.cumulative_counts[id].merge(td)
107
105
  return [
108
106
  elapsed,
109
107
  id,
110
108
  active_connections,
111
- int(self.cumulative_counts[id].weight),
112
- int(self.cumulative_counts[id].weight // elapsed),
113
- int(td.weight),
114
- int(td.weight // window_elapsed),
115
- round(td.mean * 1000, 2),
116
- ] + [round(x * 1000, 2) for x in td.inverse_cdf(self.quantiles)]
109
+ tdigest.count(self.cumulative_counts[id]),
110
+ tdigest.count(self.cumulative_counts[id]) // elapsed,
111
+ tdigest.count(td),
112
+ tdigest.count(td) // window_elapsed,
113
+ round(td.mean() * 1000, 2),
114
+ ] + [round(x * 1000, 2) for x in td.quantile_vec(self.quantiles)]
117
115
 
118
116
  return [get_stats_row(id) for id in sorted(list(self.window_stats.keys()))]
119
117
 
@@ -129,12 +127,12 @@ class Stats:
129
127
  elapsed,
130
128
  id,
131
129
  active_connections,
132
- int(self.cumulative_counts[id].weight),
133
- int(self.cumulative_counts[id].weight // elapsed),
134
- round(self.cumulative_counts[id].mean * 1000, 2),
130
+ tdigest.count(self.cumulative_counts[id]),
131
+ tdigest.count(self.cumulative_counts[id]) // elapsed,
132
+ round(self.cumulative_counts[id].mean() * 1000, 2),
135
133
  ] + [
136
134
  round(x * 1000, 2)
137
- for x in self.cumulative_counts[id].inverse_cdf(self.quantiles)
135
+ for x in self.cumulative_counts[id].quantile_vec(self.quantiles)
138
136
  ]
139
137
 
140
138
  return [get_stats_row(id) for id in sorted(list(self.window_stats.keys()))]
@@ -165,7 +163,7 @@ class WorkerStats:
165
163
 
166
164
  def get_tdigest_ndarray(self):
167
165
  return [
168
- (id, TDigest.compute(np.array(l), compression=1000).get_centroids())
166
+ (id, tdigest.centroids(tdigest.from_values(l)))
169
167
  for id, l in self.window_stats.items()
170
168
  ]
171
169
 
@@ -182,10 +180,13 @@ class CustomHistogram(Collector):
182
180
  return [["+Inf", 0]]
183
181
 
184
182
  # create buckets from 10 ... 180
185
- td_hist = [[x, int(td.cdf((int(x) + 1) / 1000) * td.weight)] for x in self.bins]
186
- td_hist.append(["+Inf", td.weight])
183
+ td_count = tdigest.count(td)
184
+ td_hist = [
185
+ [x, int(td.cdf((int(x) + 1) / 1000) * td_count)] for x in self.bins
186
+ ]
187
+ td_hist.append(["+Inf", td_count])
187
188
 
188
- return td.mean * 1000 * td.weight, td_hist
189
+ return td.mean() * 1000 * td_count, td_hist
189
190
 
190
191
  def collect(self):
191
192
  sum_value, buckets = self.get_buckets(self.name)
@@ -0,0 +1,34 @@
1
+ #!/usr/bin/python
2
+
3
+ import numpy as np
4
+ from fastdigest import TDigest, merge_all
5
+
6
+ MAX_CENTROIDS = 1000
7
+
8
+
9
def from_values(values) -> TDigest:
    """Build a TDigest from a collection of raw observations.

    The input is first coerced to a float ndarray; the digest is created
    with its centroid limit set to MAX_CENTROIDS.
    """
    samples = np.asarray(values, dtype=float)
    return TDigest.from_values(samples, max_centroids=MAX_CENTROIDS)
13
+
14
+
15
def from_centroids(centroids) -> TDigest:
    """Rebuild a TDigest from previously exported centroids.

    Args:
        centroids: array-like of centroid pairs. Either a 2-D sequence with
            one pair per row, or a flat 1-D sequence of alternating values
            (pair 1 first element, pair 1 second element, pair 2 ...).
            May be empty.

    Returns:
        A TDigest whose centroid limit is MAX_CENTROIDS. When no centroids
        are supplied, a freshly constructed TDigest is returned.

    Raises:
        ValueError: if a flat (or scalar) input cannot be split into pairs,
            i.e. it holds an odd number of elements.
    """
    arr = np.asarray(centroids, dtype=float)

    if arr.size == 0:
        return TDigest(MAX_CENTROIDS)

    if arr.ndim < 2:
        # np.atleast_2d would turn a flat [a1, b1, a2, b2, ...] into ONE row,
        # and the column slicing below would then silently keep only the
        # first pair. Fold the flat values into two-column rows instead, so
        # every pair is retained (reshape raises ValueError on odd lengths).
        arr = arr.reshape(-1, 2)

    # Column 0 is passed as the values and column 1 as the second positional
    # argument of TDigest.from_values (presumably the per-value weights —
    # confirm against the fastdigest API).
    return TDigest.from_values(arr[:, 0], arr[:, 1], max_centroids=MAX_CENTROIDS)
23
+
24
+
25
def combine(digests) -> TDigest:
    """Merge an iterable of TDigests into one by delegating to merge_all."""
    # Materialise the iterable first so that generators are accepted too.
    materialised = [*digests]
    return merge_all(materialised)
27
+
28
+
29
def centroids(td: TDigest) -> np.ndarray:
    """Export the centroids of `td` as a float ndarray with two columns."""
    exported = np.asarray(td.centroids, dtype=float)
    # -1 lets numpy infer the row count, so an empty digest yields shape (0, 2)
    return exported.reshape(-1, 2)
31
+
32
+
33
def count(td: TDigest) -> int:
    """Return the total mass of `td`, truncated to an int."""
    total_mass = td.mass()
    return int(total_mass)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "dbworkload"
3
- version = "0.11.0"
3
+ version = "0.11.1.dev1"
4
4
  description = "Workload framework"
5
5
  authors = ["Fabio Ghirardello"]
6
6
  license = "GPLv3+"
@@ -17,7 +17,7 @@ classifiers = [
17
17
  dbworkload = 'dbworkload.cli.main:app'
18
18
 
19
19
  [tool.poetry.dependencies]
20
- python = "^3.11"
20
+ python = ">=3.11,<4"
21
21
  pandas = "*"
22
22
  tabulate = "*"
23
23
  numpy = "*"
@@ -42,12 +42,12 @@ fastembed = {version = "^0.7.3", optional = true }
42
42
  pgvector = {version = "^0.4.1", optional = true }
43
43
  langgraph = { version = "^1.0.3", optional = true }
44
44
  openai = { version = "^2.8.0", optional = true }
45
- pytdigest = "*"
46
45
  plotext = "*"
47
46
  plotly = "*"
48
47
  jinja2 = "*"
49
48
  sqlparse = "*"
50
49
  psutil = "^7.0.0"
50
+ fastdigest = "^0.12.0"
51
51
 
52
52
  [tool.poetry.extras]
53
53
  all = ["psycopg", "psycopg-binary", "mysql-connector-python", "mariadb", "oracledb", "pyodbc", "pymongo", "cassandra-driver", "google-cloud-spanner", "pinecone", "convert"]
File without changes
File without changes