opteryx-catalog 0.4.11__py3-none-any.whl → 0.4.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. opteryx_catalog/catalog/compaction.py +15 -8
  2. opteryx_catalog/catalog/dataset.py +449 -111
  3. opteryx_catalog/catalog/manifest.py +390 -330
  4. opteryx_catalog/catalog/metadata.py +3 -0
  5. opteryx_catalog/iops/fileio.py +13 -0
  6. opteryx_catalog/maki_nage/__init__.py +8 -0
  7. opteryx_catalog/maki_nage/distogram.py +558 -0
  8. opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
  9. opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
  10. opteryx_catalog/maki_nage/tests/test_count.py +19 -0
  11. opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
  12. opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
  13. opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
  14. opteryx_catalog/maki_nage/tests/test_update.py +44 -0
  15. opteryx_catalog/opteryx_catalog.py +82 -54
  16. opteryx_catalog/webhooks/__init__.py +230 -0
  17. opteryx_catalog/webhooks/events.py +177 -0
  18. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
  19. opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
  20. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
  21. scripts/collect_byte_counts.py +42 -0
  22. scripts/emit_full_single_file.py +81 -0
  23. scripts/inspect_manifest_dryrun.py +322 -0
  24. scripts/inspect_single_file.py +147 -0
  25. scripts/inspect_single_file_gcs.py +124 -0
  26. tests/test_collections.py +37 -0
  27. tests/test_describe_uncompressed.py +127 -0
  28. tests/test_refresh_manifest.py +275 -0
  29. tests/test_webhooks.py +177 -0
  30. opteryx_catalog-0.4.11.dist-info/RECORD +0 -25
  31. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
  32. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
@@ -68,6 +68,9 @@ class DatasetMetadata:
68
68
  # Each schema dict may also include `timestamp-ms` and `author`.
69
69
  schemas: List[dict] = field(default_factory=list)
70
70
  current_schema_id: Optional[str] = None
71
+ # Annotations: list of annotation objects attached to this dataset
72
+ # Each annotation is a dict with keys like 'key' and 'value'.
73
+ annotations: List[dict] = field(default_factory=list)
71
74
 
72
75
  def current_snapshot(self) -> Optional[Snapshot]:
73
76
  if self.current_snapshot_id is None:
@@ -123,3 +123,16 @@ class GcsFileIO(FileIO):
123
123
  return True
124
124
  except Exception:
125
125
  return False
126
+
127
+
128
# Centralized Parquet write options used across the codebase when writing
# parquet files. Exported here so all writers share the same configuration.
# NOTE(review): the key names look like keyword arguments for
# `pyarrow.parquet.write_table` - confirm against the writers that consume this.
WRITE_PARQUET_OPTIONS = {
    # Zstandard compression at level 3.
    "compression": "ZSTD",
    "compression_level": 3,
    # Dictionary encoding enabled, with the dictionary page capped at 1 MiB.
    "use_dictionary": True,
    "dictionary_pagesize_limit": 1024 * 1024,
    # Target data page size of 1 MiB.
    "data_page_size": 1024 * 1024,
    # Parquet format version written to the file.
    "version": "2.6",
    # Request column statistics in the written file.
    "write_statistics": True,
}
@@ -0,0 +1,8 @@
1
+ # Lightweight package shim so `opteryx.third_party.maki_nage` is importable
2
+ from .distogram import Distogram
3
+ from .distogram import histogram
4
+ from .distogram import load
5
+ from .distogram import merge
6
+ from .distogram import quantile
7
+
8
+ __all__ = ["Distogram", "load", "merge", "histogram", "quantile"]
@@ -0,0 +1,558 @@
1
+ # type:ignore
2
+ import math
3
+ from bisect import bisect_left
4
+ from itertools import accumulate
5
+ from operator import itemgetter
6
+ from typing import List
7
+ from typing import Optional
8
+ from typing import Tuple
9
+
10
+ import numpy
11
+
12
+ __author__ = """Romain Picard"""
13
+ __email__ = "romain.picard@oakbits.com"
14
+ __version__ = "3.0.0"
15
+
16
+ """
17
+ The following changes have been made for Opteryx:
18
+ - The ability to weight the differences has been removed
19
+ - Dump and Load functionality
20
+ - Bulk load functionality added
21
+ """
22
+
23
+
24
EPSILON = 1e-5
# Default maximum number of bins kept by a Distogram.
BIN_COUNT: int = 50
# A bin is a (value, count) pair.
Bin = Tuple[float, int]

# Numeric type used to normalise bin values.
_caster = numpy.float64


# bins is a list of (cut point, count) tuples
class Distogram:  # pragma: no cover
    """Compressed representation of a distribution.

    Attributes:
        bins: list of ``(value, count)`` tuples.
        min: smallest value seen, or None when empty.
        max: largest value seen, or None when empty.
        diffs: cached gaps between consecutive bin values, or None when the
            cache has not been built yet.
        min_diff: smallest entry of ``diffs`` (None until computed).
    """

    __slots__ = "bins", "min", "max", "diffs", "min_diff", "_bin_count"

    def __init__(self, bin_count: int = BIN_COUNT):
        """Creates a new Distogram object

        Args:
            bin_count: [Optional] the maximum number of bins to keep.

        Returns:
            A Distogram object.
        """
        self.bins: List[Bin] = list()
        self.min: Optional[float] = None
        self.max: Optional[float] = None
        self.diffs: Optional[List[float]] = None
        self.min_diff: Optional[float] = None

        self._bin_count = bin_count

    ## all class methods below here have been added for Opteryx
    def dumps(self):  # pragma: no cover
        """Serialise :meth:`dump` to JSON bytes using ``orjson``.

        Raises:
            TypeError: (from the fallback handler) for values that are neither
                natively serialisable nor numpy integer/floating scalars.
        """
        import orjson

        def handler(obj):
            # Concrete numpy scalars (int64, float32, ...) are *subclasses* of
            # the abstract numpy.integer / numpy.floating types, so an
            # identity check on type(obj) can never match; use isinstance.
            if isinstance(obj, numpy.integer):
                return int(obj)
            if isinstance(obj, numpy.floating):
                return float(obj)
            raise TypeError

        return orjson.dumps(self.dump(), default=handler)

    def dump(self):
        """Return a plain ``dict`` snapshot with keys ``bins``, ``min``, ``max``.

        Bin values are converted to built-in ``float`` and numpy integer
        counts to built-in ``int`` so the result is serialisable. The
        distogram itself is left untouched, and an empty distogram yields an
        empty ``bins`` list rather than raising.
        """
        bins = [
            (float(value), int(count) if isinstance(count, numpy.integer) else count)
            for value, count in self.bins
        ]
        return {
            "bins": bins,
            "min": self.min,
            "max": self.max,
        }

    def __add__(self, operand):  # pragma: no cover
        """Merge ``operand`` into this distogram and return the result.

        Note: ``merge`` folds ``operand`` into ``self`` in place, so the
        returned object is ``self``.
        """
        dgram = merge(self, operand)
        # merge estimates min and max, so set them manually
        dgram.min = min(self.min, operand.min)
        dgram.max = max(self.max, operand.max)
        return dgram

    @staticmethod
    def _bucket_midpoints(edges):
        """Return the midpoint of each pair of consecutive histogram edges."""
        edges = numpy.asarray(edges, dtype=_caster)
        return (edges[:-1] + edges[1:]) / 2

    def bulkload(self, values):
        """Add many values at once.

        Args:
            values: array of values; must support ``len()``, ``.min()`` and
                ``.max()`` (an empty input is a no-op).
        """
        # To speed up bulk loads we use numpy to get a histogram at a higher resolution
        # and add this to the distogram.
        # Histogram gives us n+1 values, so we average consecutive values.
        # This ends up being an approximation of an approximation but 1000x faster.
        # The accuracy of this approach is poor on datasets with very low record counts,
        # but even if a bad decision is made on a table with 500 rows, the consequence
        # is minimal, if a bad decision is made on a table with 5m rows, it starts to
        # matter.
        if len(values) == 0:
            return
        bin_values, counts = numpy.unique(values, return_counts=True)
        if len(bin_values) > (self._bin_count * 5):
            counts, edges = numpy.histogram(values, self._bin_count * 5, density=False)
            # Each bucket is represented by the average of its two edges:
            # (left + right) / 2, not left + right / 2.
            bin_values = self._bucket_midpoints(edges)
        for bin_value, count in zip(bin_values, counts):
            if count > 0:
                update(
                    self,
                    value=bin_value,
                    count=count,
                )

        # we need to overwrite any range values as we've approximated the dataset
        if self.min is None:
            self.min = values.min()
            self.max = values.max()
        else:
            self.min = min(self.min, values.min())
            self.max = max(self.max, values.max())

    def count(self):
        """Return the total number of elements (sum of all bin counts)."""
        return sum(f for _, f in self.bins)

    @property
    def max_bin_count(self):
        """Maximum number of bins this distogram keeps."""
        return self._bin_count

    @property
    def bin_count(self):
        """Number of bins currently in use."""
        return len(self.bins)
127
+
128
+
129
+ # added for opteryx
130
def load(bins: list, minimum, maximum):  # pragma: no cover
    """Rebuild a Distogram from previously dumped state.

    Args:
        bins: sequence of ``(value, count)`` pairs, ordered by value.
        minimum: the minimum value of the distribution.
        maximum: the maximum value of the distribution.

    Returns:
        A Distogram (with the default bin limit) holding the given bins and
        bounds, with its gap cache (``diffs`` / ``min_diff``) populated.
    """
    dgram = Distogram()
    # Normalise each bin to a tuple: `update` bisects the bin list with a
    # tuple probe, which cannot be compared against list-shaped bins (for
    # example bins that came back from JSON).
    dgram.bins = [tuple(b) for b in bins]
    dgram.min = minimum
    dgram.max = maximum

    # diffs[i] is the gap between bin i and bin i + 1, so there is one entry
    # fewer than there are bins.
    dgram.diffs = [
        upper[0] - lower[0] for lower, upper in zip(dgram.bins[:-1], dgram.bins[1:])
    ]
    dgram.min_diff = min(dgram.diffs) if dgram.diffs else float("inf")

    return dgram
146
+
147
+
148
+ def _linspace(start: float, stop: float, num: int) -> List[float]: # pragma: no cover
149
+ if num == 1:
150
+ return [start, stop]
151
+ step = (stop - start) / float(num)
152
+ values = [start + step * i for i in range(num)]
153
+ values.append(stop)
154
+ return values
155
+
156
+
157
+ def _moment(x: List[float], counts: List[float], c: float, n: int) -> float: # pragma: no cover
158
+ """
159
+ Calculates the k-th moment of the distribution using the formula:
160
+
161
+ moment_k = sum((v - mean)**k * f) / sum(f)
162
+
163
+ where v is the value of a bin, f is its frequency, and mean is the mean of
164
+ the distribution.
165
+
166
+ Args:
167
+ h (Distogram): The input distribution.
168
+ k (int): The order of the moment to calculate.
169
+
170
+ Returns:
171
+ float: The k-th moment of the distribution.
172
+
173
+ Raises:
174
+ ValueError: If the distribution has no bins.
175
+
176
+ """
177
+ m = sum(ci * (v - c) ** n for ci, v in zip(counts, x))
178
+ return m / sum(counts)
179
+
180
+
181
def _update_diffs(h: Distogram, i: int) -> None:  # pragma: no cover
    """Refresh the cached gaps on either side of bin ``i``.

    Recomputes ``h.diffs[i - 1]`` (gap to the previous bin) and ``h.diffs[i]``
    (gap to the next bin) where those neighbours exist, and keeps
    ``h.min_diff`` in step. Does nothing when the gap cache has not been
    built (``h.diffs is None``).

    Args:
        h: The distogram whose cache is updated in place.
        i: Index of the bin whose value changed or was just inserted.
    """
    if h.diffs is not None:
        # Set when a slot that held the current minimum is overwritten; the
        # minimum may then have grown, so it must be recomputed from scratch.
        update_min = False

        if i > 0:
            if h.diffs[i - 1] == h.min_diff:
                update_min = True

            h.diffs[i - 1] = h.bins[i][0] - h.bins[i - 1][0]
            if h.diffs[i - 1] < h.min_diff:
                h.min_diff = h.diffs[i - 1]

        if i < len(h.bins) - 1:
            if h.diffs[i] == h.min_diff:
                update_min = True

            h.diffs[i] = h.bins[i + 1][0] - h.bins[i][0]
            if h.diffs[i] < h.min_diff:
                h.min_diff = h.diffs[i]

        if update_min is True:
            h.min_diff = min(h.diffs)

    return
205
+
206
+
207
def _trim(h: Distogram) -> Distogram:  # pragma: no cover
    """Merge the closest neighbouring bins until ``h`` fits its bin limit.

    While there are more bins than ``h._bin_count``, the pair of adjacent
    bins with the smallest gap is replaced by one bin whose value is the
    count-weighted average of the two and whose count is their sum.

    Args:
        h: The distogram to shrink in place.

    Returns:
        The same distogram object.
    """
    while len(h.bins) > h._bin_count:
        if h.diffs is not None:
            # Cached gaps available: the left bin of the closest pair sits at
            # the index of the minimum gap.
            i = h.diffs.index(h.min_diff)
        else:
            # No cache: compute (left index, gap) for every adjacent pair.
            diffs = [(i - 1, b[0] - h.bins[i - 1][0]) for i, b in enumerate(h.bins[1:], start=1)]
            i, _ = min(diffs, key=itemgetter(1))

        v1, f1 = h.bins[i]
        v2, f2 = h.bins.pop(i + 1)
        h.bins[i] = (v1 * f1 + v2 * f2) / (f1 + f2), f1 + f2

        if h.diffs is not None:
            # Drop the gap that was just collapsed, then refresh the gaps
            # around the merged bin.
            h.diffs.pop(i)
            _update_diffs(h, i)
            # NOTE(review): min() raises ValueError if this leaves h.diffs
            # empty (a single remaining bin) - confirm that cannot happen.
            h.min_diff = min(h.diffs)

    return h
225
+
226
+
227
def _trim_in_place(
    distogram: Distogram, new_value: float, new_count: int, bin_index: int
) -> Distogram:
    """Fold a new observation directly into an existing bin.

    The bin at ``bin_index`` becomes the count-weighted average of its
    current value and ``new_value``, with the counts added together; the
    cached gaps around that bin are then refreshed.

    Returns:
        The same distogram object, modified in place.
    """
    bin_value, bin_frequency = distogram.bins[bin_index]
    bin_value = _caster(bin_value)

    merged_frequency = bin_frequency + new_count
    merged_value = (bin_value * bin_frequency + new_value * new_count) / merged_frequency
    distogram.bins[bin_index] = (merged_value, merged_frequency)

    _update_diffs(distogram, bin_index)
    return distogram
239
+
240
+
241
def _compute_diffs(h: Distogram) -> List[float]:  # pragma: no cover
    """Return the gaps between consecutive bin values and record the smallest.

    Side effect: stores the smallest gap in ``h.min_diff``. Raises
    ``ValueError`` (from ``min``) when there are fewer than two bins.
    """
    gaps = []
    for (lower, _), (upper, _) in zip(h.bins[:-1], h.bins[1:]):
        gaps.append(upper - lower)

    h.min_diff = min(gaps)
    return gaps
246
+
247
+
248
def _search_in_place_index(h: Distogram, new_value: float, index: int) -> int:  # pragma: no cover
    """Find an existing bin that can absorb ``new_value`` without a new bin.

    Builds the gap cache on first use. For ``index > 0`` the nearer of the
    two bins surrounding the insertion point is chosen; it qualifies only
    when its distance to ``new_value`` is below the current smallest gap.

    Returns:
        The index of the qualifying bin, or -1 when there is none.
    """
    if h.diffs is None:
        h.diffs = _compute_diffs(h)

    if index <= 0:
        return -1

    gap_left = new_value - h.bins[index - 1][0]
    gap_right = h.bins[index][0] - new_value

    if gap_left < gap_right:
        candidate, gap = index - 1, gap_left
    else:
        candidate, gap = index, gap_right

    if gap < h.min_diff:
        return candidate
    return -1
261
+
262
+
263
def update(h: Distogram, value: float, count: int = 1) -> Distogram:  # pragma: no cover
    """Adds a new element to the distribution.

    Args:
        h: A Distogram object.
        value: The value to add on the histogram.
        count: [Optional] The number of times that value must be added.

    Returns:
        A Distogram object where value has been processed. This is the same
        object as ``h``, which is modified in place.

    Raises:
        ValueError if count is not strictly positive.
    """
    if count <= 0:
        raise ValueError("count must be strictly positive")

    # Locate the insertion point: 0 = before the first bin, -1 = after the
    # last bin, otherwise the bisect position among the existing bins.
    index = 0
    if len(h.bins) > 0:
        if value <= h.bins[0][0]:
            index = 0
        elif value >= h.bins[-1][0]:
            index = -1
        else:
            index = bisect_left(h.bins, (value, 1))

        # Exact match on an existing bin: just bump its count.
        vi, fi = h.bins[index]
        if vi == value:
            h.bins[index] = (_caster(vi), fi + count)
            return h

    # At capacity and strictly inside the range: try to absorb the value
    # into a neighbouring bin instead of inserting and then trimming.
    if index > 0 and len(h.bins) >= h._bin_count:
        in_place_index = _search_in_place_index(h, value, index)
        # NOTE(review): `> 0` also rejects a valid candidate at bin 0 (the
        # "none" sentinel is -1) - confirm whether that is intended.
        if in_place_index > 0:
            h = _trim_in_place(h, value, count, in_place_index)
            return h

    if index == -1:
        h.bins.append((_caster(value), count))
        if h.diffs is not None:
            diff = h.bins[-1][0] - h.bins[-2][0]
            h.diffs.append(diff)
            h.min_diff = min(h.min_diff, diff)
    else:
        h.bins.insert(index, (_caster(value), count))
        if h.diffs is not None:
            # Placeholder gap; _update_diffs fills in the real values.
            h.diffs.insert(index, 0)
            _update_diffs(h, index)

    if (h.min is None) or (h.min > value):
        h.min = value
    if (h.max is None) or (h.max < value):
        h.max = value

    # Inserting may have exceeded the bin limit; merge closest bins if so.
    h = _trim(h)
    return h
319
+
320
+
321
def merge(h1: Distogram, h2: Distogram) -> Distogram:  # pragma: no cover
    """Merges two Distogram objects

    Every bin of h2 is replayed into h1 through ``update``, so h1 is
    modified in place and is the object returned; h2 is only read.

    Args:
        h1: First Distogram (receives the merged content), or None.
        h2: Second Distogram, or None.

    Returns:
        The composition of h1 and h2, held in h1 and therefore limited to
        h1's maximum number of bins. If either argument is None the other
        one is returned unchanged.
    """
    if h1 is None:
        return h2
    if h2 is None:
        return h1

    merged = h1
    for bin_value, bin_frequency in h2.bins:
        merged = update(merged, bin_value, bin_frequency)
    return merged
343
+
344
+
345
def count_up_to(h: Distogram, value: float):  # pragma: no cover
    """Counts the number of elements present in the distribution up to value.

    Args:
        h: A Distogram object.
        value: The value up to what elements must be counted.

    Returns:
        An estimation of the real count, computed from the compressed
        representation of the distribution. Returns None if the Distogram
        object contains no element or value is outside of the distribution
        bounds.
    """
    if len(h.bins) == 0:
        return None

    if value < h.min or value > h.max:
        return None

    if value == h.min:
        return 0

    if value == h.max:
        return count(h)

    v0, f0 = h.bins[0]
    vl, fl = h.bins[-1]
    if value <= v0:  # left
        # Half of the first bin's *count* is assumed to lie between the
        # minimum and the bin value, so scale f0 / 2 by the position of
        # `value` in that span. (Scaling the bin value v0 instead would mix
        # units, go negative for negative v0 and be discontinuous with the
        # interior branch, which yields f0 / 2 at value == v0.)
        ratio = (value - h.min) / (v0 - h.min)
        result = ratio * f0 / 2
    elif value >= vl:  # right
        # Mirror image: the other half of the last bin lies between the bin
        # value and the maximum; every earlier bin is counted in full.
        ratio = (value - vl) / (h.max - vl)
        result = (1 + ratio) * fl / 2
        result += sum((f for _, f in h.bins[:-1]))
    else:
        # Index of the last bin whose value is strictly below `value`.
        i = sum(((value > v) for v, _ in h.bins)) - 1
        vi, fi = h.bins[i]
        vj, fj = h.bins[i + 1]

        # Trapezoid between bin i and `value`: mb is the frequency linearly
        # interpolated at `value`.
        mb = fi + (fj - fi) / (vj - vi) * (value - vi)
        result = (fi + mb) / 2 * (value - vi) / (vj - vi)
        result += sum((f for _, f in h.bins[:i]))

        # Plus the half of bin i that lies to the left of its value.
        result = result + fi / 2

    return result
391
+
392
+
393
def count(h: Distogram) -> float:  # pragma: no cover
    """Counts the number of elements in the distribution.

    Args:
        h: A Distogram object.

    Returns:
        The total of all bin frequencies (0 when there are no bins).
    """
    total = 0
    for _, frequency in h.bins:
        total += frequency
    return total
403
+
404
+
405
def bin_size(h: Distogram, value) -> int:  # pragma: no cover
    """Return the count of the first bin whose value is greater than ``value``.

    Returns None when no bin value exceeds ``value``.
    """
    return next((frequency for bin_value, frequency in h.bins if value < bin_value), None)
410
+
411
+
412
def bounds(h: Distogram) -> Tuple[float, float]:  # pragma: no cover
    """Returns the min and max values of the distribution.

    Args:
        h: A Distogram object.

    Returns:
        A ``(minimum, maximum)`` tuple, read from ``h.min`` and ``h.max``.
    """
    lowest = h.min
    highest = h.max
    return lowest, highest
422
+
423
+
424
def mean(h: Distogram) -> float:  # pragma: no cover
    """Returns the mean of the distribution.

    Args:
        h: A Distogram object.

    Returns:
        An estimation of the mean: the first moment of the bins about zero.
    """
    centroids, frequencies = zip(*h.bins)
    first_moment = _moment(centroids, frequencies, 0, 1)
    return first_moment
435
+
436
+
437
def variance(h: Distogram) -> float:  # pragma: no cover
    """Returns the variance of the distribution.

    Args:
        h: A Distogram object.

    Returns:
        An estimation of the variance: the second moment of the bins about
        their estimated mean.
    """
    centroids, frequencies = zip(*h.bins)
    centre = mean(h)
    return _moment(centroids, frequencies, centre, 2)
448
+
449
+
450
def stddev(h: Distogram) -> float:  # pragma: no cover
    """Returns the standard deviation of the distribution.

    Args:
        h: A Distogram object.

    Returns:
        An estimation of the standard deviation: the square root of the
        estimated variance.
    """
    spread = variance(h)
    return math.sqrt(spread)
461
+
462
+
463
def histogram(
    h: Distogram, bin_count: Optional[int] = None
) -> Optional[dict]:  # pragma: no cover
    """Returns a histogram of the distribution as a labelled dict.

    Args:
        h: A Distogram object.
        bin_count: [Optional] The number of bins in the histogram. Defaults
            to 20 and is capped at the number of bins held by ``h``.

    Returns:
        A dict mapping a ``"<lower> - <upper>"`` label for each equal-width
        interval between ``h.min`` and ``h.max`` to the estimated number of
        elements in that interval, or None if the (capped) bin count is
        less than 2.
    """

    if bin_count is None:
        bin_count = 20
    bin_count = min(bin_count, len(h.bins))
    if bin_count < 2:
        return None

    bin_bounds = _linspace(h.min, h.max, num=bin_count)
    # Cumulative estimates at each boundary, then differenced into per-bin counts.
    counts = [count_up_to(h, e) for e in bin_bounds]
    counts = [new - last for new, last in zip(counts[1:], counts[:-1])]

    result = {f"{bin_bounds[i]} - {bin_bounds[i + 1]}": c for i, c in enumerate(counts)}

    return result
490
+
491
+
492
def frequency_density_distribution(
    h: Distogram,
) -> Tuple[List[float], List[float]]:  # pragma: no cover
    """Returns the frequency density between consecutive bin values.

    Args:
        h: A Distogram object.

    Returns:
        A ``(densities, bin_bounds)`` tuple, where ``bin_bounds`` are the bin
        values as floats and ``densities[k]`` is the estimated number of
        elements between ``bin_bounds[k]`` and ``bin_bounds[k + 1]`` divided
        by the width of that interval (the count at the first bound is
        taken as 0). Returns None if the distribution holds fewer than two
        elements.
    """

    if count(h) < 2:
        return None

    bin_bounds = [float(entry[0]) for entry in h.bins]
    widths = [bin_bounds[k] - bin_bounds[k - 1] for k in range(1, len(bin_bounds))]

    cumulative = [0]
    for bound in bin_bounds[1:]:
        cumulative.append(count_up_to(h, bound))

    densities = []
    for upper, lower, width in zip(cumulative[1:], cumulative[:-1], widths):
        densities.append((upper - lower) / width)
    return (densities, bin_bounds)
516
+
517
+
518
def quantile(h: Distogram, value: float) -> Optional[float]:  # pragma: no cover
    """Returns a quantile of the distribution

    Args:
        h: A Distogram object.
        value: The quantile to compute. Must be between 0 and 1

    Returns:
        An estimation of the quantile. Returns None if the Distogram
        object contains no element or value is outside of [0, 1].
    """
    if len(h.bins) == 0:
        return None
    if not (0 <= value <= 1):
        return None

    total = count(h)
    # Rank of the requested quantile, truncated to a whole element.
    target = int(total * value)
    first_value, first_frequency = h.bins[0]
    last_value, last_frequency = h.bins[-1]

    # Left tail: the rank falls within the first half of the first bin, so
    # interpolate between the minimum and the first bin value.
    if target <= (first_frequency / 2):
        fraction = target / (first_frequency / 2)
        return h.min + (fraction * (first_value - h.min))

    # Right tail: the rank falls within the last half of the last bin, so
    # interpolate between the last bin value and the maximum.
    if target >= (total - (last_frequency / 2)):
        base = target - (total - (last_frequency / 2))
        fraction = base / (last_frequency / 2)
        return last_value + (fraction * (h.max - last_value))

    # Interior: each gap between neighbouring bins holds half of each
    # neighbour's count; find the gap containing the rank and interpolate.
    offset = target - first_frequency / 2
    mids = [(fi + fj) / 2 for (_, fi), (_, fj) in zip(h.bins[:-1], h.bins[1:])]
    i = next(k for k, running in enumerate(accumulate(mids)) if offset < running)

    lower_value = h.bins[i][0]
    upper_value = h.bins[i + 1][0]
    fraction = (offset - sum(mids[:i])) / mids[i]
    return lower_value + (fraction * (upper_value - lower_value))
@@ -0,0 +1,52 @@
1
+ # type:ignore
2
+ # isort: skip_file
3
+ import sys
4
+ import os
5
+
6
+ sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
7
+
8
+ from opteryx.third_party.maki_nage import distogram
9
+ import random
10
+ import numpy as np
11
+ from pytest import approx
12
+
13
+
14
def test_histogram():
    """Compare distogram.histogram with numpy on normal data, then pin a tiny case.

    NOTE(review): this test unpacks the result of ``distogram.histogram`` as
    a ``(values, edges)`` tuple. A histogram implementation that returns a
    dict of labelled counts would not satisfy these assertions - confirm
    which implementation this (underscore-prefixed) test file targets.
    """
    normal = [random.normalvariate(0.0, 1.0) for _ in range(10000)]
    h = distogram.Distogram(bin_count=64)

    for i in normal:
        distogram.update(h, i)

    # Only checks that both calls succeed and unpack; the values are not compared.
    np_values, np_edges = np.histogram(normal, 10)
    d_values, d_edges = distogram.histogram(h, 10)

    h = distogram.Distogram(bin_count=3)
    distogram.update(h, 23)
    distogram.update(h, 28)
    distogram.update(h, 16)
    assert distogram.histogram(h, bin_count=3) == (
        approx([1.0714285714285714, 0.6285714285714286, 1.3]),
        [16.0, 20.0, 24.0, 28],
    )
    # The per-bin estimates must add up to the three inserted elements.
    assert sum(distogram.histogram(h, bin_count=3)[0]) == approx(3.0)
33
+
34
+
35
def test_histogram_on_too_small_distribution():
    """Expect no histogram when fewer bins exist than were requested.

    NOTE(review): this relies on ``histogram`` returning None when the
    distogram holds fewer bins (5) than requested (10). An implementation
    that instead caps the requested count at the available bins would return
    a result here - confirm against the implementation under test.
    """
    h = distogram.Distogram(bin_count=64)

    for i in range(5):
        distogram.update(h, i)

    assert distogram.histogram(h, 10) is None
42
+
43
+
44
def test_format_histogram():
    """Check the histogram shape: one more edge than there are counts.

    NOTE(review): indexes the result as ``(counts, edges)``; this assumes a
    tuple return value - confirm against the implementation under test.
    """
    bin_count = 4
    h = distogram.Distogram(bin_count=bin_count)

    for i in range(4):
        distogram.update(h, i)

    hist = distogram.histogram(h, bin_count=bin_count)
    assert len(hist[1]) == len(hist[0]) + 1
@@ -0,0 +1,24 @@
1
+ # isort: skip_file
2
+ import sys
3
+ import os
4
+
5
+ sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
6
+
7
+ from opteryx.third_party.maki_nage import distogram
8
+ import random
9
+
10
+
11
def test_bounds():
    """The reported bounds must be the exact min and max of the inserted data."""
    samples = [random.normalvariate(0.0, 1.0) for _ in range(10000)]
    sketch = distogram.Distogram()

    for sample in samples:
        distogram.update(sketch, sample)

    lowest, highest = distogram.bounds(sketch)
    assert lowest == min(samples)
    assert highest == max(samples)
21
+
22
+
23
+ if __name__ == "__main__": # pragma: no cover
24
+ test_bounds()
@@ -0,0 +1,19 @@
1
+ # isort: skip_file
2
+ import sys
3
+ import os
4
+
5
+ sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
6
+
7
+ from opteryx.third_party.maki_nage import distogram
8
+
9
+
10
def test_count():
    """The total count must track the weights passed to update."""
    h = distogram.Distogram(bin_count=3)
    assert distogram.count(h) == 0

    # (value, weight added, expected running total)
    steps = ((16, 4, 4), (23, 3, 7), (28, 5, 12))
    for value, weight, expected_total in steps:
        distogram.update(h, value, count=weight)
        assert distogram.count(h) == expected_total