opteryx-catalog 0.4.4__py3-none-any.whl → 0.4.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. opteryx_catalog/__init__.py +1 -1
  2. opteryx_catalog/catalog/__init__.py +2 -1
  3. opteryx_catalog/catalog/compaction.py +536 -0
  4. opteryx_catalog/catalog/dataset.py +840 -520
  5. opteryx_catalog/catalog/manifest.py +475 -0
  6. opteryx_catalog/catalog/metadata.py +5 -2
  7. opteryx_catalog/catalog/metastore.py +2 -2
  8. opteryx_catalog/exceptions.py +1 -1
  9. opteryx_catalog/iops/fileio.py +13 -0
  10. opteryx_catalog/iops/gcs.py +35 -5
  11. opteryx_catalog/maki_nage/__init__.py +8 -0
  12. opteryx_catalog/maki_nage/distogram.py +558 -0
  13. opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
  14. opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
  15. opteryx_catalog/maki_nage/tests/test_count.py +19 -0
  16. opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
  17. opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
  18. opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
  19. opteryx_catalog/maki_nage/tests/test_update.py +44 -0
  20. opteryx_catalog/opteryx_catalog.py +296 -242
  21. opteryx_catalog/webhooks/__init__.py +230 -0
  22. opteryx_catalog/webhooks/events.py +177 -0
  23. {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
  24. opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
  25. {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
  26. scripts/collect_byte_counts.py +42 -0
  27. scripts/create_dataset.py +1 -1
  28. scripts/emit_full_single_file.py +81 -0
  29. scripts/inspect_manifest_dryrun.py +322 -0
  30. scripts/inspect_single_file.py +147 -0
  31. scripts/inspect_single_file_gcs.py +124 -0
  32. scripts/read_dataset.py +1 -1
  33. tests/test_collections.py +37 -0
  34. tests/test_compaction.py +233 -0
  35. tests/test_dataset_metadata.py +14 -0
  36. tests/test_describe_uncompressed.py +127 -0
  37. tests/test_refresh_manifest.py +275 -0
  38. tests/test_webhooks.py +177 -0
  39. opteryx_catalog-0.4.4.dist-info/RECORD +0 -23
  40. {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
  41. {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,89 @@
1
+ # type:ignore
2
+ # isort: skip_file
3
+ import sys
4
+ import os
5
+
6
+ sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
7
+
8
+ from opteryx.third_party.maki_nage import distogram
9
+ from pytest import approx
10
+ import random
11
+
12
+
13
def test_count_at():
    """Check count_at at 25 on a bin_count=3 histogram fed three weighted values."""
    histogram = distogram.Distogram(bin_count=3)
    print(histogram)

    # Feed the (value, count) pairs in the same order as listed.
    for value, weight in ((16, 4), (23, 3), (28, 5)):
        distogram.update(histogram, value, count=weight)
    print(histogram)

    assert distogram.count_at(histogram, 25) == approx(6.859999999)
25
+
26
+
27
def test_count_at_normal():
    """Check that count_at(h, 0) is about half the sample count for normal data.

    10000 samples are drawn with mean 0.0 and standard deviation 1.0 and fed
    to a default Distogram; the count at 0 must be within 5% of points / 2.

    The samples come from a locally seeded ``random.Random`` so the test sees
    the same data on every run and does not depend on (or disturb) the state
    of the module-level random generator.
    """
    points = 10000
    rng = random.Random(12345)
    h = distogram.Distogram()

    for _ in range(points):
        distogram.update(h, rng.normalvariate(0.0, 1.0))

    assert distogram.count_at(h, 0) == approx(points / 2, rel=0.05)
36
+
37
+
38
def test_count_at_not_enough_elements():
    """Check count_at at 2.5 after inserting only the values 1, 2 and 3."""
    sketch = distogram.Distogram()
    for value in (1, 2, 3):
        distogram.update(sketch, value)

    observed = distogram.count_at(sketch, 2.5)
    assert observed == 2
46
+
47
+
48
def test_count_at_left():
    """Check count_at at 0.77 on a bin_count=6 histogram fed eight values."""
    samples = [1, 2, 3, 4, 5, 6, 0.7, 1.1]
    query = 0.77

    hist = distogram.Distogram(bin_count=6)
    for sample in samples:
        distogram.update(hist, sample)

    # On failure the assertion message is the freshly recomputed count.
    assert distogram.count_at(hist, query) == approx(0.14), distogram.count_at(hist, query)
55
+
56
+
57
def test_count_at_right():
    """Check count_at at 6.5 on a bin_count=6 histogram fed eight values."""
    samples = [1, 2, 3, 4, 5, 6, 6.7, 6.1]

    hist = distogram.Distogram(bin_count=6)
    for sample in samples:
        distogram.update(hist, sample)

    observed = distogram.count_at(hist, 6.5)
    assert observed == approx(7.307692307692308)
64
+
65
+
66
def test_count_at_empty():
    """Check that count_at yields None for a histogram that was never updated."""
    untouched = distogram.Distogram()
    result = distogram.count_at(untouched, 6.5)
    assert result is None
70
+
71
+
72
def test_count_at_out_of_bouns():
    """Check that count_at yields None for points outside the inserted values."""
    hist = distogram.Distogram()
    for sample in (1, 2, 3, 4, 5, 6, 6.7, 6.1):
        distogram.update(hist, sample)

    # 0.2 is smaller than every inserted value.
    below = distogram.count_at(hist, 0.2)
    assert below is None

    # 10 is larger than every inserted value.
    above = distogram.count_at(hist, 10)
    assert above is None
80
+
81
+
82
if __name__ == "__main__":  # pragma: no cover
    # Allow running this module directly: call each test in turn.
    for _case in (
        test_count_at,
        test_count_at_empty,
        test_count_at_left,
        test_count_at_normal,
        test_count_at_not_enough_elements,
        test_count_at_out_of_bouns,
        test_count_at_right,
    ):
        _case()
@@ -0,0 +1,81 @@
1
+ # type:ignore
2
+ # isort: skip_file
3
+ import sys
4
+ import os
5
+
6
+ sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
7
+
8
+ from opteryx.third_party.maki_nage import distogram
9
+ from pytest import approx
10
+
11
+ import numpy as np
12
+ import random
13
+
14
+
15
def test_quantile():
    """Check the 0.5 quantile of a bin_count=3 histogram fed three weighted values."""
    hist = distogram.Distogram(bin_count=3)
    for value, weight in ((16, 4), (23, 3), (28, 5)):
        distogram.update(hist, value, count=weight)

    midpoint = distogram.quantile(hist, 0.5)
    assert midpoint == approx(23.625)
22
+
23
+
24
def test_quantile_not_enough_elemnts():
    """Check the 0.5 quantile when six values go into a bin_count=10 histogram."""
    samples = (12.3, 5.4, 8.2, 100.53, 23.5, 13.98)

    hist = distogram.Distogram(bin_count=10)
    for sample in samples:
        distogram.update(hist, sample)

    midpoint = distogram.quantile(hist, 0.5)
    assert midpoint == approx(13.14)
31
+
32
+
33
def test_quantile_on_left():
    """Compare the 0.01, 0.05 and 0.25 quantiles against numpy.quantile."""
    data = [12.3, 5.2, 5.4, 4.9, 5.5, 5.6, 8.2, 30.53, 23.5, 13.98]

    hist = distogram.Distogram(bin_count=6)
    for value in data:
        distogram.update(hist, value)

    # (quantile, relative tolerance) pairs, checked in this order.
    for q, tolerance in ((0.01, 0.01), (0.05, 0.05), (0.25, 0.05)):
        assert distogram.quantile(hist, q) == approx(np.quantile(data, q), rel=tolerance)
43
+
44
+
45
def test_quantile_on_right():
    """Compare the 0.99 and 0.85 quantiles against numpy.quantile."""
    data = [12.3, 8.2, 100.53, 23.5, 13.98, 200, 200.2, 200.8, 200.4, 200.1]

    hist = distogram.Distogram(bin_count=6)
    for value in data:
        distogram.update(hist, value)

    # Both quantiles use a 1% relative tolerance; 0.99 is checked first.
    for q in (0.99, 0.85):
        assert distogram.quantile(hist, q) == approx(np.quantile(data, q), rel=0.01)
54
+
55
+
56
def test_normal():
    """Compare the 0.5 and 0.95 quantiles of normal samples against numpy.

    10000 samples are drawn with mean 0.0 and standard deviation 1.0 and fed
    to a bin_count=64 Distogram; each quantile must be within 0.2 (absolute)
    of numpy.quantile over the same samples.

    The samples come from a locally seeded ``random.Random`` so the test sees
    the same data on every run and does not depend on (or disturb) the state
    of the module-level random generator.
    """
    rng = random.Random(12345)
    normal = [rng.normalvariate(0.0, 1.0) for _ in range(10000)]
    h = distogram.Distogram(bin_count=64)

    for sample in normal:
        distogram.update(h, sample)

    assert distogram.quantile(h, 0.5) == approx(np.quantile(normal, 0.5), abs=0.2)
    assert distogram.quantile(h, 0.95) == approx(np.quantile(normal, 0.95), abs=0.2)
66
+
67
+
68
def test_quantile_empty():
    """Check that quantile yields None for a histogram that was never updated."""
    untouched = distogram.Distogram()
    result = distogram.quantile(untouched, 0.3)
    assert result is None
72
+
73
+
74
def test_quantile_out_of_bouns():
    """Check that quantile yields None for the arguments -0.2 and 10."""
    hist = distogram.Distogram()
    for sample in (1, 2, 3, 4, 5, 6, 6.7, 6.1):
        distogram.update(hist, sample)

    negative = distogram.quantile(hist, -0.2)
    assert negative is None

    too_large = distogram.quantile(hist, 10)
    assert too_large is None
@@ -0,0 +1,25 @@
1
+ # type:ignore
2
+ # isort: skip_file
3
+ import sys
4
+ import os
5
+
6
+ sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
7
+
8
+ from opteryx.third_party.maki_nage import distogram
9
+ from pytest import approx
10
+
11
+
12
+ import numpy as np
13
+ import random
14
+
15
+
16
def test_stats():
    """Compare mean, variance and stddev of normal samples against numpy.

    10000 samples are drawn with mean 0.0 and standard deviation 1.0 and fed
    to a default Distogram; each statistic must be within 0.1 (absolute) of
    the numpy value computed over the same samples.

    The samples come from a locally seeded ``random.Random`` so the test sees
    the same data on every run and does not depend on (or disturb) the state
    of the module-level random generator.
    """
    rng = random.Random(12345)
    normal = [rng.normalvariate(0.0, 1.0) for _ in range(10000)]
    h = distogram.Distogram()

    for sample in normal:
        distogram.update(h, sample)

    assert distogram.mean(h) == approx(np.mean(normal), abs=0.1)
    assert distogram.variance(h) == approx(np.var(normal), abs=0.1)
    assert distogram.stddev(h) == approx(np.std(normal), abs=0.1)
@@ -0,0 +1,44 @@
1
+ # type:ignore
2
+ # isort: skip_file
3
+ import sys
4
+ import os
5
+
6
+ sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
7
+
8
+ from opteryx.third_party.maki_nage import distogram
9
+ import pytest
10
+ from pytest import approx
11
+
12
+
13
def test_update():
    """Walk a bin_count=3 histogram through inserts, repeats and a fourth value.

    Each step updates the histogram with one value and then compares
    ``bins`` against the expected list of (value, count) tuples.
    """
    hist = distogram.Distogram(bin_count=3)

    # (value to insert, expected bins afterwards). The first three steps add
    # new values; the last three repeat values that are already present.
    steps = (
        (23, [(23, 1)]),
        (28, [(23, 1), (28, 1)]),
        (16, [(16, 1), (23, 1), (28, 1)]),
        (23, [(16, 1), (23, 2), (28, 1)]),
        (28, [(16, 1), (23, 2), (28, 2)]),
        (16, [(16, 2), (23, 2), (28, 2)]),
    )
    for value, expected_bins in steps:
        distogram.update(hist, value)
        assert hist.bins == expected_bins

    # A fourth distinct value: the first two bins are expected to be unchanged
    # and the third to hold the combined position and count.
    hist = distogram.update(hist, 26)
    assert hist.bins[0] == (16, 2)
    assert hist.bins[1] == (23, 2)
    merged_value = hist.bins[2][0]
    assert merged_value == approx(27.33333)
    merged_count = hist.bins[2][1]
    assert merged_count == 3
38
+
39
+
40
def test_update_with_invalid_count():
    """Check that update raises ValueError when called with count=0."""
    hist = distogram.Distogram(bin_count=3)

    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        distogram.update(hist, 23, count=0)