opteryx-catalog 0.4.4__py3-none-any.whl → 0.4.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opteryx_catalog/__init__.py +1 -1
- opteryx_catalog/catalog/__init__.py +2 -1
- opteryx_catalog/catalog/compaction.py +536 -0
- opteryx_catalog/catalog/dataset.py +840 -520
- opteryx_catalog/catalog/manifest.py +475 -0
- opteryx_catalog/catalog/metadata.py +5 -2
- opteryx_catalog/catalog/metastore.py +2 -2
- opteryx_catalog/exceptions.py +1 -1
- opteryx_catalog/iops/fileio.py +13 -0
- opteryx_catalog/iops/gcs.py +35 -5
- opteryx_catalog/maki_nage/__init__.py +8 -0
- opteryx_catalog/maki_nage/distogram.py +558 -0
- opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
- opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
- opteryx_catalog/maki_nage/tests/test_count.py +19 -0
- opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
- opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
- opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
- opteryx_catalog/maki_nage/tests/test_update.py +44 -0
- opteryx_catalog/opteryx_catalog.py +296 -242
- opteryx_catalog/webhooks/__init__.py +230 -0
- opteryx_catalog/webhooks/events.py +177 -0
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
- opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
- scripts/collect_byte_counts.py +42 -0
- scripts/create_dataset.py +1 -1
- scripts/emit_full_single_file.py +81 -0
- scripts/inspect_manifest_dryrun.py +322 -0
- scripts/inspect_single_file.py +147 -0
- scripts/inspect_single_file_gcs.py +124 -0
- scripts/read_dataset.py +1 -1
- tests/test_collections.py +37 -0
- tests/test_compaction.py +233 -0
- tests/test_dataset_metadata.py +14 -0
- tests/test_describe_uncompressed.py +127 -0
- tests/test_refresh_manifest.py +275 -0
- tests/test_webhooks.py +177 -0
- opteryx_catalog-0.4.4.dist-info/RECORD +0 -23
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# type:ignore
|
|
2
|
+
# isort: skip_file
|
|
3
|
+
import sys
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
|
|
7
|
+
|
|
8
|
+
from opteryx.third_party.maki_nage import distogram
|
|
9
|
+
from pytest import approx
|
|
10
|
+
import random
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def test_count_at():
    """Check ``count_at`` on a three-bin histogram filled with weighted updates."""
    histogram = distogram.Distogram(bin_count=3)
    print(histogram)

    # fill histogram with (value, weight) pairs
    for value, weight in ((16, 4), (23, 3), (28, 5)):
        distogram.update(histogram, value, count=weight)
    print(histogram)

    observed = distogram.count_at(histogram, 25)
    assert observed == approx(6.859999999)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def test_count_at_normal():
    """Check ``count_at(h, 0)`` on standard-normal samples.

    The result is expected to be within 5% (relative) of half the number of
    inserted points.
    """
    points = 10000
    # A private, seeded generator yields the same samples on every run, so a
    # failure is reproducible, and it leaves the global ``random`` state alone.
    rng = random.Random(42)
    normal = [rng.normalvariate(0.0, 1.0) for _ in range(points)]
    h = distogram.Distogram()

    for sample in normal:
        distogram.update(h, sample)

    assert distogram.count_at(h, 0) == approx(points / 2, rel=0.05)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_count_at_not_enough_elements():
    """Check ``count_at`` after only three values went into a default histogram."""
    histogram = distogram.Distogram()

    for value in (1, 2, 3):
        distogram.update(histogram, value)

    observed = distogram.count_at(histogram, 2.5)
    assert observed == 2
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_count_at_left():
    """Check ``count_at`` for a query just above the smallest inserted value."""
    samples = (1, 2, 3, 4, 5, 6, 0.7, 1.1)
    histogram = distogram.Distogram(bin_count=6)

    for sample in samples:
        distogram.update(histogram, sample)

    # The observed value doubles as the assertion message to ease debugging.
    observed = distogram.count_at(histogram, 0.77)
    assert observed == approx(0.14), observed
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def test_count_at_right():
    """Check ``count_at`` for a query just below the largest inserted value."""
    samples = (1, 2, 3, 4, 5, 6, 6.7, 6.1)
    histogram = distogram.Distogram(bin_count=6)

    for sample in samples:
        distogram.update(histogram, sample)

    observed = distogram.count_at(histogram, 6.5)
    assert observed == approx(7.307692307692308)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_count_at_empty():
    """``count_at`` on a histogram that received no updates is expected to be None."""
    empty_histogram = distogram.Distogram()

    observed = distogram.count_at(empty_histogram, 6.5)
    assert observed is None
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def test_count_at_out_of_bouns():
    """``count_at`` is expected to be None for queries outside the inserted range.

    The inserted values span 1 to 6.7; the queries 0.2 and 10 fall below and
    above that span respectively.
    """
    histogram = distogram.Distogram()

    for sample in (1, 2, 3, 4, 5, 6, 6.7, 6.1):
        distogram.update(histogram, sample)

    below = distogram.count_at(histogram, 0.2)
    assert below is None
    above = distogram.count_at(histogram, 10)
    assert above is None
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
if __name__ == "__main__":  # pragma: no cover
    # Allow running this module as a script: call each test in turn.
    for _run in (
        test_count_at,
        test_count_at_empty,
        test_count_at_left,
        test_count_at_normal,
        test_count_at_not_enough_elements,
        test_count_at_out_of_bouns,
        test_count_at_right,
    ):
        _run()
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# type:ignore
|
|
2
|
+
# isort: skip_file
|
|
3
|
+
import sys
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
|
|
7
|
+
|
|
8
|
+
from opteryx.third_party.maki_nage import distogram
|
|
9
|
+
from pytest import approx
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
import random
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def test_quantile():
    """Check the 0.5 quantile of a three-bin histogram built from weighted updates."""
    histogram = distogram.Distogram(bin_count=3)
    for value, weight in ((16, 4), (23, 3), (28, 5)):
        distogram.update(histogram, value, count=weight)

    median = distogram.quantile(histogram, 0.5)
    assert median == approx(23.625)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def test_quantile_not_enough_elemnts():
    """Check the 0.5 quantile when six values go into a ten-bin histogram."""
    samples = (12.3, 5.4, 8.2, 100.53, 23.5, 13.98)
    histogram = distogram.Distogram(bin_count=10)

    for sample in samples:
        distogram.update(histogram, sample)

    median = distogram.quantile(histogram, 0.5)
    assert median == approx(13.14)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def test_quantile_on_left():
    """Compare low quantiles of a six-bin histogram with ``np.quantile``."""
    data = [12.3, 5.2, 5.4, 4.9, 5.5, 5.6, 8.2, 30.53, 23.5, 13.98]
    histogram = distogram.Distogram(bin_count=6)
    for sample in data:
        distogram.update(histogram, sample)

    # (quantile, relative tolerance) pairs, checked in this order
    for q, tolerance in ((0.01, 0.01), (0.05, 0.05), (0.25, 0.05)):
        reference = np.quantile(data, q)
        assert distogram.quantile(histogram, q) == approx(reference, rel=tolerance)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def test_quantile_on_right():
    """Compare high quantiles of a six-bin histogram with ``np.quantile``."""
    data = [12.3, 8.2, 100.53, 23.5, 13.98, 200, 200.2, 200.8, 200.4, 200.1]
    histogram = distogram.Distogram(bin_count=6)
    for sample in data:
        distogram.update(histogram, sample)

    # Both quantiles must match the NumPy reference within 1% (relative).
    for q in (0.99, 0.85):
        reference = np.quantile(data, q)
        assert distogram.quantile(histogram, q) == approx(reference, rel=0.01)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def test_normal():
    """Compare quantiles of a 64-bin histogram with ``np.quantile``.

    The input is 10000 standard-normal samples; the 0.5 and 0.95 quantiles
    must match the NumPy reference within an absolute tolerance of 0.2.
    """
    # A private, seeded generator yields the same samples on every run, so a
    # failure is reproducible, and it leaves the global ``random`` state alone.
    rng = random.Random(42)
    normal = [rng.normalvariate(0.0, 1.0) for _ in range(10000)]
    h = distogram.Distogram(bin_count=64)

    for sample in normal:
        distogram.update(h, sample)

    assert distogram.quantile(h, 0.5) == approx(np.quantile(normal, 0.5), abs=0.2)
    assert distogram.quantile(h, 0.95) == approx(np.quantile(normal, 0.95), abs=0.2)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def test_quantile_empty():
    """``quantile`` on a histogram that received no updates is expected to be None."""
    empty_histogram = distogram.Distogram()

    observed = distogram.quantile(empty_histogram, 0.3)
    assert observed is None
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def test_quantile_out_of_bouns():
    """``quantile`` is expected to be None for the arguments -0.2 and 10."""
    histogram = distogram.Distogram()

    for sample in (1, 2, 3, 4, 5, 6, 6.7, 6.1):
        distogram.update(histogram, sample)

    negative = distogram.quantile(histogram, -0.2)
    assert negative is None
    too_large = distogram.quantile(histogram, 10)
    assert too_large is None
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# type:ignore
|
|
2
|
+
# isort: skip_file
|
|
3
|
+
import sys
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
|
|
7
|
+
|
|
8
|
+
from opteryx.third_party.maki_nage import distogram
|
|
9
|
+
from pytest import approx
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
import random
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_stats():
    """Compare histogram mean, variance and stddev with their NumPy equivalents.

    The input is 10000 standard-normal samples; each statistic must match the
    NumPy value within an absolute tolerance of 0.1.
    """
    # A private, seeded generator yields the same samples on every run, so a
    # failure is reproducible, and it leaves the global ``random`` state alone.
    rng = random.Random(42)
    normal = [rng.normalvariate(0.0, 1.0) for _ in range(10000)]
    h = distogram.Distogram()

    for sample in normal:
        distogram.update(h, sample)

    assert distogram.mean(h) == approx(np.mean(normal), abs=0.1)
    assert distogram.variance(h) == approx(np.var(normal), abs=0.1)
    assert distogram.stddev(h) == approx(np.std(normal), abs=0.1)
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# type:ignore
|
|
2
|
+
# isort: skip_file
|
|
3
|
+
import sys
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
|
|
7
|
+
|
|
8
|
+
from opteryx.third_party.maki_nage import distogram
|
|
9
|
+
import pytest
|
|
10
|
+
from pytest import approx
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def test_update():
    """Walk a three-bin histogram through inserts, count bumps and one merge.

    The bins are checked after every single update. The final insert of a
    fourth distinct value is expected to leave the first two bins untouched
    and turn the last bin into roughly ``(27.33333, 3)``.
    """
    histogram = distogram.Distogram(bin_count=3)

    # Each step inserts one value and lists the bins expected right afterwards.
    steps = (
        # fill histogram
        (23, [(23, 1)]),
        (28, [(23, 1), (28, 1)]),
        (16, [(16, 1), (23, 1), (28, 1)]),
        # update count on existing value
        (23, [(16, 1), (23, 2), (28, 1)]),
        (28, [(16, 1), (23, 2), (28, 2)]),
        (16, [(16, 2), (23, 2), (28, 2)]),
    )
    for value, expected_bins in steps:
        distogram.update(histogram, value)
        assert histogram.bins == expected_bins

    # merge values: this time the object returned by update() is inspected
    histogram = distogram.update(histogram, 26)
    assert histogram.bins[0] == (16, 2)
    assert histogram.bins[1] == (23, 2)
    merged_bin = histogram.bins[2]
    assert merged_bin[0] == approx(27.33333)
    assert merged_bin[1] == 3
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def test_update_with_invalid_count():
    """``update`` called with ``count=0`` is expected to raise ValueError."""
    histogram = distogram.Distogram(bin_count=3)

    with pytest.raises(ValueError):
        distogram.update(histogram, 23, count=0)
|