datasketches 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/NOTICE +1 -1
- data/README.md +0 -2
- data/ext/datasketches/cpc_wrapper.cpp +3 -3
- data/ext/datasketches/kll_wrapper.cpp +0 -10
- data/lib/datasketches/version.rb +1 -1
- data/lib/datasketches.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +23 -20
- data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
- data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
- data/vendor/datasketches-cpp/Doxyfile +2827 -0
- data/vendor/datasketches-cpp/LICENSE +0 -76
- data/vendor/datasketches-cpp/README.md +1 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +15 -10
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -2
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
- data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
- data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +2 -2
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
- data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
- data/vendor/datasketches-cpp/{python/src/__init__.py → count/CMakeLists.txt} +25 -1
- data/vendor/datasketches-cpp/count/include/count_min.hpp +405 -0
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +497 -0
- data/vendor/datasketches-cpp/{MANIFEST.in → count/test/CMakeLists.txt} +23 -20
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +303 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +14 -11
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_serialize_for_java.cpp +38 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/{tox.ini → density/CMakeLists.txt} +24 -8
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +256 -0
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
- data/vendor/datasketches-cpp/{python/datasketches/__init__.py → density/test/CMakeLists.txt} +15 -3
- data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +94 -61
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +20 -8
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -18
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
- data/vendor/datasketches-cpp/hll/include/hll.hpp +79 -65
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
- data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
- data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +79 -53
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +61 -132
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +5 -40
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +76 -54
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +66 -136
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +15 -39
- data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -4
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +105 -26
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +50 -111
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
- data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +89 -32
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +33 -19
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +13 -10
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +23 -19
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
- data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -51
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -20
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +180 -33
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +10 -10
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +21 -6
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +13 -3
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +15 -1
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +39 -188
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
- data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
- data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
- data/vendor/datasketches-cpp/{python/src/datasketches.cpp → tuple/test/tuple_sketch_serialize_for_java.cpp} +16 -30
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +61 -79
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/pyproject.toml +0 -23
- data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -81
- data/vendor/datasketches-cpp/python/README.md +0 -85
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -104
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -90
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -128
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -134
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -210
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -111
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -204
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -215
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -172
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
- data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -110
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -130
- data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -125
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -126
- data/vendor/datasketches-cpp/python/tests/req_test.py +0 -126
- data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -146
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -125
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +0 -110
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -1,403 +0,0 @@
|
|
1
|
-
{
|
2
|
-
"cells": [
|
3
|
-
{
|
4
|
-
"cell_type": "markdown",
|
5
|
-
"metadata": {},
|
6
|
-
"source": [
|
7
|
-
"## Theta Sketch Examples"
|
8
|
-
]
|
9
|
-
},
|
10
|
-
{
|
11
|
-
"cell_type": "markdown",
|
12
|
-
"metadata": {},
|
13
|
-
"source": [
|
14
|
-
"### Basic Sketch Usage"
|
15
|
-
]
|
16
|
-
},
|
17
|
-
{
|
18
|
-
"cell_type": "code",
|
19
|
-
"execution_count": 1,
|
20
|
-
"metadata": {},
|
21
|
-
"outputs": [],
|
22
|
-
"source": [
|
23
|
-
"from datasketches import theta_sketch, update_theta_sketch, compact_theta_sketch\n",
|
24
|
-
"from datasketches import theta_union, theta_intersection, theta_a_not_b"
|
25
|
-
]
|
26
|
-
},
|
27
|
-
{
|
28
|
-
"cell_type": "markdown",
|
29
|
-
"metadata": {},
|
30
|
-
"source": [
|
31
|
-
"To start, we'll create a sketch with 1 million points in order to demonstrate basic sketch operations."
|
32
|
-
]
|
33
|
-
},
|
34
|
-
{
|
35
|
-
"cell_type": "code",
|
36
|
-
"execution_count": 2,
|
37
|
-
"metadata": {},
|
38
|
-
"outputs": [
|
39
|
-
{
|
40
|
-
"name": "stdout",
|
41
|
-
"output_type": "stream",
|
42
|
-
"text": [
|
43
|
-
"### Theta sketch summary:\n",
|
44
|
-
" num retained entries : 6560\n",
|
45
|
-
" seed hash : 37836\n",
|
46
|
-
" empty? : false\n",
|
47
|
-
" ordered? : false\n",
|
48
|
-
" estimation mode? : true\n",
|
49
|
-
" theta (fraction) : 0.00654224\n",
|
50
|
-
" theta (raw 64-bit) : 60341508738660257\n",
|
51
|
-
" estimate : 1.00271e+06\n",
|
52
|
-
" lower bound 95% conf : 978261\n",
|
53
|
-
" upper bound 95% conf : 1.02778e+06\n",
|
54
|
-
" lg nominal size : 12\n",
|
55
|
-
" lg current size : 13\n",
|
56
|
-
" resize factor : 8\n",
|
57
|
-
"### End sketch summary\n",
|
58
|
-
"\n"
|
59
|
-
]
|
60
|
-
}
|
61
|
-
],
|
62
|
-
"source": [
|
63
|
-
"n = 1000000\n",
|
64
|
-
"k = 12\n",
|
65
|
-
"sk1 = update_theta_sketch(k)\n",
|
66
|
-
"for i in range(0, n):\n",
|
67
|
-
" sk1.update(i)\n",
|
68
|
-
"print(sk1)"
|
69
|
-
]
|
70
|
-
},
|
71
|
-
{
|
72
|
-
"cell_type": "markdown",
|
73
|
-
"metadata": {},
|
74
|
-
"source": [
|
75
|
-
"The summary contains most data of interest, but we can also query for specific information. And in this case, since we know the exact number of distinct items presented to the sketch, we can look at the estimate, upper, and lower bounds as a percentage of the exact value."
|
76
|
-
]
|
77
|
-
},
|
78
|
-
{
|
79
|
-
"cell_type": "code",
|
80
|
-
"execution_count": 3,
|
81
|
-
"metadata": {},
|
82
|
-
"outputs": [
|
83
|
-
{
|
84
|
-
"name": "stdout",
|
85
|
-
"output_type": "stream",
|
86
|
-
"text": [
|
87
|
-
"Upper bound (1 std. dev) as % of true value:\t 101.5208\n",
|
88
|
-
"Sketch estimate as % of true value:\t\t 100.2715\n",
|
89
|
-
"Lower bound (1 std. dev) as % of true value:\t 99.0374\n"
|
90
|
-
]
|
91
|
-
}
|
92
|
-
],
|
93
|
-
"source": [
|
94
|
-
"print(\"Upper bound (1 std. dev) as % of true value:\\t\", round(100*sk1.get_upper_bound(1) / n, 4))\n",
|
95
|
-
"print(\"Sketch estimate as % of true value:\\t\\t\", round(100*sk1.get_estimate() / n, 4))\n",
|
96
|
-
"print(\"Lower bound (1 std. dev) as % of true value:\\t\", round(100*sk1.get_lower_bound(1) / n, 4))"
|
97
|
-
]
|
98
|
-
},
|
99
|
-
{
|
100
|
-
"cell_type": "markdown",
|
101
|
-
"metadata": {},
|
102
|
-
"source": [
|
103
|
-
"We can serialize and reconstruct the sketch. Serialization necessarily produces a compact sketch, meaning the sketch can be deserialized and queried or used for further unions or set operations but cannot be updated directly."
|
104
|
-
]
|
105
|
-
},
|
106
|
-
{
|
107
|
-
"cell_type": "code",
|
108
|
-
"execution_count": 4,
|
109
|
-
"metadata": {},
|
110
|
-
"outputs": [
|
111
|
-
{
|
112
|
-
"data": {
|
113
|
-
"text/plain": [
|
114
|
-
"52504"
|
115
|
-
]
|
116
|
-
},
|
117
|
-
"execution_count": 4,
|
118
|
-
"metadata": {},
|
119
|
-
"output_type": "execute_result"
|
120
|
-
}
|
121
|
-
],
|
122
|
-
"source": [
|
123
|
-
"sk1_bytes = sk1.compact().serialize()\n",
|
124
|
-
"len(sk1_bytes)"
|
125
|
-
]
|
126
|
-
},
|
127
|
-
{
|
128
|
-
"cell_type": "code",
|
129
|
-
"execution_count": 5,
|
130
|
-
"metadata": {},
|
131
|
-
"outputs": [
|
132
|
-
{
|
133
|
-
"name": "stdout",
|
134
|
-
"output_type": "stream",
|
135
|
-
"text": [
|
136
|
-
"Estimate: \t\t 1002714.745231455\n",
|
137
|
-
"Estimation mode: \t True\n"
|
138
|
-
]
|
139
|
-
}
|
140
|
-
],
|
141
|
-
"source": [
|
142
|
-
"new_sk1 = compact_theta_sketch.deserialize(sk1_bytes)\n",
|
143
|
-
"print(\"Estimate: \\t\\t\", new_sk1.get_estimate())\n",
|
144
|
-
"print(\"Estimation mode: \\t\", new_sk1.is_estimation_mode())"
|
145
|
-
]
|
146
|
-
},
|
147
|
-
{
|
148
|
-
"cell_type": "markdown",
|
149
|
-
"metadata": {},
|
150
|
-
"source": [
|
151
|
-
"### Sketch Unions"
|
152
|
-
]
|
153
|
-
},
|
154
|
-
{
|
155
|
-
"cell_type": "markdown",
|
156
|
-
"metadata": {},
|
157
|
-
"source": [
|
158
|
-
"Theta Sketch unions make use of a separate union object. The union will accept input sketches with different values of $k$.\n",
|
159
|
-
"\n",
|
160
|
-
"For this example, we will create a sketch with distinct values that partially overlap those in `sk1`."
|
161
|
-
]
|
162
|
-
},
|
163
|
-
{
|
164
|
-
"cell_type": "code",
|
165
|
-
"execution_count": 6,
|
166
|
-
"metadata": {},
|
167
|
-
"outputs": [
|
168
|
-
{
|
169
|
-
"name": "stdout",
|
170
|
-
"output_type": "stream",
|
171
|
-
"text": [
|
172
|
-
"### Theta sketch summary:\n",
|
173
|
-
" num retained entries : 12488\n",
|
174
|
-
" seed hash : 37836\n",
|
175
|
-
" empty? : false\n",
|
176
|
-
" ordered? : false\n",
|
177
|
-
" estimation mode? : true\n",
|
178
|
-
" theta (fraction) : 0.0123336\n",
|
179
|
-
" theta (raw 64-bit) : 113757656857900725\n",
|
180
|
-
" estimate : 1.01252e+06\n",
|
181
|
-
" lower bound 95% conf : 994626\n",
|
182
|
-
" upper bound 95% conf : 1.03073e+06\n",
|
183
|
-
" lg nominal size : 13\n",
|
184
|
-
" lg current size : 14\n",
|
185
|
-
" resize factor : 8\n",
|
186
|
-
"### End sketch summary\n",
|
187
|
-
"\n"
|
188
|
-
]
|
189
|
-
}
|
190
|
-
],
|
191
|
-
"source": [
|
192
|
-
"offset = int(3 * n / 4)\n",
|
193
|
-
"sk2 = update_theta_sketch(k+1)\n",
|
194
|
-
"for i in range(0, n):\n",
|
195
|
-
" sk2.update(i + offset)\n",
|
196
|
-
"print(sk2)"
|
197
|
-
]
|
198
|
-
},
|
199
|
-
{
|
200
|
-
"cell_type": "markdown",
|
201
|
-
"metadata": {},
|
202
|
-
"source": [
|
203
|
-
"We can now feed the sketches into the union. As constructed, the exact number of unique values presented to the two sketches is $\\frac{7}{4}n$."
|
204
|
-
]
|
205
|
-
},
|
206
|
-
{
|
207
|
-
"cell_type": "code",
|
208
|
-
"execution_count": null,
|
209
|
-
"metadata": {},
|
210
|
-
"outputs": [],
|
211
|
-
"source": []
|
212
|
-
},
|
213
|
-
{
|
214
|
-
"cell_type": "code",
|
215
|
-
"execution_count": 7,
|
216
|
-
"metadata": {},
|
217
|
-
"outputs": [
|
218
|
-
{
|
219
|
-
"name": "stdout",
|
220
|
-
"output_type": "stream",
|
221
|
-
"text": [
|
222
|
-
"Union estimate as % of true value: 99.6787\n"
|
223
|
-
]
|
224
|
-
}
|
225
|
-
],
|
226
|
-
"source": [
|
227
|
-
"union = theta_union(k)\n",
|
228
|
-
"union.update(sk1)\n",
|
229
|
-
"union.update(sk2)\n",
|
230
|
-
"result = union.get_result()\n",
|
231
|
-
"print(\"Union estimate as % of true value: \", round(100*result.get_estimate()/(1.75*n), 4))"
|
232
|
-
]
|
233
|
-
},
|
234
|
-
{
|
235
|
-
"cell_type": "markdown",
|
236
|
-
"metadata": {},
|
237
|
-
"source": [
|
238
|
-
"### Sketch Intersections"
|
239
|
-
]
|
240
|
-
},
|
241
|
-
{
|
242
|
-
"cell_type": "markdown",
|
243
|
-
"metadata": {},
|
244
|
-
"source": [
|
245
|
-
"Beyond unions, theta sketches also support intersections through the use of an intersection object. These set intersections can have vastly better error bounds than the classic inclusion-exclusion rule used with sketches like HLL."
|
246
|
-
]
|
247
|
-
},
|
248
|
-
{
|
249
|
-
"cell_type": "code",
|
250
|
-
"execution_count": 8,
|
251
|
-
"metadata": {},
|
252
|
-
"outputs": [
|
253
|
-
{
|
254
|
-
"name": "stdout",
|
255
|
-
"output_type": "stream",
|
256
|
-
"text": [
|
257
|
-
"Has result: True\n",
|
258
|
-
"### Theta sketch summary:\n",
|
259
|
-
" num retained entries : 1668\n",
|
260
|
-
" seed hash : 37836\n",
|
261
|
-
" empty? : false\n",
|
262
|
-
" ordered? : true\n",
|
263
|
-
" estimation mode? : true\n",
|
264
|
-
" theta (fraction) : 0.00654224\n",
|
265
|
-
" theta (raw 64-bit) : 60341508738660257\n",
|
266
|
-
" estimate : 254959\n",
|
267
|
-
" lower bound 95% conf : 242739\n",
|
268
|
-
" upper bound 95% conf : 267789\n",
|
269
|
-
"### End sketch summary\n",
|
270
|
-
"\n"
|
271
|
-
]
|
272
|
-
}
|
273
|
-
],
|
274
|
-
"source": [
|
275
|
-
"intersection = theta_intersection()\n",
|
276
|
-
"intersection.update(sk1)\n",
|
277
|
-
"intersection.update(sk2)\n",
|
278
|
-
"print(\"Has result: \", intersection.has_result())\n",
|
279
|
-
"result = intersection.get_result()\n",
|
280
|
-
"print(result)"
|
281
|
-
]
|
282
|
-
},
|
283
|
-
{
|
284
|
-
"cell_type": "markdown",
|
285
|
-
"metadata": {},
|
286
|
-
"source": [
|
287
|
-
"In this case, we expect the sets to have an overlap of $\\frac{1}{4}n$."
|
288
|
-
]
|
289
|
-
},
|
290
|
-
{
|
291
|
-
"cell_type": "code",
|
292
|
-
"execution_count": 9,
|
293
|
-
"metadata": {},
|
294
|
-
"outputs": [
|
295
|
-
{
|
296
|
-
"name": "stdout",
|
297
|
-
"output_type": "stream",
|
298
|
-
"text": [
|
299
|
-
"Intersection estimate as % of true value: 101.9834\n"
|
300
|
-
]
|
301
|
-
}
|
302
|
-
],
|
303
|
-
"source": [
|
304
|
-
"print(\"Intersection estimate as % of true value: \", round(100*result.get_estimate()/(0.25*n), 4))"
|
305
|
-
]
|
306
|
-
},
|
307
|
-
{
|
308
|
-
"cell_type": "markdown",
|
309
|
-
"metadata": {},
|
310
|
-
"source": [
|
311
|
-
"### Set Subtraction (A-not-B)"
|
312
|
-
]
|
313
|
-
},
|
314
|
-
{
|
315
|
-
"cell_type": "markdown",
|
316
|
-
"metadata": {},
|
317
|
-
"source": [
|
318
|
-
"Finally, we have the set subtraction operation. Unlike `theta_union` and `theta_intersection`, `theta_a_not_b` always takes as input 2 sketches at a time, namely $a$ and $b$, and directly returns the result as a sketch."
|
319
|
-
]
|
320
|
-
},
|
321
|
-
{
|
322
|
-
"cell_type": "code",
|
323
|
-
"execution_count": 10,
|
324
|
-
"metadata": {},
|
325
|
-
"outputs": [
|
326
|
-
{
|
327
|
-
"name": "stdout",
|
328
|
-
"output_type": "stream",
|
329
|
-
"text": [
|
330
|
-
"### Theta sketch summary:\n",
|
331
|
-
" num retained entries : 4892\n",
|
332
|
-
" seed hash : 37836\n",
|
333
|
-
" empty? : false\n",
|
334
|
-
" ordered? : true\n",
|
335
|
-
" estimation mode? : true\n",
|
336
|
-
" theta (fraction) : 0.00654224\n",
|
337
|
-
" theta (raw 64-bit) : 60341508738660257\n",
|
338
|
-
" estimate : 747756\n",
|
339
|
-
" lower bound 95% conf : 726670\n",
|
340
|
-
" upper bound 95% conf : 769452\n",
|
341
|
-
"### End sketch summary\n",
|
342
|
-
"\n"
|
343
|
-
]
|
344
|
-
}
|
345
|
-
],
|
346
|
-
"source": [
|
347
|
-
"anb = theta_a_not_b()\n",
|
348
|
-
"result = anb.compute(sk1, sk2)\n",
|
349
|
-
"print(result)"
|
350
|
-
]
|
351
|
-
},
|
352
|
-
{
|
353
|
-
"cell_type": "markdown",
|
354
|
-
"metadata": {},
|
355
|
-
"source": [
|
356
|
-
"By using the same two sketches as before, the expected result here is $\\frac{3}{4}n$."
|
357
|
-
]
|
358
|
-
},
|
359
|
-
{
|
360
|
-
"cell_type": "code",
|
361
|
-
"execution_count": 11,
|
362
|
-
"metadata": {},
|
363
|
-
"outputs": [
|
364
|
-
{
|
365
|
-
"name": "stdout",
|
366
|
-
"output_type": "stream",
|
367
|
-
"text": [
|
368
|
-
"A-not-B estimate as % of true value: 99.7008\n"
|
369
|
-
]
|
370
|
-
}
|
371
|
-
],
|
372
|
-
"source": [
|
373
|
-
"print(\"A-not-B estimate as % of true value: \", round(100*result.get_estimate()/(0.75*n), 4))"
|
374
|
-
]
|
375
|
-
}
|
376
|
-
],
|
377
|
-
"metadata": {
|
378
|
-
"kernelspec": {
|
379
|
-
"display_name": "Python 3.10.6 64-bit",
|
380
|
-
"language": "python",
|
381
|
-
"name": "python3"
|
382
|
-
},
|
383
|
-
"language_info": {
|
384
|
-
"codemirror_mode": {
|
385
|
-
"name": "ipython",
|
386
|
-
"version": 3
|
387
|
-
},
|
388
|
-
"file_extension": ".py",
|
389
|
-
"mimetype": "text/x-python",
|
390
|
-
"name": "python",
|
391
|
-
"nbconvert_exporter": "python",
|
392
|
-
"pygments_lexer": "ipython3",
|
393
|
-
"version": "3.10.6"
|
394
|
-
},
|
395
|
-
"vscode": {
|
396
|
-
"interpreter": {
|
397
|
-
"hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
|
398
|
-
}
|
399
|
-
}
|
400
|
-
},
|
401
|
-
"nbformat": 4,
|
402
|
-
"nbformat_minor": 2
|
403
|
-
}
|
@@ -1,21 +0,0 @@
|
|
1
|
-
:: Licensed to the Apache Software Foundation (ASF) under one
|
2
|
-
:: or more contributor license agreements. See the NOTICE file
|
3
|
-
:: distributed with this work for additional information
|
4
|
-
:: regarding copyright ownership. The ASF licenses this file
|
5
|
-
:: to you under the Apache License, Version 2.0 (the
|
6
|
-
:: "License"); you may not use this file except in compliance
|
7
|
-
:: with the License. You may obtain a copy of the License at
|
8
|
-
::
|
9
|
-
:: http://www.apache.org/licenses/LICENSE-2.0
|
10
|
-
::
|
11
|
-
:: Unless required by applicable law or agreed to in writing,
|
12
|
-
:: software distributed under the License is distributed on an
|
13
|
-
:: "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
-
:: KIND, either express or implied. See the License for the
|
15
|
-
:: specific language governing permissions and limitations
|
16
|
-
:: under the License.
|
17
|
-
|
18
|
-
|
19
|
-
@echo off
|
20
|
-
:: Takes path to the Python interpreter and returns the path to pybind11
|
21
|
-
%1 -c "import pybind11,sys;sys.stdout.write(pybind11.get_cmake_dir())"
|
@@ -1,90 +0,0 @@
|
|
1
|
-
/*
|
2
|
-
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
-
* or more contributor license agreements. See the NOTICE file
|
4
|
-
* distributed with this work for additional information
|
5
|
-
* regarding copyright ownership. The ASF licenses this file
|
6
|
-
* to you under the Apache License, Version 2.0 (the
|
7
|
-
* "License"); you may not use this file except in compliance
|
8
|
-
* with the License. You may obtain a copy of the License at
|
9
|
-
*
|
10
|
-
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
-
*
|
12
|
-
* Unless required by applicable law or agreed to in writing,
|
13
|
-
* software distributed under the License is distributed on an
|
14
|
-
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
-
* KIND, either express or implied. See the License for the
|
16
|
-
* specific language governing permissions and limitations
|
17
|
-
* under the License.
|
18
|
-
*/
|
19
|
-
|
20
|
-
#include <sstream>
|
21
|
-
#include <pybind11/pybind11.h>
|
22
|
-
|
23
|
-
#include "cpc_sketch.hpp"
|
24
|
-
#include "cpc_union.hpp"
|
25
|
-
#include "cpc_common.hpp"
|
26
|
-
#include "common_defs.hpp"
|
27
|
-
|
28
|
-
namespace py = pybind11;
|
29
|
-
|
30
|
-
namespace datasketches {
|
31
|
-
namespace python {
|
32
|
-
|
33
|
-
cpc_sketch* cpc_sketch_deserialize(py::bytes skBytes) {
|
34
|
-
std::string skStr = skBytes; // implicit cast
|
35
|
-
return new cpc_sketch(cpc_sketch::deserialize(skStr.c_str(), skStr.length()));
|
36
|
-
}
|
37
|
-
|
38
|
-
py::object cpc_sketch_serialize(const cpc_sketch& sk) {
|
39
|
-
auto serResult = sk.serialize();
|
40
|
-
return py::bytes((char*)serResult.data(), serResult.size());
|
41
|
-
}
|
42
|
-
|
43
|
-
cpc_sketch* cpc_union_get_result(const cpc_union& u) {
|
44
|
-
return new cpc_sketch(u.get_result());
|
45
|
-
}
|
46
|
-
|
47
|
-
}
|
48
|
-
}
|
49
|
-
|
50
|
-
namespace dspy = datasketches::python;
|
51
|
-
|
52
|
-
void init_cpc(py::module &m) {
|
53
|
-
using namespace datasketches;
|
54
|
-
|
55
|
-
py::class_<cpc_sketch>(m, "cpc_sketch")
|
56
|
-
.def(py::init<uint8_t, uint64_t>(), py::arg("lg_k")=cpc_constants::DEFAULT_LG_K, py::arg("seed")=DEFAULT_SEED)
|
57
|
-
.def(py::init<const cpc_sketch&>())
|
58
|
-
.def("__str__", &cpc_sketch::to_string,
|
59
|
-
"Produces a string summary of the sketch")
|
60
|
-
.def("to_string", &cpc_sketch::to_string,
|
61
|
-
"Produces a string summary of the sketch")
|
62
|
-
.def("serialize", &dspy::cpc_sketch_serialize,
|
63
|
-
"Serializes the sketch into a bytes object")
|
64
|
-
.def_static("deserialize", &dspy::cpc_sketch_deserialize,
|
65
|
-
"Reads a bytes object and returns the corresponding cpc_sketch")
|
66
|
-
.def<void (cpc_sketch::*)(uint64_t)>("update", &cpc_sketch::update, py::arg("datum"),
|
67
|
-
"Updates the sketch with the given 64-bit integer value")
|
68
|
-
.def<void (cpc_sketch::*)(double)>("update", &cpc_sketch::update, py::arg("datum"),
|
69
|
-
"Updates the sketch with the given 64-bit floating point")
|
70
|
-
.def<void (cpc_sketch::*)(const std::string&)>("update", &cpc_sketch::update, py::arg("datum"),
|
71
|
-
"Updates the sketch with the given string")
|
72
|
-
.def("is_empty", &cpc_sketch::is_empty,
|
73
|
-
"Returns True if the sketch is empty, otherwise False")
|
74
|
-
.def("get_estimate", &cpc_sketch::get_estimate,
|
75
|
-
"Estimate of the distinct count of the input stream")
|
76
|
-
.def("get_lower_bound", &cpc_sketch::get_lower_bound, py::arg("kappa"),
|
77
|
-
"Returns an approximate lower bound on the estimate for kappa values in {1, 2, 3}, roughly corresponding to standard deviations")
|
78
|
-
.def("get_upper_bound", &cpc_sketch::get_upper_bound, py::arg("kappa"),
|
79
|
-
"Returns an approximate upper bound on the estimate for kappa values in {1, 2, 3}, roughly corresponding to standard deviations")
|
80
|
-
;
|
81
|
-
|
82
|
-
py::class_<cpc_union>(m, "cpc_union")
|
83
|
-
.def(py::init<uint8_t, uint64_t>(), py::arg("lg_k"), py::arg("seed")=DEFAULT_SEED)
|
84
|
-
.def(py::init<const cpc_union&>())
|
85
|
-
.def("update", (void (cpc_union::*)(const cpc_sketch&)) &cpc_union::update, py::arg("sketch"),
|
86
|
-
"Updates the union with the provided CPC sketch")
|
87
|
-
.def("get_result", &dspy::cpc_union_get_result,
|
88
|
-
"Returns a CPC sketch with the result of the union")
|
89
|
-
;
|
90
|
-
}
|
@@ -1,128 +0,0 @@
|
|
1
|
-
/*
|
2
|
-
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
-
* or more contributor license agreements. See the NOTICE file
|
4
|
-
* distributed with this work for additional information
|
5
|
-
* regarding copyright ownership. The ASF licenses this file
|
6
|
-
* to you under the Apache License, Version 2.0 (the
|
7
|
-
* "License"); you may not use this file except in compliance
|
8
|
-
* with the License. You may obtain a copy of the License at
|
9
|
-
*
|
10
|
-
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
-
*
|
12
|
-
* Unless required by applicable law or agreed to in writing,
|
13
|
-
* software distributed under the License is distributed on an
|
14
|
-
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
-
* KIND, either express or implied. See the License for the
|
16
|
-
* specific language governing permissions and limitations
|
17
|
-
* under the License.
|
18
|
-
*/
|
19
|
-
|
20
|
-
#include "frequent_items_sketch.hpp"
|
21
|
-
|
22
|
-
#include <pybind11/pybind11.h>
|
23
|
-
#include <sstream>
|
24
|
-
|
25
|
-
namespace py = pybind11;
|
26
|
-
|
27
|
-
namespace datasketches {
|
28
|
-
namespace python {
|
29
|
-
|
30
|
-
template<typename T>
|
31
|
-
frequent_items_sketch<T> fi_sketch_deserialize(py::bytes skBytes) {
|
32
|
-
std::string skStr = skBytes; // implicit cast
|
33
|
-
return frequent_items_sketch<T>::deserialize(skStr.c_str(), skStr.length());
|
34
|
-
}
|
35
|
-
|
36
|
-
template<typename T>
|
37
|
-
py::object fi_sketch_serialize(const frequent_items_sketch<T>& sk) {
|
38
|
-
auto serResult = sk.serialize();
|
39
|
-
return py::bytes((char*)serResult.data(), serResult.size());
|
40
|
-
}
|
41
|
-
|
42
|
-
// maybe possible to disambiguate the static vs method get_epsilon calls, but
|
43
|
-
// this is easier for now
|
44
|
-
template<typename T>
|
45
|
-
double fi_sketch_get_generic_epsilon(uint8_t lg_max_map_size) {
|
46
|
-
return frequent_items_sketch<T>::get_epsilon(lg_max_map_size);
|
47
|
-
}
|
48
|
-
|
49
|
-
template<typename T>
|
50
|
-
py::list fi_sketch_get_frequent_items(const frequent_items_sketch<T>& sk,
|
51
|
-
frequent_items_error_type err_type,
|
52
|
-
uint64_t threshold = 0) {
|
53
|
-
if (threshold == 0) { threshold = sk.get_maximum_error(); }
|
54
|
-
|
55
|
-
py::list list;
|
56
|
-
auto items = sk.get_frequent_items(err_type, threshold);
|
57
|
-
for (auto iter = items.begin(); iter != items.end(); ++iter) {
|
58
|
-
py::tuple t = py::make_tuple(iter->get_item(),
|
59
|
-
iter->get_estimate(),
|
60
|
-
iter->get_lower_bound(),
|
61
|
-
iter->get_upper_bound());
|
62
|
-
list.append(t);
|
63
|
-
}
|
64
|
-
return list;
|
65
|
-
}
|
66
|
-
|
67
|
-
template<typename T>
|
68
|
-
size_t fi_sketch_get_serialized_size_bytes(const frequent_items_sketch<T>& sk) {
|
69
|
-
return sk.get_serialized_size_bytes();
|
70
|
-
}
|
71
|
-
|
72
|
-
}
|
73
|
-
}
|
74
|
-
|
75
|
-
namespace dspy = datasketches::python;
|
76
|
-
|
77
|
-
template<typename T>
|
78
|
-
void bind_fi_sketch(py::module &m, const char* name) {
|
79
|
-
using namespace datasketches;
|
80
|
-
|
81
|
-
py::class_<frequent_items_sketch<T>>(m, name)
|
82
|
-
.def(py::init<uint8_t>(), py::arg("lg_max_k"))
|
83
|
-
.def("__str__", &frequent_items_sketch<T>::to_string, py::arg("print_items")=false,
|
84
|
-
"Produces a string summary of the sketch")
|
85
|
-
.def("to_string", &frequent_items_sketch<T>::to_string, py::arg("print_items")=false,
|
86
|
-
"Produces a string summary of the sketch")
|
87
|
-
.def("update", (void (frequent_items_sketch<T>::*)(const T&, uint64_t)) &frequent_items_sketch<T>::update, py::arg("item"), py::arg("weight")=1,
|
88
|
-
"Updates the sketch with the given string and, optionally, a weight")
|
89
|
-
.def("get_frequent_items", &dspy::fi_sketch_get_frequent_items<T>, py::arg("err_type"), py::arg("threshold")=0)
|
90
|
-
.def("merge", (void (frequent_items_sketch<T>::*)(const frequent_items_sketch<T>&)) &frequent_items_sketch<T>::merge,
|
91
|
-
"Merges the given sketch into this one")
|
92
|
-
.def("is_empty", &frequent_items_sketch<T>::is_empty,
|
93
|
-
"Returns True if the sketch is empty, otherwise False")
|
94
|
-
.def("get_num_active_items", &frequent_items_sketch<T>::get_num_active_items,
|
95
|
-
"Returns the number of active items in the sketch")
|
96
|
-
.def("get_total_weight", &frequent_items_sketch<T>::get_total_weight,
|
97
|
-
"Returns the sum of the weights (frequencies) in the stream seen so far by the sketch")
|
98
|
-
.def("get_estimate", &frequent_items_sketch<T>::get_estimate, py::arg("item"),
|
99
|
-
"Returns the estimate of the weight (frequency) of the given item.\n"
|
100
|
-
"Note: The true frequency of an item would be the sum of the counts as a result of the "
|
101
|
-
"two update functions.")
|
102
|
-
.def("get_lower_bound", &frequent_items_sketch<T>::get_lower_bound, py::arg("item"),
|
103
|
-
"Returns the guaranteed lower bound weight (frequency) of the given item.")
|
104
|
-
.def("get_upper_bound", &frequent_items_sketch<T>::get_upper_bound, py::arg("item"),
|
105
|
-
"Returns the guaranteed upper bound weight (frequency) of the given item.")
|
106
|
-
.def("get_sketch_epsilon", (double (frequent_items_sketch<T>::*)(void) const) &frequent_items_sketch<T>::get_epsilon,
|
107
|
-
"Returns the epsilon value used by the sketch to compute error")
|
108
|
-
.def_static("get_epsilon_for_lg_size", &dspy::fi_sketch_get_generic_epsilon<T>, py::arg("lg_max_map_size"),
|
109
|
-
"Returns the epsilon value used to compute a priori error for a given log2(max_map_size)")
|
110
|
-
.def_static("get_apriori_error", &frequent_items_sketch<T>::get_apriori_error, py::arg("lg_max_map_size"), py::arg("estimated_total_weight"),
|
111
|
-
"Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight.")
|
112
|
-
.def("get_serialized_size_bytes", &dspy::fi_sketch_get_serialized_size_bytes<T>,
|
113
|
-
"Computes the size needed to serialize the current state of the sketch. This can be expensive since every item needs to be looked at.")
|
114
|
-
.def("serialize", &dspy::fi_sketch_serialize<T>, "Serializes the sketch into a bytes object")
|
115
|
-
.def_static("deserialize", &dspy::fi_sketch_deserialize<T>, "Reads a bytes object and returns the corresponding frequent_strings_sketch")
|
116
|
-
;
|
117
|
-
}
|
118
|
-
|
119
|
-
void init_fi(py::module &m) {
|
120
|
-
using namespace datasketches;
|
121
|
-
|
122
|
-
py::enum_<frequent_items_error_type>(m, "frequent_items_error_type")
|
123
|
-
.value("NO_FALSE_POSITIVES", NO_FALSE_POSITIVES)
|
124
|
-
.value("NO_FALSE_NEGATIVES", NO_FALSE_NEGATIVES)
|
125
|
-
.export_values();
|
126
|
-
|
127
|
-
bind_fi_sketch<std::string>(m, "frequent_strings_sketch");
|
128
|
-
}
|