clip-protocol 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clip_protocol/__init__.py +0 -0
- clip_protocol/cli.py +55 -0
- clip_protocol/count_mean/__init__.py +18 -0
- clip_protocol/count_mean/cms_client_mean.py +144 -0
- clip_protocol/count_mean/private_cms_client.py +115 -0
- clip_protocol/count_mean/private_cms_server.py +203 -0
- clip_protocol/hadamard_count_mean/__init__.py +17 -0
- clip_protocol/hadamard_count_mean/private_hcms_client.py +127 -0
- clip_protocol/hadamard_count_mean/private_hcms_server.py +149 -0
- clip_protocol/main/__init__.py +0 -0
- clip_protocol/main/agregate.py +0 -0
- clip_protocol/main/estimate.py +0 -0
- clip_protocol/main/general_method.py +96 -0
- clip_protocol/main/individual_method.py +198 -0
- clip_protocol/main/mask.py +171 -0
- clip_protocol/main/setup.py +197 -0
- clip_protocol/scripts/__init__.py +9 -0
- clip_protocol/scripts/parameter_fitting.py +276 -0
- clip_protocol/scripts/preprocess.py +98 -0
- clip_protocol/scripts/server.py +128 -0
- clip_protocol/utils/__init__.py +0 -0
- clip_protocol/utils/errors.py +35 -0
- clip_protocol/utils/utils.py +145 -0
- clip_protocol-2.0.0.dist-info/METADATA +137 -0
- clip_protocol-2.0.0.dist-info/RECORD +28 -0
- clip_protocol-2.0.0.dist-info/WHEEL +4 -0
- clip_protocol-2.0.0.dist-info/entry_points.txt +3 -0
- clip_protocol-2.0.0.dist-info/licenses/LICENSE +21 -0
|
File without changes
|
clip_protocol/cli.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import os
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from colorama import Style
|
|
5
|
+
|
|
6
|
+
from clip_protocol.main.individual_method import run_individual_method
|
|
7
|
+
from clip_protocol.main.general_method import run_general_method
|
|
8
|
+
|
|
9
|
+
def main():
    """Command-line entry point for the individual privatization method.

    Reads two positional arguments (``file_path`` and ``output_path``), loads
    the dataset with ``pandas.read_excel``, hands it to
    ``run_individual_method`` and writes the returned frame to
    ``private_database.csv`` inside ``output_path``.

    Raises:
        FileNotFoundError: If ``file_path`` or ``output_path`` does not exist.
    """
    cli = argparse.ArgumentParser(description="Run the individual method for private frequency estimation.")
    cli.add_argument("file_path", type=str, help="The path to the input dataset file.")
    cli.add_argument("output_path", type=str, help="The path to the output where you want the final database to be saved.")
    options = cli.parse_args()
    source = options.file_path
    target_dir = options.output_path

    # The input path is checked first, so its error wins when both are missing.
    if not os.path.exists(source):
        raise FileNotFoundError(f"File not found at {source}")
    if not os.path.exists(target_dir):
        raise FileNotFoundError(f"Output path not found at {target_dir}")

    print(f"Processing {Style.BRIGHT}{os.path.basename(source)}{Style.RESET_ALL}")
    private_frame = run_individual_method(pd.read_excel(source))

    destination = os.path.join(target_dir, 'private_database.csv')
    private_frame.to_csv(destination, index=False)
    print(f"{Style.BRIGHT}Private dataset saved at {target_dir}{Style.RESET_ALL}")
|
|
28
|
+
|
|
29
|
+
def main_general():
    """Command-line entry point for the general privatization method.

    Reads two positional arguments (``file_path`` and ``output_path``), loads
    the dataset with ``pandas.read_excel`` and passes it to
    ``run_general_method``. The result is iterated as a mapping of
    ``user -> privatized data``; each value is written to
    ``<output_path>/<user>.csv``.

    Raises:
        FileNotFoundError: If ``file_path`` or ``output_path`` does not exist.
    """
    parser = argparse.ArgumentParser(description="Run the general method for private frequency estimation.")
    parser.add_argument("file_path", type=str, help="The path to the input dataset file.")
    parser.add_argument("output_path", type=str, help="The path to the output where you want the final database to be saved.")
    args = parser.parse_args()

    if not os.path.exists(args.file_path):
        raise FileNotFoundError(f"File not found at {args.file_path}")
    if not os.path.exists(args.output_path):
        raise FileNotFoundError(f"Output path not found at {args.output_path}")

    file_name = os.path.basename(args.file_path)
    print(f"Processing {Style.BRIGHT}{file_name}{Style.RESET_ALL}")
    df = pd.read_excel(args.file_path)

    priv = run_general_method(df)

    for user, privatized_data in priv.items():
        output_file = os.path.join(args.output_path, f'{user}.csv')
        # Write to the per-user file. Passing the directory itself (as the
        # previous code did) never produced the per-user CSVs.
        privatized_data.to_csv(output_file, index=False)

    print(f"{Style.BRIGHT}Private datasets saved at {args.output_path}{Style.RESET_ALL}")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This subpackage contains implementations of algorithms for calculating private count means
|
|
3
|
+
using the Count-Mean-Sketch (CMS) approach. The Count-Mean-Sketch is a probabilistic data
|
|
4
|
+
structure that allows for efficient frequency estimation while providing differential privacy
|
|
5
|
+
guarantees. This subpackage includes both client-side and server-side implementations for
|
|
6
|
+
privacy-preserving data aggregation.
|
|
7
|
+
|
|
8
|
+
Modules:
|
|
9
|
+
- cms_client_mean.py: Implements the client-side logic for generating private count means.
|
|
10
|
+
- private_cms_client.py: Contains the client-side logic for perturbing data before sending it to the server.
|
|
11
|
+
- private_cms_server.py: Implements the server-side logic for aggregating and analyzing perturbed data.
|
|
12
|
+
|
|
13
|
+
Main Functions:
|
|
14
|
+
- execute_client: Simulates the client side of the privatized Count-Mean Sketch for all elements in the dataset.
|
|
15
|
+
- server_simulator: Simulates the server side of the privatized Count-Mean Sketch, processes the privatized data, and estimates frequencies.
|
|
16
|
+
- update_sketch_matrix: Updates the sketch matrix based on the privatized data received from the client.
|
|
17
|
+
- estimate_client: Estimates the frequency of an element based on the private CMS sketch matrix.
|
|
18
|
+
"""
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import numpy as np
|
|
3
|
+
from sympy import primerange
|
|
4
|
+
from rich.progress import Progress
|
|
5
|
+
|
|
6
|
+
from clip_protocol.utils.utils import generate_hash_functions, display_results
|
|
7
|
+
|
|
8
|
+
class CMSClient:
    """
    Non-private Count-Mean Sketch (CMS) over the ``value`` column of a DataFrame.

    Attributes:
        df: DataFrame containing the dataset (must expose a ``value`` column).
        k: Number of hash functions (rows of the sketch matrix).
        m: Number of buckets per hash function (columns of the sketch matrix).
        dataset: List of values taken from ``df['value']``.
        domain: Unique values of ``df['value']``.
        N: Number of elements in ``dataset``.
        M: ``k x m`` sketch matrix, initialised to zeros.
        H: Hash functions; ``H[i](d)`` is used as a column index into row ``i``.
        coefs: Second value returned by ``generate_hash_functions``.

    Methods:
        client(d):
            Returns a +/-1 vector for ``d`` and the index of the hash used.
        update_sketch_matrix(d):
            Adds one occurrence of ``d`` to every row of the sketch.
        estimate_client(d):
            Averages the ``k`` counters that ``d`` hashes to.
        server_simulator():
            Feeds the whole dataset into the sketch and estimates every
            element of the domain.
    """

    def __init__(self, k, m, df):
        """
        Initializes the CMSClient with the given parameters.

        Args:
            k (int): Number of hash functions.
            m (int): Number of buckets per hash function.
            df (DataFrame): Dataset; its ``value`` column is read.
        """
        self.df = df
        self.k = k
        self.m = m
        self.dataset = self.df['value'].tolist()
        self.domain = self.df['value'].unique().tolist()
        self.N = len(self.dataset)

        # Creation of the sketch matrix
        self.M = np.zeros((self.k, self.m))

        # Definition of the hash family 3 by 3
        primes = list(primerange(10**6, 10**7))
        p = primes[random.randint(0, len(primes) - 1)]
        # generate_hash_functions is unpacked as a pair (functions, coefficients)
        # everywhere else in this package; storing the whole pair in self.H
        # would make self.H[i] something other than the i-th hash function.
        self.H, self.coefs = generate_hash_functions(self.k, p, 3, self.m)

    def client(self, d):
        """
        Simulates the client side of the sketch for a single element.

        Args:
            d (element): The element for which the sketch vector is generated.

        Returns:
            tuple: ``(v, j)`` where ``j`` is a uniformly chosen hash index and
            ``v`` is a length-``m`` vector of -1 with a single +1 at ``H[j](d)``.
        """
        j = random.randint(0, self.k - 1)
        v = np.full(self.m, -1)
        v[self.H[j](d)] = 1
        return v, j

    def update_sketch_matrix(self, d):
        """
        Updates the sketch matrix based on the given element.

        Every row ``i`` has the counter at column ``H[i](d)`` incremented by one.

        Args:
            d (element): The element to be used for updating the sketch matrix.
        """
        for row, hash_fn in enumerate(self.H[:self.k]):
            self.M[row, hash_fn(d)] += 1

    def estimate_client(self, d):
        """
        Estimates the frequency of an element based on the sketch matrix.

        Args:
            d (element): The element whose frequency is estimated.

        Returns:
            float: Mean over the ``k`` rows of the counter ``d`` hashes to.
        """
        total = 0
        for row in range(self.k):
            total += self.M[row, self.H[row](d)]
        return total / self.k

    def server_simulator(self):
        """
        Simulates the server side of the CMS by processing the dataset
        and estimating the frequencies of each element.

        Returns:
            dict: Maps every element of ``domain`` to its estimated frequency.
        """
        with Progress() as progress:
            bar = progress.add_task("[cyan]Processing client data...", total=len(self.dataset))

            for d in self.dataset:
                self.update_sketch_matrix(d)
                progress.update(bar, advance=1)

            F_estimated = {}
            bar = progress.add_task("[cyan]Obtaining histogram of estimated frequencies...", total=len(self.domain))
            for x in self.domain:
                F_estimated[x] = self.estimate_client(x)
                progress.update(bar, advance=1)
        return F_estimated
|
|
117
|
+
|
|
118
|
+
def run_cms_client_mean(k, m, df):
    """
    Builds a ``CMSClient``, runs its server simulation and displays the result.

    Args:
        k (int): Number of hash functions.
        m (int): Size of the sketch matrix.
        df (DataFrame): Dataset to be processed.

    Returns:
        Whatever ``display_results(df, estimates)`` returns for the estimated
        frequencies.
    """
    sketch = CMSClient(k, m, df)
    estimates = sketch.server_simulator()
    return display_results(df, estimates)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import numpy as np
|
|
3
|
+
from sympy import primerange
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from rich.progress import Progress
|
|
6
|
+
from numba import njit
|
|
7
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
8
|
+
|
|
9
|
+
from clip_protocol.utils.utils import generate_hash_functions
|
|
10
|
+
|
|
11
|
+
@njit
def bernoulli_vector(epsilon, m):
    """
    Draws ``m`` independent signs in {-1, +1}.

    Each entry is +1 with probability ``e^(epsilon/2) / (e^(epsilon/2) + 1)``
    and -1 otherwise.

    Args:
        epsilon (float): Privacy parameter.
        m (int): Number of signs to draw.

    Returns:
        numpy.ndarray: Length-``m`` array of -1 / +1 values.
    """
    keep_probability = (np.exp(epsilon / 2)) / ((np.exp(epsilon / 2)) + 1)
    draws = np.random.binomial(1, keep_probability, m)
    # Map {0, 1} onto {-1, +1}.
    return 2 * draws - 1
|
|
15
|
+
|
|
16
|
+
@njit
def update_sketch_matrix(M, v, j, epsilon, k, m):
    """
    Adds one privatized report to row ``j`` of the sketch matrix, in place.

    The report ``v`` is rescaled to ``k * (c_e / 2 * v + 1 / 2)`` with
    ``c_e = (e^(epsilon/2) + 1) / (e^(epsilon/2) - 1)`` before being added.

    Args:
        M (numpy.ndarray): Sketch matrix, modified in place.
        v (numpy.ndarray): Privatized vector sent by a client.
        j (int): Row (hash index) the report belongs to.
        epsilon (float): Privacy parameter.
        k (int): Number of hash functions.
        m (int): Number of columns to update.
    """
    c_e = (np.exp(epsilon / 2) + 1) / ((np.exp(epsilon / 2)) - 1)
    contribution = k * ((c_e / 2) * v + (1 / 2) * np.ones_like(v))
    target_row = M[j]  # view: writes land in M
    for col in range(m):
        target_row[col] = target_row[col] + contribution[col]
|
|
22
|
+
|
|
23
|
+
class privateCMSClient:
    """
    Client side of the private Count-Mean Sketch, with a local server simulation.

    Attributes:
        df: DataFrame whose ``value`` column is the dataset.
        epsilon: Privacy parameter.
        k: Number of hash functions (sketch rows).
        m: Number of buckets (sketch columns).
        dataset: ``df['value']`` as a list.
        domain: Unique values of ``df['value']``.
        N: Number of elements in ``dataset``.
        M: ``k x m`` sketch matrix, initialised to zeros.
        client_matrix: Privatized reports produced by the last ``execute_client``.
        H: Hash functions returned by ``generate_hash_functions``.
        coefs: Second value returned by ``generate_hash_functions``.
    """

    def __init__(self, epsilon, k, m, df):
        """Stores the parameters, allocates the sketch and draws the hash family."""
        self.df = df
        self.epsilon = epsilon
        self.k = k
        self.m = m

        values = self.df['value']
        self.dataset = values.tolist()
        self.domain = values.unique().tolist()
        self.N = len(self.dataset)

        # Sketch matrix and storage for the privatized reports.
        self.M = np.zeros((self.k, self.m))
        self.client_matrix = []

        # Hash family built over a random prime in [10**6, 10**7).
        candidate_primes = list(primerange(10**6, 10**7))
        prime = candidate_primes[random.randint(0, len(candidate_primes) - 1)]
        self.H, self.coefs = generate_hash_functions(self.k, prime, 3, self.m)

    def client(self, d):
        """
        Privatizes a single element.

        Args:
            d: Element to privatize.

        Returns:
            tuple: ``(v, j)`` where ``j`` is the randomly chosen hash index and
            ``v`` is the +/-1 encoding of ``d`` multiplied elementwise by the
            output of ``bernoulli_vector``.
        """
        j = random.randint(0, self.k - 1)
        encoded = np.full(self.m, -1)
        encoded[self.H[j](d)] = 1
        flips = bernoulli_vector(self.epsilon, self.m)
        return encoded * flips, j

    def estimate_client(self, d):
        """
        Estimates the frequency of ``d`` from the current sketch matrix.

        Returns:
            float: ``(m / (m - 1)) * (mean of the k counters of d - N / m)``.
        """
        sum_aux = 0
        for row, hash_fn in zip(range(self.k), self.H):
            sum_aux += self.M[row, hash_fn(d)]
        return (self.m / (self.m - 1)) * ((sum_aux / self.k) - (self.N / self.m))

    def execute_client(self):
        """
        Privatizes every element of the dataset using a thread pool.

        Returns:
            list: One ``(v, j)`` pair per dataset element, in dataset order.
        """
        reports = []
        with Progress() as progress:
            bar = progress.add_task("Processing client data", total=len(self.dataset))
            with ThreadPoolExecutor() as pool:
                for report in pool.map(self.client, self.dataset):
                    reports.append(report)
                    progress.update(bar, advance=1)
        self.client_matrix = reports
        return reports

    def server_simulator(self, privatized_data):
        """
        Aggregates the privatized reports and estimates every domain element.

        Args:
            privatized_data (list): ``(v, j)`` pairs as produced by ``client``.

        Returns:
            tuple: ``(estimates, coefs)`` where ``estimates`` maps each element
            of ``domain`` to its estimated frequency.
        """
        with Progress() as progress:
            bar = progress.add_task('Update sketch matrix', total=len(privatized_data))
            for report, row in privatized_data:
                update_sketch_matrix(self.M, report, row, self.epsilon, self.k, self.m)
                progress.update(bar, advance=1)

            bar = progress.add_task('Estimate frequencies', total=len(self.domain))
            F_estimated = {}
            for element in self.domain:
                F_estimated[element] = self.estimate_client(element)
                progress.update(bar, advance=1)

        return F_estimated, self.coefs
|
|
90
|
+
|
|
91
|
+
def run_private_cms_client(k, m, e, df):
    """
    Runs the private Count-Mean Sketch client and its local server simulation.

    Args:
        k (int): Number of hash functions.
        m (int): Size of the sketch matrix.
        e (float): Privacy parameter.
        df (DataFrame): Dataset to be processed.

    Returns:
        tuple: ``(coefs, privatized_data, df_estimated)`` - the hash
        coefficients kept by the client, the list of privatized reports, and a
        DataFrame with columns ``Element`` and ``Frequency`` holding the
        estimated frequencies.
    """
    sketch = privateCMSClient(e, k, m, df)

    # Client side first, then the simulated server on those same reports.
    privatized_data = sketch.execute_client()
    estimates, coefs = sketch.server_simulator(privatized_data)

    rows = list(estimates.items())
    df_estimated = pd.DataFrame(rows, columns=['Element', 'Frequency'])
    return coefs, privatized_data, df_estimated
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import os
|
|
5
|
+
from colorama import Fore, Style
|
|
6
|
+
from rich.progress import Progress
|
|
7
|
+
|
|
8
|
+
from clip_protocol.utils.utils import display_results
|
|
9
|
+
|
|
10
|
+
class privateCMSServer:
    """
    Server side of the Private Count-Mean Sketch (PCMS): accumulates privatized
    reports in a sketch matrix and answers frequency queries from it.

    Attributes:
        df (pandas.DataFrame): Dataset; its ``value`` column is read.
        epsilon (float): Privacy parameter.
        k (int): Number of hash functions.
        m (int): Size of the sketch.
        dataset (list): Values of ``df['value']``.
        domain (list): Unique values of ``df['value']``.
        N (int): Number of elements in ``dataset``.
        H (list): Hash functions, indexed by sketch row.
        M (numpy.ndarray): ``k x m`` sketch matrix.
    """

    def __init__(self, epsilon, k, m, df, H):
        """
        Stores the parameters and allocates an all-zero sketch matrix.

        Args:
            epsilon (float): Privacy parameter.
            k (int): Number of hash functions.
            m (int): Size of the sketch.
            df (pandas.DataFrame): Dataset containing a ``value`` column.
            H (list): Hash functions.
        """
        self.df = df
        self.epsilon = epsilon
        self.k = k
        self.m = m
        self.H = H

        values = self.df['value']
        self.dataset = values.tolist()
        self.domain = values.unique().tolist()
        self.N = len(self.dataset)

        self.M = np.zeros((self.k, self.m))

    def update_sketch_matrix(self, v, j):
        """
        Adds one privatized report to row ``j`` of the sketch matrix.

        Args:
            v (numpy.ndarray): The privatized vector.
            j (int): Index of the hash function (row) the report belongs to.
        """
        c_e = (np.exp(self.epsilon / 2) + 1) / ((np.exp(self.epsilon / 2)) - 1)
        contribution = self.k * ((c_e / 2) * v + (1 / 2) * np.ones_like(v))
        target_row = self.M[j]  # view: writes land in self.M
        for col in range(self.m):
            target_row[col] += contribution[col]

    def execute_server(self, privatized_data):
        """
        Feeds every report into the sketch, then estimates the whole domain.

        Args:
            privatized_data (list): Reports; item ``[0]`` is the vector and
                item ``[1]`` the hash index.

        Returns:
            dict: Estimated frequency for each element of ``domain``.
        """
        with Progress() as progress:
            task = progress.add_task('[cyan]Update sketch matrix', total=len(privatized_data))
            for report in privatized_data:
                self.update_sketch_matrix(report[0], report[1])
                progress.update(task, advance=1)

            F_estimated = {}
            task = progress.add_task('[cyan]Obtaining histogram of estimated frequencies', total=len(self.domain))
            for element in self.domain:
                F_estimated[element] = self.estimate_server(element)
                progress.update(task, advance=1)

        return F_estimated

    def estimate_server(self, d):
        """
        Estimates the frequency of ``d`` from the current sketch matrix.

        Args:
            d (any): Element whose frequency is estimated.

        Returns:
            float: ``(m / (m - 1)) * (mean of the k counters of d - N / m)``.
        """
        sum_aux = 0
        for row in range(self.k):
            sum_aux += self.M[row, self.H[row](d)]
        return (self.m / (self.m - 1)) * ((sum_aux / self.k) - (self.N / self.m))

    def query_server(self, query_element):
        """
        Returns the estimated frequency of ``query_element``.

        Args:
            query_element (any): Element to query.

        Returns:
            float or str: The estimate, or the string
            ``"Element not in the domain"`` when the element is unknown.
        """
        if query_element in self.domain:
            return self.estimate_server(query_element)
        return "Element not in the domain"
|
|
120
|
+
|
|
121
|
+
def run_private_cms_server(k, m, e, df, H, privatized_data):
    """
    Runs the server side of the Private Count-Mean Sketch and then answers
    interactive frequency queries read from standard input.

    Args:
        k (int): The number of hash functions.
        m (int): The size of the sketch.
        e (float): The privacy parameter epsilon.
        df (pandas.DataFrame): The dataset containing the values.
        H (list): The list of hash functions.
        privatized_data (list): The privatized data from the client.

    Returns:
        pandas.DataFrame: ``privatized_data`` wrapped in a DataFrame.
    """
    # Initialize the server Count-Mean Sketch
    server = privateCMSServer(e, k, m, df, H)

    # Save the privatized data
    privatized_data_save = pd.DataFrame(privatized_data)

    # Execute the server (the returned estimates are not needed here; the
    # interactive loop below queries the populated sketch directly).
    server.execute_server(privatized_data)

    # Query the server
    # NOTE(review): input() always yields str, so elements stored with another
    # type in the domain can never match - confirm the domain holds strings.
    while True:
        query = input("Enter a event to query the server or 'exit' to finish: ")
        if query.lower() == 'exit':
            break
        estimation = server.query_server(query)
        if isinstance(estimation, str):
            # query_server signals an unknown element with a message string;
            # applying the ':.2f' format to it would raise ValueError.
            print(estimation)
            continue
        print(f"The estimated frequency of {query} is {estimation:.2f}")

    return privatized_data_save
|
|
152
|
+
|
|
153
|
+
def run_private_cms_server_multiuser(k, m, private):
    """
    Runs the server side of the Private Count-Mean Sketch with one server
    instance per user, then answers interactive per-user queries read from
    standard input.

    Args:
        k (int): The number of hash functions.
        m (int): The size of the sketch.
        private (dict): Maps each user to a dict with the keys ``"e"``
            (privacy parameter), ``"privatized_data"`` (that user's reports)
            and ``"result"`` (passed to the server as its hash functions).

    Returns:
        dict: Maps each user to its populated ``privateCMSServer`` instance.

    Raises:
        ValueError: If the user typed at the prompt has no server.
    """
    user_servers = {}

    with Progress() as progress:
        task = progress.add_task("[cyan]Initializing servers...", total=len(private))

        for user, data in private.items():
            progress.update(task, advance=1, description=f"[cyan]Processing user {user}...")

            e = data["e"]
            privatized_data = data["privatized_data"]
            H = data["result"]

            # NOTE(review): privateCMSServer reads df['value']; confirm that
            # privatized_data converts to a frame that has that column.
            df = pd.DataFrame(privatized_data)

            # Initialize the server Count-Mean Sketch and populate its sketch.
            server = privateCMSServer(e, k, m, df, H)
            server.execute_server(privatized_data)

            user_servers[user] = server

    print(F"✅ {Fore.GREEN}All user servers initialized.{Style.RESET_ALL}")

    while True:
        user_query = input("Enter a user to query or 'exit' to finish: ")
        if user_query.lower() == 'exit':
            break
        if user_query not in user_servers:
            raise ValueError(f"❌ {Fore.RED}User '{user_query}' not found.{Style.RESET_ALL}")

        event_query = input(f"Enter an event for user {user_query}: ")
        estimation = user_servers[user_query].query_server(event_query)
        if isinstance(estimation, str):
            # query_server signals an unknown element with a message string;
            # applying the ':.2f' format to it would raise ValueError.
            print(estimation)
            continue
        print(f"The estimated frequency of '{event_query}' for user '{user_query}' is {estimation:.2f}")

    return user_servers
|
|
203
|
+
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This subpackage contains implementations of algorithms for calculating private count means
|
|
3
|
+
using the Hadamard Count-Mean-Sketch (HCMS) approach. The Hadamard transform is used to
|
|
4
|
+
efficiently encode and perturb data, ensuring differential privacy while maintaining data
|
|
5
|
+
utility. This subpackage includes both client-side and server-side implementations for
|
|
6
|
+
privacy-preserving data aggregation.
|
|
7
|
+
|
|
8
|
+
Modules:
|
|
9
|
+
- private_hcms_client.py: Contains the client-side logic for perturbing data using the Hadamard transform.
|
|
10
|
+
- private_hcms_server.py: Implements the server-side logic for aggregating and analyzing perturbed data.
|
|
11
|
+
|
|
12
|
+
Main Functions:
|
|
13
|
+
- execute_client: Simulates the client side of the privatized Hadamard Count-Mean Sketch for all elements in the dataset.
|
|
14
|
+
- server_simulator: Simulates the server side of the privatized Hadamard Count-Mean Sketch, processes the privatized data, and estimates frequencies.
|
|
15
|
+
- update_sketch_matrix: Updates the sketch matrix based on the privatized data received from the client.
|
|
16
|
+
- estimate_client: Estimates the frequency of an element based on the private CMS sketch matrix.
|
|
17
|
+
"""
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
from sympy import primerange
|
|
2
|
+
import random
|
|
3
|
+
import numpy as np
|
|
4
|
+
from rich.progress import Progress
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from numba import njit
|
|
7
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
8
|
+
|
|
9
|
+
from clip_protocol.utils.utils import generate_hash_functions
|
|
10
|
+
|
|
11
|
+
@njit
def hadamard_matrix(n):
    """
    Builds the ``n x n`` Hadamard matrix by repeated doubling.

    Starting from ``[[1]]``, each step replaces ``H`` with
    ``[[H, H], [H, -H]]`` until the requested size is reached.

    Args:
        n (int): Matrix size; must be a positive power of two.

    Returns:
        numpy.ndarray: ``n x n`` integer matrix with entries in {-1, +1}.

    Raises:
        ValueError: If ``n`` is not a positive power of two. The previous
            recursive version never terminated for ``n = 0`` and silently
            returned a smaller matrix for other sizes.
    """
    if n < 1 or (n & (n - 1)) != 0:
        raise ValueError("n must be a positive power of two")

    # Built iteratively with slice assignment so the function relies neither
    # on recursion nor on np.block being available in compiled mode.
    H = np.ones((1, 1), dtype=np.int64)
    size = 1
    while size < n:
        doubled = np.empty((2 * size, 2 * size), dtype=np.int64)
        doubled[:size, :size] = H
        doubled[:size, size:] = H
        doubled[size:, :size] = H
        doubled[size:, size:] = -H
        H = doubled
        size *= 2
    return H
|
|
18
|
+
|
|
19
|
+
@njit
def update_sketch_matrix(epsilon, k, M, w, j, l):
    """
    Adds one privatized Hadamard coefficient to the sketch matrix, in place.

    The client reports ``b * w`` where the sign ``b`` is +1 with probability
    ``e^epsilon / (e^epsilon + 1)``, so ``E[b] = (e^epsilon - 1) / (e^epsilon + 1)``.
    The debiasing constant must be the inverse of that expectation,
    ``c_e = (e^epsilon + 1) / (e^epsilon - 1)``. The previous ``epsilon / 2``
    form belongs to the non-Hadamard sketch and over-scaled every report.

    Args:
        epsilon (float): Privacy parameter used by the client.
        k (int): Number of hash functions.
        M (numpy.ndarray): Sketch matrix, modified in place.
        w (number): Privatized coefficient reported by the client.
        j (int): Row (hash index) of the report.
        l (int): Column (Hadamard coefficient index) of the report.
    """
    growth = np.exp(epsilon)
    c_e = (growth + 1) / (growth - 1)
    M[j, l] = M[j, l] + k * c_e * w
|
|
24
|
+
|
|
25
|
+
@njit
def traspose_M(M, H):
    """
    Returns the matrix product of ``M`` with the transpose of ``H``.

    Args:
        M (numpy.ndarray): Sketch matrix.
        H (numpy.ndarray): Hadamard matrix.

    Returns:
        numpy.ndarray: ``M @ H^T``.
    """
    h_transposed = np.transpose(H)
    return M @ h_transposed
|
|
28
|
+
|
|
29
|
+
class privateHCMSClient:
    """
    Client side of the private Hadamard Count-Mean Sketch, with a local
    server simulation.

    Attributes:
        df: DataFrame whose ``value`` column is the dataset.
        epsilon: Privacy parameter.
        k: Number of hash functions (sketch rows).
        m: Number of buckets (sketch columns / Hadamard size).
        dataset: ``df['value']`` as a list.
        domain: Unique values of ``df['value']``.
        H: ``m x m`` matrix returned by ``hadamard_matrix``.
        N: Number of elements in ``dataset``.
        M: ``k x m`` sketch matrix, initialised to zeros.
        client_matrix: Privatized reports produced by the last ``execute_client``.
        hashes: Hash functions returned by ``generate_hash_functions``.
        coeffs: Second value returned by ``generate_hash_functions``.
    """

    def __init__(self, epsilon, k, m, df):
        """Stores the parameters, allocates the sketch and draws the hash family."""
        self.df = df
        self.epsilon = epsilon
        self.k = k
        self.m = m
        self.dataset = self.df['value'].tolist()
        self.domain = self.df['value'].unique().tolist()
        self.H = hadamard_matrix(self.m)
        self.N = len(self.dataset)

        # Creation of the sketch matrix
        self.M = np.zeros((self.k, self.m))

        # List to store the privatized matrices
        self.client_matrix = []

        # Definition of the hash family 3 by 3
        primes = list(primerange(10**6, 10**7))
        p = primes[random.randint(0, len(primes) - 1)]
        self.hashes, self.coeffs = generate_hash_functions(self.k, p, 3, self.m)

    def client(self, d):
        """
        Privatizes a single element.

        Args:
            d: Element to privatize.

        Returns:
            tuple: ``(value, j, l)`` where ``j`` is the chosen hash index,
            ``l`` the chosen Hadamard coefficient index and ``value`` is
            ``H[l, hashes[j](d)]`` with its sign kept with probability
            ``e^epsilon / (e^epsilon + 1)`` and flipped otherwise.
        """
        j = random.randint(0, self.k - 1)
        bucket = self.hashes[j](d)
        l = random.randint(0, self.m - 1)

        # H @ one_hot(bucket) is just column `bucket` of H, so the sampled
        # coefficient can be read directly instead of computing the full
        # O(m^2) matrix-vector product for every element.
        coefficient = self.H[l, bucket]

        P_active = np.exp(self.epsilon) / (np.exp(self.epsilon) + 1)
        b = 1 if random.random() <= P_active else -1
        return b * coefficient, j, l

    def estimate_client(self, d):
        """
        Estimates the frequency of ``d`` from the current sketch matrix.

        Returns:
            float: ``(m / (m - 1)) * (mean of the k counters of d - N / m)``.
        """
        return (self.m / (self.m-1)) * (1/self.k * np.sum([self.M[i,self.hashes[i](d)] for i in range(self.k)]) - self.N/self.m)

    def execute_client(self):
        """
        Privatizes every element of the dataset using a thread pool.

        Returns:
            list: One ``(value, j, l)`` triple per dataset element, in
            dataset order.
        """
        privatized_data = []

        with Progress() as progress:
            task = progress.add_task('Processing client data', total=len(self.dataset))
            with ThreadPoolExecutor() as executor:
                for result in executor.map(self.client, self.dataset):
                    privatized_data.append(result)
                    progress.update(task, advance=1)
        self.client_matrix = privatized_data
        return privatized_data

    def server_simulator(self, privatized_data):
        """
        Aggregates the privatized reports and estimates every domain element.

        After all reports are added, ``self.M`` is replaced by
        ``traspose_M(self.M, self.H)``; the estimates are read from that
        transformed matrix.

        Args:
            privatized_data (list): ``(value, j, l)`` triples as produced by
                ``client``.

        Returns:
            tuple: ``(estimates, coeffs)`` where ``estimates`` maps each
            element of ``domain`` to its estimated frequency.
        """
        with Progress() as progress:
            task = progress.add_task('[cyan]Update sketch matrix', total=len(privatized_data))
            for value, row, column in privatized_data:
                update_sketch_matrix(self.epsilon, self.k, self.M, value, row, column)
                progress.update(task, advance=1)

            # Map the accumulated Hadamard coefficients back to bucket space.
            self.M = traspose_M(self.M, self.H)

            # Estimate the frequencies
            F_estimated = {}
            task = progress.add_task('[cyan]Obtaining histogram of estimated frequencies', total=len(self.domain))
            for x in self.domain:
                F_estimated[x] = self.estimate_client(x)
                progress.update(task, advance=1)
        return F_estimated, self.coeffs
|
|
100
|
+
|
|
101
|
+
def run_private_hcms_client(k, m, e, df):
    """
    Runs the private Hadamard Count-Mean Sketch client and its local server
    simulation.

    Args:
        k (int): The number of hash functions.
        m (int): The size of the sketch matrix.
        e (float): The privacy parameter epsilon.
        df (pandas.DataFrame): The dataset in DataFrame format.

    Returns:
        tuple: ``(coeffs, privatized_data, df_estimated)`` - the hash
        coefficients kept by the client, the list of privatized reports, and a
        DataFrame with columns ``Element`` and ``Frequency`` holding the
        estimated frequencies.
    """
    hcms = privateHCMSClient(e, k, m, df)

    # Client side first, then the simulated server on those same reports.
    privatized_data = hcms.execute_client()
    estimates, coeffs = hcms.server_simulator(privatized_data)

    rows = list(estimates.items())
    df_estimated = pd.DataFrame(rows, columns=['Element', 'Frequency'])
    return coeffs, privatized_data, df_estimated
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
|