pyaerial 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aerial/__init__.py +29 -0
- aerial/__pycache__/__init__.cpython-310.pyc +0 -0
- aerial/__pycache__/data_preparation.cpython-310.pyc +0 -0
- aerial/__pycache__/discretization.cpython-310.pyc +0 -0
- aerial/__pycache__/model.cpython-310.pyc +0 -0
- aerial/__pycache__/rule_extraction.cpython-310.pyc +0 -0
- aerial/__pycache__/rule_quality.cpython-310.pyc +0 -0
- aerial/__pycache__/table.cpython-310.pyc +0 -0
- aerial/data_preparation.py +74 -0
- aerial/discretization.py +56 -0
- aerial/model.py +179 -0
- aerial/rule_extraction.py +243 -0
- aerial/rule_quality.py +195 -0
- aerial/table.py +23 -0
- pyaerial-0.1.0.dist-info/METADATA +547 -0
- pyaerial-0.1.0.dist-info/RECORD +19 -0
- pyaerial-0.1.0.dist-info/WHEEL +5 -0
- pyaerial-0.1.0.dist-info/licenses/LICENSE +21 -0
- pyaerial-0.1.0.dist-info/top_level.txt +1 -0
aerial/__init__.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from . import discretization, rule_quality, model
|
|
3
|
+
from aerial.rule_extraction import generate_rules, generate_frequent_itemsets
|
|
4
|
+
|
|
5
|
+
# Public API of the package. ``__all__`` must contain *names* (strings), not the
# objects themselves; otherwise ``from aerial import *`` raises a TypeError.
__all__ = ["discretization", "rule_quality", "model", "generate_rules", "generate_frequent_itemsets"]

# Create a package-wide logger.
# The NullHandler keeps the library silent until the application (or
# ``setup_logging``) attaches a real handler.
logger = logging.getLogger("aerial")
logger.propagate = True
logger.addHandler(logging.NullHandler())
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def setup_logging(level=logging.INFO, propagate=True):
    """
    Configure the package-wide logger.

    Every handler currently attached to the package logger is removed. Unless ``level`` is
    ``logging.NOTSET``, a single console (stream) handler with a timestamped format is attached.

    :param level: logging level applied to the package logger
    :param propagate: whether records are also passed on to ancestor loggers
    """
    logger.propagate = propagate
    logger.setLevel(level)

    # Detach whatever was attached before; iterate over a snapshot because the list is mutated
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)

    # NOTSET means "no console output requested": leave the logger without handlers
    if level == logging.NOTSET:
        return

    console = logging.StreamHandler()
    console.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(console)
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright (c) [2025] [Erkan Karabulut - DiTEC Project]
|
|
3
|
+
|
|
4
|
+
This script implements data preparation functions for tabular data for association rule mining with Aerial
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import concurrent
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
12
|
+
|
|
13
|
+
from aerial.table import get_unique_values_per_column
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _one_hot_encoding_with_feature_tracking(transactions: pd.DataFrame, parallel_workers=1):
    """
    One-hot encode a table and record which slice of the encoded vector belongs to which column.

    Note: the column names of ``transactions`` are rewritten in place ("__" becomes "--").

    :param transactions: pandas DataFrame of transactions
    :param parallel_workers: maximum number of threads used to encode the rows
    :return: a tuple of 2 objects
        vector_list: DataFrame with one row per transaction and one 0/1 column per "{column}__{value}" item,
        feature_value_indices: list of {'feature', 'start', 'end'} dicts giving each column's
            half-open index range inside the encoded vector
    """
    # Aerial uses "__" to separate column names and their values when one-hot encoding, {COL_NAME}__{value}
    # therefore, replace all "__" in column names with "--" to avoid later confusion in naming
    transactions.columns = [name.replace('__', '--') for name in transactions.columns]
    column_names = transactions.columns.tolist()

    unique_values, value_count = get_unique_values_per_column(transactions)

    # Lay the features out one after another and remember where each one starts and ends
    feature_value_indices = []
    vector_tracker = []
    offset = 0
    for feature, values in unique_values.items():
        upper = offset + len(values)
        feature_value_indices.append({'feature': feature, 'start': offset, 'end': upper})
        for value in values:
            vector_tracker.append(f"{feature}__{value}")
        offset = upper

    # "{column}__{value}" -> position in the encoded vector, for constant-time lookups
    position_of = dict(zip(vector_tracker, range(len(vector_tracker))))

    # Preallocated result matrix, filled row by row as workers finish
    encoded = np.zeros((len(transactions), value_count), dtype=int)

    def encode_row(row_number, row):
        """Build the 0/1 vector of a single transaction; missing values leave their feature all-zero."""
        row_vector = np.zeros(value_count, dtype=int)
        for column_name, value in zip(column_names, row):
            if pd.isna(value):
                continue
            row_vector[position_of[f"{column_name}__{value}"]] = 1
        return row_number, row_vector

    # Rows are independent of each other, so they are encoded by a thread pool
    with ThreadPoolExecutor(max_workers=parallel_workers) as pool:
        pending = [
            pool.submit(encode_row, row_number, row)
            for row_number, row in enumerate(transactions.itertuples(index=False))
        ]

        # Completion order is arbitrary; the returned row number says where the vector belongs
        for finished in concurrent.futures.as_completed(pending):
            row_number, row_vector = finished.result()
            encoded[row_number] = row_vector

    return pd.DataFrame(encoded, columns=vector_tracker), feature_value_indices
|
aerial/discretization.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright (c) [2025] [Erkan Karabulut - DiTEC Project]
|
|
3
|
+
|
|
4
|
+
This script includes different discretization methods for tabular data
|
|
5
|
+
"""
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger("aerial")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def equal_frequency_discretization(df: pd.DataFrame, n_bins=10):
    """
    Detect numerical columns automatically and discretize them into n_bins intervals based on equal frequency.
    Intervals are represented as strings. Missing values stay missing (NaN) instead of being turned into
    the literal string 'nan'. Non-numerical columns are returned unchanged and the input frame is not modified.

    :param df: tabular data in pandas DataFrame form
    :param n_bins: number of intervals (bins); duplicate bin edges are dropped, so fewer bins may result
    :return: a copy of df with discrete columns
    """
    df_discretized = df.copy()
    num_cols = df.select_dtypes(include=[np.number]).columns

    for col in num_cols:
        try:
            binned = pd.qcut(df[col], q=n_bins, duplicates='drop')
        except ValueError:
            # Best effort: a column that cannot be binned is left as it is
            logger.debug("Column '%s' could not be discretized due to insufficient unique values.", col)
            continue

        # astype(str) renders missing entries as 'nan'; mask them back to real missing values
        # so that downstream code can still recognise them with pd.isna
        labels = binned.astype(str)
        df_discretized[col] = labels.where(binned.notna())

    return df_discretized
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def equal_width_discretization(df: pd.DataFrame, n_bins=10):
    """
    Detect numerical columns automatically and discretize them into n_bins intervals based on equal width.
    Intervals are represented as strings. Missing values stay missing (NaN) instead of being turned into
    the literal string 'nan'. Non-numerical columns are returned unchanged and the input frame is not modified.

    :param df: tabular data in pandas DataFrame form
    :param n_bins: number of intervals (bins)
    :return: a copy of df with discrete columns
    """
    df_discretized = df.copy()
    num_cols = df.select_dtypes(include=[np.number]).columns

    for col in num_cols:
        try:
            binned = pd.cut(df[col], bins=n_bins)
        except ValueError:
            # Best effort: a column that cannot be binned is left as it is
            logger.debug("Column '%s' could not be discretized due to insufficient unique values.", col)
            continue

        # astype(str) renders missing entries as 'nan'; mask them back to real missing values
        # so that downstream code can still recognise them with pd.isna
        labels = binned.astype(str)
        df_discretized[col] = labels.where(binned.notna())

    return df_discretized
|
aerial/model.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright (c) [2025] [Erkan Karabulut - DiTEC Project]
|
|
3
|
+
|
|
4
|
+
Construct an Autoencoder for association rule mining as described in the paper (Neurosymbolic association rule mining
|
|
5
|
+
from tabular data - https://arxiv.org/abs/2504.19354)
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
import logging
|
|
10
|
+
import torch
|
|
11
|
+
import pandas as pd
|
|
12
|
+
from torch import nn
|
|
13
|
+
import math
|
|
14
|
+
import torch.nn.functional as F
|
|
15
|
+
|
|
16
|
+
from torch.utils.data import TensorDataset, DataLoader
|
|
17
|
+
|
|
18
|
+
from aerial.data_preparation import _one_hot_encoding_with_feature_tracking
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger("aerial")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class AutoEncoder(nn.Module):
    """
    This autoencoder is used to create a neural representation of tabular data for association rule mining
    """

    def __init__(self, input_dimension, feature_count, layer_dims: list = None):
        """
        The init function can either construct an under-complete Autoencoder based on the input dimension and feature
        count, automatically deciding the number of layers and layer dimensions.
        Or, if specified by the user, it can also use the layer counts and dimensions from the user.
        Note that fine-tuning layer count and dimensions based on your table dimension and size
        will result in better performance in general

        :param input_dimension: number of features after one-hot encoding (input dimension)
        :param feature_count: target feature count (initial column count of tabular data)
        :param layer_dims: (optional) sequence of int, specific dimensions for hidden layers
            (excluding the input dimension); any sequence (list, tuple, ...) is accepted
        :raises ValueError: if layer_dims is not given and input_dimension is smaller than 1
        """
        super().__init__()

        self.input_dimension = input_dimension
        self.feature_count = feature_count
        # Bookkeeping attributes; not set by the constructor, expected to be filled in by the caller
        self.input_vectors = None
        self.feature_value_indices = None
        self.feature_values = None

        # Determine the layer dimensions
        if layer_dims is None:
            dimensions = self._default_dimensions(input_dimension, feature_count)
        else:
            # Use provided layer dimensions, prefixed with the input dimension
            dimensions = [input_dimension] + list(layer_dims)

        self.dimensions = dimensions  # save for inspection

        # The decoder mirrors the encoder. Layers are created encoder-first so that the
        # Sequential indices (and therefore state_dict keys) match previously saved models.
        self.encoder = self._build_stack(dimensions)
        self.decoder = self._build_stack(dimensions[::-1])

        self.encoder.apply(self.init_weights)
        self.decoder.apply(self.init_weights)

    @staticmethod
    def _default_dimensions(input_dimension, feature_count):
        """Derive layer sizes that shrink from input_dimension to feature_count with a constant ratio."""
        if input_dimension < 1:
            raise ValueError("input_dimension must be a positive integer, got %r" % (input_dimension,))

        # Compute default number of layers based on log base 16
        layer_count = max(1, math.ceil(math.log(input_dimension, 16)) - 1)

        # Calculate dimensions with consistent reduction ratio
        reduction_ratio = (feature_count / input_dimension) ** (1 / layer_count)
        dimensions = [input_dimension]
        for _ in range(1, layer_count):
            dimensions.append(max(feature_count, int(dimensions[-1] * reduction_ratio)))
        dimensions.append(feature_count)
        return dimensions

    @staticmethod
    def _build_stack(dimensions):
        """Chain Linear layers through the given sizes with Tanh in between (none after the last layer)."""
        layers = []
        last_layer = len(dimensions) - 2
        for i in range(len(dimensions) - 1):
            layers.append(nn.Linear(dimensions[i], dimensions[i + 1]))
            if i != last_layer:
                layers.append(nn.Tanh())
        return nn.Sequential(*layers)

    @staticmethod
    def init_weights(m):
        """
        all weights are initialized with values sampled from uniform distributions with the Xavier initialization
        and the biases are set to 0, as described in the paper by Delong et al. (2023)
        """
        if isinstance(m, nn.Linear):
            torch.nn.init.xavier_uniform_(m.weight)
            m.bias.data.zero_()

    def save(self, name):
        """
        Write the encoder and decoder weights to "<name>_encoder.pt" and "<name>_decoder.pt".

        :param name: path prefix of the two files
        """
        torch.save(self.encoder.state_dict(), name + "_encoder.pt")
        torch.save(self.decoder.state_dict(), name + '_decoder.pt')

    def load(self, name):
        """
        Load weights previously written by ``save`` and switch encoder and decoder to eval mode.

        :param name: path prefix of the two files
        :return: True if both files exist and were loaded, False if at least one of them is missing
        """
        encoder_path = name + '_encoder.pt'
        decoder_path = name + '_decoder.pt'
        if not (os.path.isfile(encoder_path) and os.path.isfile(decoder_path)):
            return False

        # map_location="cpu": checkpoints written on a GPU machine stay loadable without one
        # (load_state_dict copies the values onto the parameters' own device).
        # weights_only=True: only tensor data is unpickled, never arbitrary objects from the file.
        self.encoder.load_state_dict(torch.load(encoder_path, map_location="cpu", weights_only=True))
        self.decoder.load_state_dict(torch.load(decoder_path, map_location="cpu", weights_only=True))
        self.encoder.eval()
        self.decoder.eval()
        return True

    def forward(self, x, feature_value_indices):
        """
        Reconstruct ``x`` and normalise the output separately for every feature.

        :param x: 2-D batch of input vectors (rows are samples)
        :param feature_value_indices: iterable of (start, end) column ranges, one per feature
        :return: softmax output of each range, concatenated in the given order; columns that are
            not covered by any range are not part of the result
        """
        y = self.encoder(x)
        y = self.decoder(y)

        # Softmax is applied per feature range, then the ranges are stitched back together
        softmax_chunks = [F.softmax(y[:, start:end], dim=1) for start, end in feature_value_indices]
        return torch.cat(softmax_chunks, dim=1)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def train(transactions: pd.DataFrame, autoencoder: AutoEncoder = None, noise_factor=0.5,
          lr=5e-3, epochs=1, batch_size=2, loss_function=torch.nn.BCELoss(), num_workers=1, layer_dims: list = None):
    """
    Train an autoencoder for association rule mining on the given table.

    :param transactions: tabular data; it is one-hot encoded before training
    :param autoencoder: (optional) model to train; a new AutoEncoder is created when omitted
    :param noise_factor: scale of the Gaussian noise added to each batch before reconstruction
    :param lr: learning rate of the Adam optimizer
    :param epochs: number of passes over the data
    :param batch_size: number of rows per batch
    :param loss_function: loss applied to each feature's slice of the reconstruction
    :param num_workers: worker count used for both the one-hot encoding and the DataLoader
    :param layer_dims: (optional) hidden layer dimensions for a newly created AutoEncoder
    :return: the trained model as returned by torch.compile, carrying the one-hot encoded table
        and the feature bookkeeping as attributes
    """
    encoded_table, feature_value_indices = _one_hot_encoding_with_feature_tracking(transactions, num_workers)
    item_names = encoded_table.columns.tolist()

    if not autoencoder:
        autoencoder = AutoEncoder(
            input_dimension=len(item_names),
            feature_count=len(feature_value_indices),
            layer_dims=layer_dims,
        )

    # Attached before compilation, i.e. on the plain module
    autoencoder.input_vectors = encoded_table

    autoencoder = torch.compile(autoencoder)
    autoencoder.feature_value_indices = feature_value_indices
    autoencoder.feature_values = item_names

    optimizer = torch.optim.Adam(autoencoder.parameters(), lr=lr, weight_decay=2e-8)

    training_tensor = torch.tensor(encoded_table.to_numpy(), dtype=torch.float32)
    loader = DataLoader(TensorDataset(training_tensor), batch_size=batch_size, shuffle=True,
                        num_workers=num_workers)

    # (start, end) column range of every feature inside the one-hot vector
    softmax_ranges = [(entry['start'], entry['end']) for entry in feature_value_indices]

    for _ in range(epochs):
        for (clean_batch,) in loader:
            # Denoising setup: corrupt the input, but compare against the clean batch
            noise = torch.randn_like(clean_batch) * noise_factor
            noisy_batch = (clean_batch + noise).clamp(0, 1)

            reconstructed_batch = autoencoder(noisy_batch, softmax_ranges)

            # One loss term per feature, summed over all features
            per_feature_losses = (
                loss_function(reconstructed_batch[:, lo:hi], clean_batch[:, lo:hi])
                for (lo, hi) in softmax_ranges
            )
            total_loss = sum(per_feature_losses)

            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

    return autoencoder
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright (c) [2025] [Erkan Karabulut - DiTEC Project]
|
|
3
|
+
|
|
4
|
+
Includes the Aerial algorithm's source code for association rule (and frequent itemsets) extraction from a
|
|
5
|
+
trained Autoencoder (Neurosymbolic association rule mining from tabular data - https://arxiv.org/abs/2504.19354)
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import torch
|
|
9
|
+
|
|
10
|
+
from itertools import combinations
|
|
11
|
+
|
|
12
|
+
from aerial.model import AutoEncoder
|
|
13
|
+
import numpy as np
|
|
14
|
+
import logging
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger("aerial")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def generate_rules(autoencoder: AutoEncoder, ant_similarity=0.5, cons_similarity=0.8, max_antecedents=2,
                   target_class=None):
    """
    extract rules from a trained Autoencoder using Aerial+ algorithm

    :param autoencoder: trained Autoencoder carrying ``feature_value_indices`` and ``feature_values``
    :param ant_similarity: every marked antecedent must be reconstructed with a probability above this value,
        otherwise the candidate is discarded
    :param cons_similarity: an item becomes a consequent if its output probability is at least this value
    :param max_antecedents: max number of antecedents that the rules will contain
    :param target_class: if given a target class, generate rules with the target class on the right hand side only;
        if no feature has this name a warning is logged and all features are considered
    :return: list of {'antecedents': [...], 'consequent': ...} dicts, or None if no autoencoder is given
    """
    if not autoencoder:
        logger.error("A trained Autoencoder has to be provided before generating rules.")
        return None

    logger.debug("Extracting association rules from the given trained Autoencoder ...")

    association_rules = []
    input_vector_size = autoencoder.encoder[0].in_features

    feature_value_indices = autoencoder.feature_value_indices
    target_range = range(input_vector_size)

    # If target_class is specified, narrow the target range and features
    # this is to do "constraint-based rule mining"
    if target_class:
        for feature in feature_value_indices:
            if feature["feature"] == target_class:
                target_range = range(feature["start"], feature["end"])
                break
        else:
            # Do not fail, but do not hide the fact that the constraint could not be applied either
            logger.warning("Target class '%s' does not match any feature; generating rules for all features.",
                           target_class)

    low_support_antecedents = np.array([])

    # Initialize input vectors
    unmarked_features = _initialize_input_vectors(input_vector_size, feature_value_indices)

    # Precompute target indices for softmax to speed things up
    feature_value_indices = [(cat['start'], cat['end']) for cat in feature_value_indices]
    softmax_ranges = feature_value_indices

    for r in range(1, max_antecedents + 1):
        if r == 2:
            # Low-support items are only collected while r == 1, so pruning once here is enough.
            # A feature is dropped when every one of its values turned out to be low-support.
            low_support = set(low_support_antecedents.tolist())
            softmax_ranges = [
                (start, end) for (start, end) in softmax_ranges
                if not all(idx in low_support for idx in range(start, end))
            ]

        # Vectorized model evaluation batch
        batch_vectors = []
        batch_candidate_antecedent_list = []

        for category_list in combinations(softmax_ranges, r):
            test_vectors, candidate_antecedent_list = _mark_features(unmarked_features, list(category_list),
                                                                     low_support_antecedents)
            if len(test_vectors) > 0:
                batch_vectors.extend(test_vectors)
                batch_candidate_antecedent_list.extend(candidate_antecedent_list)

        if not batch_vectors:
            continue

        batch_tensor = torch.tensor(np.array(batch_vectors), dtype=torch.float32)
        # Perform a single model evaluation for the batch; no gradients are needed for inference
        with torch.no_grad():
            implications_batch = autoencoder(batch_tensor, feature_value_indices).detach().numpy()

        for implication_probabilities, candidate_antecedents \
                in zip(implications_batch, batch_candidate_antecedent_list):
            if len(candidate_antecedents) == 0:
                continue

            # Identify low-support antecedents
            if any(implication_probabilities[ant] <= ant_similarity for ant in candidate_antecedents):
                if r == 1:
                    low_support_antecedents = np.append(low_support_antecedents, candidate_antecedents)
                continue

            # Identify high-support consequents (an antecedent can not be its own consequent)
            antecedent_indices = {int(ant) for ant in candidate_antecedents}
            consequent_list = [
                prob_index for prob_index in target_range
                if prob_index not in antecedent_indices and
                implication_probabilities[prob_index] >= cons_similarity
            ]

            if consequent_list:
                new_rule = _get_rule(candidate_antecedents, consequent_list, autoencoder.feature_values)
                # One rule per consequent, all sharing the same antecedents
                for consequent in new_rule['consequents']:
                    association_rules.append({'antecedents': new_rule['antecedents'], 'consequent': consequent})

    logger.debug("%d association rules extracted.", len(association_rules))
    return association_rules
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def generate_frequent_itemsets(autoencoder: AutoEncoder, similarity=0.5, max_length=2):
    """
    Generate frequent itemsets using the Aerial+ algorithm.

    :param autoencoder: trained Autoencoder carrying ``feature_value_indices`` and ``feature_values``
    :param similarity: every marked item must be reconstructed with a probability above this value
        for the itemset to be kept
    :param max_length: max number of items per itemset
    :return: list of itemsets, each a list of feature value names, or None if no autoencoder is given
    """
    if not autoencoder:
        logger.error("A trained Autoencoder has to be provided before extracting frequent items.")
        return None

    logger.debug("Extracting frequent items from the given trained Autoencoder ...")

    frequent_itemsets = []
    input_vector_size = len(autoencoder.feature_values)

    low_support_antecedents = np.array([])

    feature_value_indices = autoencoder.feature_value_indices

    # Initialize input vectors once
    unmarked_features = _initialize_input_vectors(input_vector_size, feature_value_indices)

    # Precompute target indices for softmax
    feature_value_indices = [(cat['start'], cat['end']) for cat in feature_value_indices]
    softmax_ranges = feature_value_indices

    # Iteratively process combinations of increasing size
    for r in range(1, max_length + 1):
        # Drop features whose values were all found to be low-support
        low_support = set(low_support_antecedents.tolist())
        softmax_ranges = [
            (start, end) for (start, end) in softmax_ranges
            if not all(idx in low_support for idx in range(start, end))
        ]

        # Vectorized model evaluation batch
        batch_vectors = []
        batch_candidate_antecedent_list = []

        for category_list in combinations(softmax_ranges, r):
            test_vectors, candidate_antecedent_list = _mark_features(unmarked_features, list(category_list),
                                                                     low_support_antecedents)
            if len(test_vectors) > 0:
                batch_vectors.extend(test_vectors)
                batch_candidate_antecedent_list.extend(candidate_antecedent_list)

        if not batch_vectors:
            continue

        batch_tensor = torch.tensor(np.array(batch_vectors), dtype=torch.float32)
        # Perform a single model evaluation for the batch; no gradients are needed for inference
        with torch.no_grad():
            implications_batch = autoencoder(batch_tensor, feature_value_indices).detach().numpy()

        for implication_probabilities, candidate_antecedents \
                in zip(implications_batch, batch_candidate_antecedent_list):
            if len(candidate_antecedents) == 0:
                continue

            # Identify low-support antecedents
            if any(implication_probabilities[ant] <= similarity for ant in candidate_antecedents):
                if r == 1:
                    low_support_antecedents = np.append(low_support_antecedents, candidate_antecedents)
                continue

            # Add to frequent itemsets
            frequent_itemsets.append(
                [autoencoder.feature_values[idx] for idx in candidate_antecedents]
            )

    logger.debug("%d frequent itemsets extracted.", len(frequent_itemsets))
    return frequent_itemsets
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _mark_features(unmarked_test_vector, features, low_support_antecedents):
|
|
177
|
+
"""
|
|
178
|
+
Create a list of test vectors by marking the given features in the unmarked test vector.
|
|
179
|
+
This optimized version processes features in bulk using NumPy operations.
|
|
180
|
+
"""
|
|
181
|
+
input_vector_size = unmarked_test_vector.shape[0]
|
|
182
|
+
|
|
183
|
+
# Compute valid feature ranges excluding low_support_antecedents
|
|
184
|
+
feature_ranges = [
|
|
185
|
+
np.setdiff1d(np.arange(start, end), low_support_antecedents)
|
|
186
|
+
for (start, end) in features
|
|
187
|
+
]
|
|
188
|
+
|
|
189
|
+
# Create all combinations of feature indices
|
|
190
|
+
combinations = np.array(np.meshgrid(*feature_ranges)).T.reshape(-1, len(features))
|
|
191
|
+
|
|
192
|
+
# Initialize test_vectors and candidate_antecedents
|
|
193
|
+
n_combinations = combinations.shape[0]
|
|
194
|
+
test_vectors = np.tile(unmarked_test_vector, (n_combinations, 1))
|
|
195
|
+
candidate_antecedents = [[] for _ in range(n_combinations)]
|
|
196
|
+
|
|
197
|
+
# Vectorized marking of test_vectors
|
|
198
|
+
for i, (start, end) in enumerate(features):
|
|
199
|
+
# Get the feature range
|
|
200
|
+
valid_indices = combinations[:, i]
|
|
201
|
+
|
|
202
|
+
# Ensure indices are within bounds
|
|
203
|
+
valid_indices = valid_indices[(valid_indices >= 0) & (valid_indices < input_vector_size)]
|
|
204
|
+
|
|
205
|
+
# Mark test_vectors based on valid indices for the current feature
|
|
206
|
+
for j, idx in enumerate(valid_indices):
|
|
207
|
+
test_vectors[j, start:end] = 0 # Set feature range to 0
|
|
208
|
+
test_vectors[j, idx] = 1 # Mark the valid index with 1
|
|
209
|
+
candidate_antecedents[j].append(idx) # Append the index to the j-th test vector's antecedents
|
|
210
|
+
|
|
211
|
+
# Convert lists of candidate_antecedents to numpy arrays
|
|
212
|
+
candidate_antecedents = [np.array(lst) for lst in candidate_antecedents]
|
|
213
|
+
return test_vectors, candidate_antecedents
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _initialize_input_vectors(input_vector_size, categories):
|
|
217
|
+
"""
|
|
218
|
+
Initialize the input vectors with equal probabilities for each feature range.
|
|
219
|
+
"""
|
|
220
|
+
vector_with_unmarked_features = np.zeros(input_vector_size)
|
|
221
|
+
for category in categories:
|
|
222
|
+
vector_with_unmarked_features[category['start']:category['end']] = 1 / (
|
|
223
|
+
category['end'] - category['start'])
|
|
224
|
+
return vector_with_unmarked_features
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _get_rule(antecedents, consequents, feature_values):
|
|
228
|
+
"""
|
|
229
|
+
Find the corresponding feature value for the given antecedents and consequent that are indices in test vectors
|
|
230
|
+
:param antecedents: a list of indices in the test vectors marking the antecedent locations
|
|
231
|
+
:param consequents: an index in the test vector marking the consequent location
|
|
232
|
+
:param feature_values: a list of string that keeps track of which neuron in the Autoencoder input corresponds
|
|
233
|
+
to which feature value in the tabular data
|
|
234
|
+
:return:
|
|
235
|
+
"""
|
|
236
|
+
rule = {'antecedents': [], 'consequents': []}
|
|
237
|
+
for antecedent in antecedents:
|
|
238
|
+
rule['antecedents'].append(feature_values[antecedent])
|
|
239
|
+
|
|
240
|
+
for consequent in consequents:
|
|
241
|
+
rule['consequents'].append(feature_values[consequent])
|
|
242
|
+
|
|
243
|
+
return rule
|