itertoolkit 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bm_preprocessing/__init__.py +14 -0
- bm_preprocessing/importer/DM/__init__.py +7 -0
- bm_preprocessing/importer/DM/agg.py +6 -0
- bm_preprocessing/importer/DM/dbscan.py +6 -0
- bm_preprocessing/importer/DM/finals.py +6 -0
- bm_preprocessing/importer/DM/gsp.py +6 -0
- bm_preprocessing/importer/DM/test.py +6 -0
- bm_preprocessing/importer/Finals/__init__.py +7 -0
- bm_preprocessing/importer/Finals/kaadhal.py +6 -0
- bm_preprocessing/importer/Finals/raaka.py +6 -0
- bm_preprocessing/importer/Finals/seedan.py +6 -0
- bm_preprocessing/importer/Finals/vikram.py +6 -0
- bm_preprocessing/importer/IR/__init__.py +6 -0
- bm_preprocessing/importer/IR/finals.py +6 -0
- bm_preprocessing/importer/IR/pagerank.py +6 -0
- bm_preprocessing/importer/IR/recommenders_pca.py +8 -0
- bm_preprocessing/importer/IR/test.py +6 -0
- bm_preprocessing/importer/PY/__init__.py +4 -0
- bm_preprocessing/importer/PY/lib_doc.py +6 -0
- bm_preprocessing/importer/PY/python_doc.py +6 -0
- bm_preprocessing/importer/__init__.py +8 -0
- bm_preprocessing/importer/_module_printer.py +23 -0
- bm_preprocessing/src/DM/__init__.py +1 -0
- bm_preprocessing/src/DM/agg.py +267 -0
- bm_preprocessing/src/DM/dbscan.py +218 -0
- bm_preprocessing/src/DM/finals.py +19 -0
- bm_preprocessing/src/DM/gsp.py +378 -0
- bm_preprocessing/src/DM/test.py +19 -0
- bm_preprocessing/src/Finals/__init__.py +1 -0
- bm_preprocessing/src/Finals/kaadhal.py +1453 -0
- bm_preprocessing/src/Finals/raaka.py +1338 -0
- bm_preprocessing/src/Finals/seedan.py +1173 -0
- bm_preprocessing/src/Finals/vikram.py +520 -0
- bm_preprocessing/src/IR/__init__.py +1 -0
- bm_preprocessing/src/IR/finals.py +14 -0
- bm_preprocessing/src/IR/pagerank.py +109 -0
- bm_preprocessing/src/IR/recommenders_pca.py +487 -0
- bm_preprocessing/src/IR/test.py +14 -0
- bm_preprocessing/src/PY/__init__.py +1 -0
- bm_preprocessing/src/PY/lib_doc.py +295 -0
- bm_preprocessing/src/PY/python_doc.py +177 -0
- bm_preprocessing/src/__init__.py +1 -0
- itertoolkit-1.5.0.dist-info/METADATA +120 -0
- itertoolkit-1.5.0.dist-info/RECORD +45 -0
- itertoolkit-1.5.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Data-mining quick reference.
|
|
2
|
+
|
|
3
|
+
Printing bm_preprocessing.DM.dm_doc displays this source file.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from pandas import DataFrame

# Build the sample frame directly. The original rebound ``df`` to a plain
# dict, whose list values have no ``.mean()`` / ``.unique()`` and which has no
# ``.head()``; it also imported ``display``, a name pandas does not export.
df = DataFrame(
    {
        "col1": [1, 2, 3],
        "col2": ["a", "b", "c"],
    }
)

# Mean of the numeric column (the value is discarded, as in the original).
df["col1"].mean()

# Distinct values of the string column.
print(df["col2"].unique())

# ``display`` is a notebook helper rather than a pandas function, so the
# first rows are shown with ``print`` instead.
print(df.head())
|
|
@@ -0,0 +1,378 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""PS15.ipynb
|
|
3
|
+
|
|
4
|
+
Automatically generated by Colab.
|
|
5
|
+
|
|
6
|
+
Original file is located at
|
|
7
|
+
https://colab.research.google.com/drive/1L11gBakFRdb1QWPyk4d0e8MxgnAN8WVE
|
|
8
|
+
|
|
9
|
+
## Own Impl
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
|
|
14
|
+
# 1. PARSE INPUT FILES
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def parse_data(filepath):
    """Read a sequence database from a text file.

    Every non-blank line is one sequence. Each ``{...}`` group on the line is
    an itemset whose comma-separated members are parsed as integers.

    Args:
        filepath: Path of the file to read.

    Returns:
        A list of sequences, each a list of ``frozenset`` of int in the order
        the groups appear on the line. Lines with no ``{...}`` group
        contribute nothing.

    Raises:
        ValueError: If a member of a group is not an integer literal.
    """
    parsed = []
    with open(filepath) as handle:
        for raw in handle:
            text = raw.strip()
            if not text:
                continue
            # One frozenset per brace group, in order of appearance.
            sequence = [
                frozenset(int(token.strip()) for token in group.split(","))
                for group in re.findall(r"\{([^}]+)\}", text)
            ]
            if sequence:
                parsed.append(sequence)
    return parsed
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def parse_params(filepath):
    """Read per-item MIS values and the SDC value from a parameter file.

    Recognised line shapes (matched at the start of the stripped line):
    ``MIS(<int>) = <number>`` and ``SDC = <number>``. Any other line is
    ignored.

    Args:
        filepath: Path of the parameter file.

    Returns:
        A ``(mis, sdc)`` tuple: ``mis`` maps int item ids to float thresholds,
        and ``sdc`` is a float, or ``None`` when no SDC line was found.
    """
    thresholds = {}
    constraint = None
    with open(filepath) as handle:
        for text in (raw.strip() for raw in handle):
            if not text:
                continue
            hit = re.match(r"MIS\((\d+)\)\s*=\s*([\d.]+)", text)
            if hit:
                thresholds[int(hit.group(1))] = float(hit.group(2))
            else:
                hit = re.match(r"SDC\s*=\s*([\d.]+)", text)
                if hit:
                    # A later SDC line overrides an earlier one.
                    constraint = float(hit.group(1))
    return thresholds, constraint
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# 2. SUPPORT COUNTING
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def is_subsequence(pattern, sequence):
    """Check whether ``pattern`` occurs in ``sequence`` with order preserved.

    Pattern elements are matched greedily, left to right: each one must be a
    subset of a distinct itemset of ``sequence``, and the matched itemsets
    must appear in the same order as the pattern elements.

    Args:
        pattern: List of frozensets to look for.
        sequence: List of itemsets (sets/frozensets) to search in.

    Returns:
        ``True`` if every pattern element was matched, otherwise ``False``.
        An empty pattern is contained in every sequence.
    """
    needed = len(pattern)
    if needed == 0:
        # Nothing to match; the original raised IndexError on pattern[0] here.
        return True
    matched = 0
    for itemset in sequence:
        if pattern[matched].issubset(itemset):
            matched += 1
            if matched == needed:
                return True
    return False
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def count_support(pattern, sequences):
    """Return how many of ``sequences`` contain ``pattern``.

    Containment is decided by ``is_subsequence``; each sequence counts at
    most once.
    """
    hits = 0
    for candidate in sequences:
        if is_subsequence(pattern, candidate):
            hits += 1
    return hits
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# 3. MS-GSP CHECKS
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def min_mis(pattern, mis_map):
    """Return the smallest MIS value among all items of ``pattern``.

    Args:
        pattern: List of itemsets.
        mis_map: Mapping from item to MIS value; items without an entry
            count as 1.0.

    Raises:
        ValueError: If the pattern contains no items at all.
    """
    thresholds = []
    for itemset in pattern:
        thresholds.extend(mis_map.get(item, 1.0) for item in itemset)
    return min(thresholds)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def check_sdc(pattern, mis_map, sdc):
    """Check that max(MIS of items) - min(MIS of items) <= SDC.

    Args:
        pattern: List of itemsets.
        mis_map: Mapping from item to MIS value; items without an entry
            count as 1.0.
        sdc: Maximum allowed spread, or ``None`` when the parameter file had
            no SDC line, in which case no constraint is applied.

    Returns:
        ``True`` if the pattern satisfies the constraint. A pattern with no
        items satisfies it trivially.
    """
    if sdc is None:
        # parse_params returns None for a missing SDC line; comparing a float
        # with None would raise TypeError.
        return True
    mis_vals = [mis_map.get(item, 1.0) for itemset in pattern for item in itemset]
    if not mis_vals:
        return True
    spread = max(mis_vals) - min(mis_vals)
    # Binary floats make e.g. 0.4 - 0.3 slightly larger than 0.1, so allow a
    # tiny absolute tolerance when the spread equals the limit.
    return spread - sdc <= 1e-9
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def is_frequent(pattern, sequences, mis_map, sdc, n):
    """Decide whether ``pattern`` is frequent and report its support count.

    The pattern must pass ``check_sdc`` first. Its relative support
    (count divided by ``n``) must then reach the smallest MIS value of its
    items.

    Args:
        pattern: List of itemsets.
        sequences: The sequence database.
        mis_map: Mapping from item to MIS value.
        sdc: Spread limit forwarded to ``check_sdc``.
        n: Number used as the support denominator.

    Returns:
        ``(flag, count)``. When the SDC check fails the support is not
        counted and ``(False, 0)`` is returned.
    """
    if check_sdc(pattern, mis_map, sdc):
        occurrences = count_support(pattern, sequences)
        support = occurrences / n
        threshold = min_mis(pattern, mis_map)
        return support >= threshold, occurrences
    return False, 0
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# 4. CANDIDATE GENERATION
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def get_all_items(sequences):
    """Return every distinct item occurring in ``sequences``, sorted ascending."""
    distinct = {
        item
        for sequence in sequences
        for itemset in sequence
        for item in itemset
    }
    return sorted(distinct)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def generate_size1_candidates(sequences, mis_map, sdc, n):
    """Return the frequent single-item patterns with their support counts.

    Items are visited in ascending MIS order (missing MIS counts as 1.0); ties
    keep the ascending item order produced by ``get_all_items``.

    Returns:
        List of ``(pattern, count)`` tuples where ``pattern`` is a one-element
        list holding a one-item frozenset.
    """
    ordered = sorted(
        get_all_items(sequences), key=lambda item: mis_map.get(item, 1.0)
    )
    singles = ([frozenset([item])] for item in ordered)
    verdicts = (
        (single, is_frequent(single, sequences, mis_map, sdc, n))
        for single in singles
    )
    return [(single, count) for single, (ok, count) in verdicts if ok]
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def join_sequences(s1, s2):
    """Join two k-sequences into (k+1)-sequence candidates.

    Both sequences are flattened, with items sorted inside each itemset. They
    join only when ``s1`` without its first item equals ``s2`` without its
    last item. The last item of ``s2`` is then attached to ``s1`` in up to
    two ways.

    Args:
        s1: List of frozensets (left operand).
        s2: List of frozensets (right operand).

    Returns:
        A list with zero, one or two candidate sequences (lists of frozensets).
    """
    head = [item for itemset in s1 for item in sorted(itemset)]
    tail = [item for itemset in s2 for item in sorted(itemset)]

    # Core join condition on the flattened forms.
    if head[1:] != tail[:-1]:
        return []

    appended = tail[-1]

    # Variant 1: the new item forms its own trailing itemset.
    joined = [s1 + [frozenset([appended])]]

    # Variant 2: the new item is merged into the last itemset of s1, but only
    # when it is larger than every item already in that itemset.
    closing = s1[-1]
    if appended > max(closing):
        joined.append(s1[:-1] + [frozenset(closing | {appended})])

    return joined
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def generate_candidates(freq_prev, mis_map, sdc):
    """Build the next level of candidates from the previous frequent patterns.

    Every ordered pair of previous patterns (including a pattern with itself)
    is passed to ``join_sequences``. Each distinct result is kept once, and
    only if it passes ``check_sdc``.

    Args:
        freq_prev: List of ``(pattern, count)`` tuples from the previous level.
        mis_map: Mapping from item to MIS value.
        sdc: Spread limit forwarded to ``check_sdc``.

    Returns:
        List of candidate patterns in first-seen order.
    """
    previous = [pattern for pattern, _count in freq_prev]
    emitted = set()
    accepted = []

    for left in previous:
        for right in previous:
            for joined in join_sequences(left, right):
                # Hashable, order-stable signature used for de-duplication.
                signature = tuple(tuple(sorted(itemset)) for itemset in joined)
                if signature in emitted:
                    continue
                emitted.add(signature)
                if check_sdc(joined, mis_map, sdc):
                    accepted.append(joined)
    return accepted
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# 5. MAIN MS-GSP ALGORITHM
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def msgsp(data_file, param_file):
    """Run the level-wise mining loop and return all frequent patterns.

    Loads the sequences and parameters, finds the frequent 1-sequences, then
    repeatedly generates and tests larger candidates until a level yields no
    frequent pattern. Progress is printed for every level.

    Args:
        data_file: Path passed to ``parse_data``.
        param_file: Path passed to ``parse_params``.

    Returns:
        List of ``(pattern, count)`` tuples, ordered by level.
    """
    sequences = parse_data(data_file)
    mis_map, sdc = parse_params(param_file)
    n = len(sequences)

    print(f"Loaded {n} sequences | SDC = {sdc}")
    print(f"MIS values: {mis_map}\n")

    level = generate_size1_candidates(sequences, mis_map, sdc, n)
    print(f"Frequent 1-sequences: {len(level)}")
    found = list(level)

    size = 2
    while level:
        candidates = generate_candidates(level, mis_map, sdc)
        verdicts = [
            (cand, is_frequent(cand, sequences, mis_map, sdc, n))
            for cand in candidates
        ]
        # Keep only the candidates that turned out to be frequent.
        level = [(cand, count) for cand, (ok, count) in verdicts if ok]
        print(f"Frequent {size}-sequences: {len(level)}")
        found.extend(level)
        size += 1

    return found
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
# 6. OUTPUT
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def format_pattern(pattern):
    """Render a pattern as text, e.g. ``<{10,40}{5}>``.

    Items inside each itemset are printed in ascending order, separated by
    commas; itemsets follow one another without a separator.
    """
    pieces = ["<"]
    for itemset in pattern:
        members = ",".join(map(str, sorted(itemset)))
        pieces.append("{" + members + "}")
    pieces.append(">")
    return "".join(pieces)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def print_results(frequent):
    """Print a header followed by one line per ``(pattern, count)`` entry.

    When ``frequent`` is empty a single "no patterns" line is printed instead.
    """
    print("\n=== Frequent Sequential Patterns ===")
    if frequent:
        for pattern, count in frequent:
            print(f"Pattern :{format_pattern(pattern)} count: {count}")
    else:
        print("No frequent patterns found.")
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
# 7. RUN
|
|
210
|
+
|
|
211
|
+
if __name__ == "__main__":
    # Script entry point for the hand-written implementation: mine the
    # patterns and print them.
    # NOTE(review): both paths are hard-coded absolute locations, presumably
    # inside a mounted Google Drive in Colab; adjust them before running
    # elsewhere.
    data_txt = (
        "/content/drive/MyDrive/Academic_Resources/Semester 08/20XW87/PS15/data.txt"
    )
    para_txt = (
        "/content/drive/MyDrive/Academic_Resources/Semester 08/20XW87/PS15/para.txt"
    )
    frequent = msgsp(data_txt, para_txt)
    print_results(frequent)
|
|
220
|
+
|
|
221
|
+
"""## Lib Impl"""
|
|
222
|
+
|
|
223
|
+
"""
|
|
224
|
+
MS-GSP using gsppy
|
|
225
|
+
Strategy:
|
|
226
|
+
1. Run gsppy.GSP with the lowest possible MIS as threshold (catches all candidates)
|
|
227
|
+
2. Post-filter results using per-item MIS and SDC constraints
|
|
228
|
+
"""
|
|
229
|
+
|
|
230
|
+
# ── In Colab, run first: !pip install gsppy ──────────────────────────────────
|
|
231
|
+
import re
|
|
232
|
+
|
|
233
|
+
from gsppy.gsp import GSP
|
|
234
|
+
|
|
235
|
+
# 1. PARSE INPUT FILES
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def parse_data(filepath):
    """Parse the data file into flat and structured sequence lists.

    Every non-blank line with at least one ``{...}`` group yields one entry
    in both returned lists, at the same index.

    Args:
        filepath: Path of the file to read.

    Returns:
        ``(sequences_flat, sequences_struct)``:

        * ``sequences_flat`` - per sequence, one list of ints made by
          concatenating the ascending-sorted items of each itemset
          (e.g. ``[10, 40, 50, 40, 90]``). Itemset boundaries are lost.
        * ``sequences_struct`` - per sequence, a list of ``frozenset`` of int
          (e.g. ``[{10, 40, 50}, {40, 90}]``).
    """
    flat_sequences = []
    structured_sequences = []

    with open(filepath) as handle:
        for raw in handle:
            text = raw.strip()
            if not text:
                continue
            structured = [
                frozenset(int(token.strip()) for token in group.split(","))
                for group in re.findall(r"\{([^}]+)\}", text)
            ]
            if not structured:
                continue
            structured_sequences.append(structured)
            # Flat form: items of each itemset in ascending order, itemsets
            # concatenated in their original order.
            flat_sequences.append(
                [item for itemset in structured for item in sorted(itemset)]
            )

    return flat_sequences, structured_sequences
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def parse_params(filepath):
    """Read per-item MIS values and the SDC value from a parameter file.

    Lines starting with ``MIS(<int>) = <number>`` fill the MIS mapping, and a
    line starting with ``SDC = <number>`` sets the SDC. Anything else is
    ignored.

    Returns:
        ``(mis, sdc)`` where ``mis`` maps int item ids to floats and ``sdc``
        is a float, or ``None`` if no SDC line was present.
    """
    thresholds = {}
    constraint = None
    with open(filepath) as handle:
        for raw in handle:
            entry = raw.strip()
            mis_hit = re.match(r"MIS\((\d+)\)\s*=\s*([\d.]+)", entry)
            if mis_hit is not None:
                item, value = mis_hit.groups()
                thresholds[int(item)] = float(value)
                continue
            sdc_hit = re.match(r"SDC\s*=\s*([\d.]+)", entry)
            if sdc_hit is not None:
                constraint = float(sdc_hit.group(1))
    return thresholds, constraint
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
# 2. MS-GSP FILTERS (applied to gsppy output)
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def min_mis(pattern_items, mis_map):
    """Return the smallest MIS value among ``pattern_items``.

    Items missing from ``mis_map`` count as 1.0. Raises ``ValueError`` when
    ``pattern_items`` is empty.
    """
    thresholds = [mis_map.get(item, 1.0) for item in pattern_items]
    return min(thresholds)
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def check_sdc(pattern_items, mis_map, sdc):
    """Check that the MIS spread of ``pattern_items`` does not exceed ``sdc``.

    Args:
        pattern_items: Flat iterable of items.
        mis_map: Mapping from item to MIS value; items without an entry
            count as 1.0.
        sdc: Maximum allowed ``max(MIS) - min(MIS)``, or ``None`` when the
            parameter file had no SDC line, in which case no constraint is
            applied.

    Returns:
        ``True`` if the constraint holds. An empty item list satisfies it
        trivially.
    """
    if sdc is None:
        # parse_params returns None for a missing SDC line; comparing a float
        # with None would raise TypeError.
        return True
    vals = [mis_map.get(item, 1.0) for item in pattern_items]
    if not vals:
        return True
    spread = max(vals) - min(vals)
    # Binary floats make e.g. 0.4 - 0.3 slightly larger than 0.1, so allow a
    # tiny absolute tolerance when the spread equals the limit.
    return spread - sdc <= 1e-9
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def count_support_struct(pattern_flat, sequences_struct):
    """Count the structured sequences containing ``pattern_flat`` in order.

    Each structured sequence is flattened (items sorted inside every itemset,
    itemsets concatenated), and ``pattern_flat`` must be an ordered, not
    necessarily contiguous, subsequence of that flat list. Itemset boundaries
    play no role in the match.

    Args:
        pattern_flat: Sequence of items to look for.
        sequences_struct: List of sequences, each a list of itemsets.

    Returns:
        Number of sequences that contain the pattern. An empty pattern is
        contained in every sequence.
    """
    total = 0
    for seq in sequences_struct:
        stream = (item for itemset in seq for item in sorted(itemset))
        # ``wanted in stream`` consumes the stream up to and including the
        # first match, so successive lookups enforce left-to-right order.
        if all(wanted in stream for wanted in pattern_flat):
            total += 1
    return total
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
# 3. RUN GSPPY + APPLY MS-GSP FILTERING
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def run_msgsp(data_file, param_file):
    """Mine patterns with gsppy, then filter them by per-item MIS and SDC.

    gsppy is run once with the smallest MIS value as its support threshold.
    Each returned pattern is kept only if it passes ``check_sdc`` and its
    relative support reaches the smallest MIS among its own items.

    Args:
        data_file: Path passed to ``parse_data``.
        param_file: Path passed to ``parse_params``.

    Returns:
        List of ``(pattern_tuple, count)`` in the order gsppy reported them.

    Raises:
        ValueError: If the parameter file defines no MIS value.
    """
    sequences_flat, _sequences_struct = parse_data(data_file)
    mis_map, sdc = parse_params(param_file)
    n = len(sequences_flat)

    print(f"Loaded {n} sequences | SDC = {sdc}")
    print(f"MIS values: {mis_map}\n")

    # The lowest MIS is the loosest threshold, so no pattern that could pass
    # the per-item check below is discarded by gsppy.
    floor = min(mis_map.values())
    print(f"Running gsppy with global min_support = {floor} (lowest MIS)")
    print("─" * 50)

    # Per the original author's note, gsppy returns a list of dicts, one per
    # pattern length, e.g. [{(10,): 4, (40,): 3}, {(10, 40): 2}, ...].
    levels = GSP(sequences_flat).search(floor)

    def passes(pattern_tuple, count):
        """Apply the SDC filter, then the per-pattern minimum-MIS filter."""
        items = list(pattern_tuple)
        if not check_sdc(items, mis_map, sdc):
            return False
        threshold = min_mis(items, mis_map)
        return count / n >= threshold

    return [
        (pattern_tuple, count)
        for level in levels
        for pattern_tuple, count in level.items()
        if passes(pattern_tuple, count)
    ]
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
# 4. OUTPUT
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def format_pattern(pattern_tuple):
    """Render a flat pattern with one itemset per item: (10, 40) -> <{10}{40}>."""
    body = "".join(f"{{{item!s}}}" for item in pattern_tuple)
    return f"<{body}>"
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def print_results(ms_gsp_patterns):
    """Print a header followed by one line per ``(pattern, count)`` entry.

    When ``ms_gsp_patterns`` is empty a single "no patterns" line is printed
    instead.
    """
    print("\n=== MS-GSP Frequent Sequential Patterns ===")
    if ms_gsp_patterns:
        for pattern, count in ms_gsp_patterns:
            print(f"Pattern :{format_pattern(pattern)} count: {count}")
    else:
        print("No frequent patterns found.")
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
# 5. RUN
|
|
369
|
+
|
|
370
|
+
if __name__ == "__main__":
    # Script entry point for the gsppy-based variant: mine, filter and print.
    # NOTE(review): both paths are hard-coded absolute locations, presumably
    # inside a mounted Google Drive in Colab; adjust them before running
    # elsewhere.
    data_txt = (
        "/content/drive/MyDrive/Academic_Resources/Semester 08/20XW87/PS15/data.txt"
    )
    para_txt = (
        "/content/drive/MyDrive/Academic_Resources/Semester 08/20XW87/PS15/para.txt"
    )
    patterns = run_msgsp(data_txt, para_txt)
    print_results(patterns)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Data-mining Test quick reference.
|
|
2
|
+
|
|
3
|
+
Printing bm_preprocessing.DM.dm_doc displays this source file.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from pandas import DataFrame

# Build the sample frame directly. The original rebound ``df`` to a plain
# dict, whose list values have no ``.mean()`` / ``.unique()`` and which has no
# ``.head()``; it also imported ``display``, a name pandas does not export.
df = DataFrame(
    {
        "col1": [1, 2, 3],
        "col2": ["a", "b", "c"],
    }
)

# Mean of the numeric column (the value is discarded, as in the original).
df["col1"].mean()

# Distinct values of the string column.
print(df["col2"].unique())

# ``display`` is a notebook helper rather than a pandas function, so the
# first rows are shown with ``print`` instead.
print(df.head())
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Finals source snippets."""
|