itertoolkit 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. bm_preprocessing/__init__.py +14 -0
  2. bm_preprocessing/importer/DM/__init__.py +7 -0
  3. bm_preprocessing/importer/DM/agg.py +6 -0
  4. bm_preprocessing/importer/DM/dbscan.py +6 -0
  5. bm_preprocessing/importer/DM/finals.py +6 -0
  6. bm_preprocessing/importer/DM/gsp.py +6 -0
  7. bm_preprocessing/importer/DM/test.py +6 -0
  8. bm_preprocessing/importer/Finals/__init__.py +7 -0
  9. bm_preprocessing/importer/Finals/kaadhal.py +6 -0
  10. bm_preprocessing/importer/Finals/raaka.py +6 -0
  11. bm_preprocessing/importer/Finals/seedan.py +6 -0
  12. bm_preprocessing/importer/Finals/vikram.py +6 -0
  13. bm_preprocessing/importer/IR/__init__.py +6 -0
  14. bm_preprocessing/importer/IR/finals.py +6 -0
  15. bm_preprocessing/importer/IR/pagerank.py +6 -0
  16. bm_preprocessing/importer/IR/recommenders_pca.py +8 -0
  17. bm_preprocessing/importer/IR/test.py +6 -0
  18. bm_preprocessing/importer/PY/__init__.py +4 -0
  19. bm_preprocessing/importer/PY/lib_doc.py +6 -0
  20. bm_preprocessing/importer/PY/python_doc.py +6 -0
  21. bm_preprocessing/importer/__init__.py +8 -0
  22. bm_preprocessing/importer/_module_printer.py +23 -0
  23. bm_preprocessing/src/DM/__init__.py +1 -0
  24. bm_preprocessing/src/DM/agg.py +267 -0
  25. bm_preprocessing/src/DM/dbscan.py +218 -0
  26. bm_preprocessing/src/DM/finals.py +19 -0
  27. bm_preprocessing/src/DM/gsp.py +378 -0
  28. bm_preprocessing/src/DM/test.py +19 -0
  29. bm_preprocessing/src/Finals/__init__.py +1 -0
  30. bm_preprocessing/src/Finals/kaadhal.py +1453 -0
  31. bm_preprocessing/src/Finals/raaka.py +1338 -0
  32. bm_preprocessing/src/Finals/seedan.py +1173 -0
  33. bm_preprocessing/src/Finals/vikram.py +520 -0
  34. bm_preprocessing/src/IR/__init__.py +1 -0
  35. bm_preprocessing/src/IR/finals.py +14 -0
  36. bm_preprocessing/src/IR/pagerank.py +109 -0
  37. bm_preprocessing/src/IR/recommenders_pca.py +487 -0
  38. bm_preprocessing/src/IR/test.py +14 -0
  39. bm_preprocessing/src/PY/__init__.py +1 -0
  40. bm_preprocessing/src/PY/lib_doc.py +295 -0
  41. bm_preprocessing/src/PY/python_doc.py +177 -0
  42. bm_preprocessing/src/__init__.py +1 -0
  43. itertoolkit-1.5.0.dist-info/METADATA +120 -0
  44. itertoolkit-1.5.0.dist-info/RECORD +45 -0
  45. itertoolkit-1.5.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,19 @@
1
+ """Data-mining quick reference.
2
+
3
+ Printing bm_preprocessing.DM.dm_doc displays this source file.
4
+ """
5
+
6
from pandas import DataFrame

# Build the frame directly from the column data. Keeping `df` a real
# DataFrame (rather than rebinding it to a plain dict) is what makes
# .mean(), .unique() and .head() below available.
df = DataFrame(
    {
        "col1": [1, 2, 3],
        "col2": ["a", "b", "c"],
    }
)

df["col1"].mean()

print(df["col2"].unique())

# pandas does not export `display` (it is an IPython/notebook helper), so
# print() is used to keep this snippet importable as a plain module.
print(df.head())
@@ -0,0 +1,378 @@
1
+ # -*- coding: utf-8 -*-
2
+ """PS15.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1L11gBakFRdb1QWPyk4d0e8MxgnAN8WVE
8
+
9
+ ## Own Impl
10
+ """
11
+
12
+ import re
13
+
14
+ # 1. PARSE INPUT FILES
15
+
16
+
17
def parse_data(filepath):
    """Read a sequence database from a text file.

    Every non-blank line is one sequence. Each ``{a, b, ...}`` group found on
    the line becomes a frozenset of ints; lines with no brace group at all
    contribute nothing.

    Returns:
        A list of sequences, each a list of frozensets in file order.
    """
    database = []
    with open(filepath) as handle:
        for raw in handle:
            text = raw.strip()
            if not text:
                continue
            elements = [
                frozenset(int(token.strip()) for token in group.split(","))
                for group in re.findall(r"\{([^}]+)\}", text)
            ]
            # A line without any {...} group yields no elements and is dropped.
            if elements:
                database.append(elements)
    return database
32
+
33
+
34
def parse_params(filepath):
    """Read per-item MIS values and the SDC from a parameter file.

    Recognised lines (matched from the start of the stripped line) look like
    ``MIS(<int>) = <number>`` and ``SDC = <number>``; blank and unrecognised
    lines are ignored. If several SDC lines exist, the last one wins.

    Returns:
        ``(mis, sdc)``: ``mis`` maps int item -> float MIS, ``sdc`` is a float
        or ``None`` when the file contains no SDC line.
    """
    thresholds = {}
    sdc_value = None
    with open(filepath) as handle:
        for raw in handle:
            text = raw.strip()
            if not text:
                continue
            mis_hit = re.match(r"MIS\((\d+)\)\s*=\s*([\d.]+)", text)
            if mis_hit:
                thresholds[int(mis_hit.group(1))] = float(mis_hit.group(2))
            else:
                sdc_hit = re.match(r"SDC\s*=\s*([\d.]+)", text)
                if sdc_hit:
                    sdc_value = float(sdc_hit.group(1))
    return thresholds, sdc_value
50
+
51
+
52
+ # 2. SUPPORT COUNTING
53
+
54
+
55
def is_subsequence(pattern, sequence):
    """Check if pattern (list of frozensets) occurs in sequence (in order).

    Each pattern element must be a subset of a distinct itemset of
    ``sequence``, and the matched itemsets must appear in the same order as
    the pattern elements. Matching is greedy: every pattern element is bound
    to the earliest itemset that can still hold it.

    Args:
        pattern: list of frozensets to look for.
        sequence: list of itemsets (sets/frozensets) to search in.

    Returns:
        True when the pattern is contained in the sequence. An empty pattern
        is contained in every sequence, including an empty one.
    """
    # Without this guard pattern[0] below raises IndexError for an empty
    # pattern, and an empty sequence would wrongly report False.
    if not pattern:
        return True

    matched = 0
    for itemset in sequence:
        if pattern[matched].issubset(itemset):
            matched += 1
            if matched == len(pattern):
                return True
    return False
64
+
65
+
66
def count_support(pattern, sequences):
    """Return how many of ``sequences`` contain ``pattern`` per is_subsequence."""
    supporting = 0
    for candidate_seq in sequences:
        if is_subsequence(pattern, candidate_seq):
            supporting += 1
    return supporting
68
+
69
+
70
+ # 3. MS-GSP CHECKS
71
+
72
+
73
def min_mis(pattern, mis_map):
    """Minimum MIS among all items in the pattern.

    ``pattern`` is an iterable of itemsets; items missing from ``mis_map``
    count as 1.0. Raises ValueError when the pattern holds no items.
    """
    return min(
        mis_map.get(item, 1.0)
        for itemset in pattern
        for item in itemset
    )
77
+
78
+
79
def check_sdc(pattern, mis_map, sdc):
    """max(MIS of items) - min(MIS of items) <= SDC.

    Args:
        pattern: iterable of itemsets whose items are looked up in mis_map.
        mis_map: dict item -> MIS; items not present count as 1.0.
        sdc: maximum allowed spread between the largest and smallest MIS, or
            None for "no constraint".

    Returns:
        True when the pattern satisfies the support-difference constraint.
    """
    # parse_params yields sdc=None when the parameter file has no SDC line;
    # comparing a float against None would raise TypeError, so a missing SDC
    # is treated as an unconstrained spread.
    if sdc is None:
        return True

    mis_vals = [mis_map.get(item, 1.0) for itemset in pattern for item in itemset]
    # A pattern without items has no spread to violate (and max([]) would raise).
    if not mis_vals:
        return True
    return (max(mis_vals) - min(mis_vals)) <= sdc
84
+
85
+
86
def is_frequent(pattern, sequences, mis_map, sdc, n):
    """Decide whether ``pattern`` is frequent under its MIS and the SDC.

    The SDC check runs first; when it fails, support is not counted and
    ``(False, 0)`` is returned. Otherwise the pattern's support count is
    divided by ``n`` and compared against the pattern's minimum MIS.

    Returns:
        ``(flag, count)`` where ``count`` is the support count (0 if the SDC
        check failed).
    """
    if check_sdc(pattern, mis_map, sdc):
        support_count = count_support(pattern, sequences)
        relative_support = support_count / n
        return relative_support >= min_mis(pattern, mis_map), support_count
    return False, 0
92
+
93
+
94
+ # 4. CANDIDATE GENERATION
95
+
96
+
97
def get_all_items(sequences):
    """Return every distinct item of the database as an ascending list."""
    distinct = {
        item
        for seq in sequences
        for itemset in seq
        for item in itemset
    }
    return sorted(distinct)
103
+
104
+
105
def generate_size1_candidates(sequences, mis_map, sdc, n):
    """Return the frequent single-item patterns with their support counts.

    Items are visited in ascending MIS order (items missing from ``mis_map``
    count as 1.0; ties keep ascending item order because the sort is stable).

    Returns:
        A list of ``(pattern, count)`` tuples, where ``pattern`` is a
        one-element list holding a single-item frozenset.
    """

    def mis_of(item):
        return mis_map.get(item, 1.0)

    frequent = []
    for item in sorted(get_all_items(sequences), key=mis_of):
        singleton = [frozenset([item])]
        accepted, support_count = is_frequent(singleton, sequences, mis_map, sdc, n)
        if accepted:
            frequent.append((singleton, support_count))
    return frequent
114
+
115
+
116
def join_sequences(s1, s2):
    """Join two k-sequences to produce (k+1)-sequence candidates.

    Both sequences are flattened (items of each itemset in ascending order).
    They join only when ``s1`` without its first item equals ``s2`` without
    its last item. The last flattened item of ``s2`` is then attached to
    ``s1`` in up to two ways:

    * always as a new trailing single-item itemset;
    * additionally merged into the last itemset of ``s1``, but only when it is
      greater than every item already in that itemset.

    Returns:
        A list with zero, one or two candidate sequences (lists of frozensets).
    """
    flat_first = [item for itemset in s1 for item in sorted(itemset)]
    flat_second = [item for itemset in s2 for item in sorted(itemset)]

    # Join condition: the overlap of the two flattened sequences must agree.
    if flat_first[1:] != flat_second[:-1]:
        return []

    appended = flat_second[-1]
    tail = s1[-1]

    # Form 1: the appended item starts its own itemset.
    joined = [s1 + [frozenset([appended])]]

    # Form 2: the appended item extends the final itemset of s1.
    if appended > max(tail):
        joined.append(s1[:-1] + [frozenset(tail | {appended})])

    return joined
137
+
138
+
139
def generate_candidates(freq_prev, mis_map, sdc):
    """Build the next level's candidates from the previous frequent patterns.

    Every ordered pair of previous patterns (including a pattern with itself)
    is passed to join_sequences. Each distinct result is considered once and
    kept only if it passes check_sdc.

    Args:
        freq_prev: list of ``(pattern, count)`` tuples; counts are ignored.

    Returns:
        A list of candidate patterns in generation order.
    """
    patterns = [pattern for pattern, _count in freq_prev]
    already_seen = set()
    kept = []

    for left in patterns:
        for right in patterns:
            for cand in join_sequences(left, right):
                # Hashable, order-normalised form used for de-duplication.
                signature = tuple(tuple(sorted(itemset)) for itemset in cand)
                if signature in already_seen:
                    continue
                already_seen.add(signature)
                if check_sdc(cand, mis_map, sdc):
                    kept.append(cand)
    return kept
153
+
154
+
155
+ # 5. MAIN MS-GSP ALGORITHM
156
+
157
+
158
def msgsp(data_file, param_file):
    """Run the level-wise MS-GSP search and return all frequent patterns.

    Loads the sequence database and the MIS/SDC parameters, finds the
    frequent 1-sequences, then repeatedly generates and tests candidates from
    the previous level until a level produces no frequent pattern. Progress is
    printed for every level.

    Returns:
        A list of ``(pattern, count)`` tuples across all levels, in the order
        they were found.
    """
    sequences = parse_data(data_file)
    mis_map, sdc = parse_params(param_file)
    n = len(sequences)

    print(f"Loaded {n} sequences | SDC = {sdc}")
    print(f"MIS values: {mis_map}\n")

    level = generate_size1_candidates(sequences, mis_map, sdc, n)
    print(f"Frequent 1-sequences: {len(level)}")
    all_frequent = list(level)

    k = 2
    while level:
        next_level = []
        for cand in generate_candidates(level, mis_map, sdc):
            accepted, support_count = is_frequent(cand, sequences, mis_map, sdc, n)
            if accepted:
                next_level.append((cand, support_count))
        print(f"Frequent {k}-sequences: {len(next_level)}")
        all_frequent += next_level
        level = next_level
        k += 1

    return all_frequent
188
+
189
+
190
+ # 6. OUTPUT
191
+
192
+
193
def format_pattern(pattern):
    """Render a pattern as ``<{a,b}{c}>`` with each itemset's items sorted."""
    pieces = []
    for itemset in pattern:
        members = ",".join(map(str, sorted(itemset)))
        pieces.append("{" + members + "}")
    return "<" + "".join(pieces) + ">"
198
+
199
+
200
def print_results(frequent):
    """Print a header, then one line per ``(pattern, count)`` entry.

    When ``frequent`` is empty a single "no patterns" line is printed instead.
    """
    print("\n=== Frequent Sequential Patterns ===")
    if frequent:
        for pattern, count in frequent:
            print(f"Pattern :{format_pattern(pattern)} count: {count}")
    else:
        print("No frequent patterns found.")
207
+
208
+
209
+ # 7. RUN
210
+
211
if __name__ == "__main__":
    # Script entry point: the inputs are hard-coded Colab-style Google Drive
    # paths, so this only runs where that directory layout exists.
    data_txt = "/content/drive/MyDrive/Academic_Resources/Semester 08/20XW87/PS15/data.txt"
    para_txt = "/content/drive/MyDrive/Academic_Resources/Semester 08/20XW87/PS15/para.txt"

    frequent = msgsp(data_txt, para_txt)
    print_results(frequent)
220
+
221
+ """## Lib Impl"""
222
+
223
+ """
224
+ MS-GSP using gsppy
225
+ Strategy:
226
+ 1. Run gsppy.GSP with the lowest possible MIS as threshold (catches all candidates)
227
+ 2. Post-filter results using per-item MIS and SDC constraints
228
+ """
229
+
230
+ # ── In Colab, run first: !pip install gsppy ──────────────────────────────────
231
+ import re
232
+
233
+ from gsppy.gsp import GSP
234
+
235
+ # 1. PARSE INPUT FILES
236
+
237
+
238
def parse_data(filepath):
    """Parse data.txt into two parallel views of the sequence database.

    Every non-blank line is one sequence; each ``{a, b, ...}`` group on it is
    an itemset of ints. Lines without any brace group are skipped.

    Returns:
        ``(sequences_flat, sequences_struct)``:

        * ``sequences_flat`` - per sequence, one flat list of items (each
          itemset's items in ascending order, itemsets concatenated), e.g.
          ``[10, 40, 50, 40, 90]``; this is the form handed to gsppy.
        * ``sequences_struct`` - per sequence, the list of frozensets, e.g.
          ``[{10, 40, 50}, {40, 90}]``, preserving itemset structure.
    """
    flat_view = []
    struct_view = []

    with open(filepath) as handle:
        for raw in handle:
            text = raw.strip()
            if not text:
                continue
            elements = [
                frozenset(int(token.strip()) for token in group.split(","))
                for group in re.findall(r"\{([^}]+)\}", text)
            ]
            if not elements:
                continue
            struct_view.append(elements)
            flat_view.append(
                [item for element in elements for item in sorted(element)]
            )

    return flat_view, struct_view
263
+
264
+
265
def parse_params(filepath):
    """Read per-item MIS values and the SDC from a parameter file.

    Lines of the form ``MIS(<int>) = <number>`` fill the MIS map and a line
    ``SDC = <number>`` sets the SDC (patterns are matched from the start of
    the stripped line). Any other line is ignored.

    Returns:
        ``(mis, sdc)``; ``sdc`` stays ``None`` if no SDC line is present.
    """
    thresholds = {}
    sdc_value = None
    with open(filepath) as handle:
        for text in map(str.strip, handle):
            found = re.match(r"MIS\((\d+)\)\s*=\s*([\d.]+)", text)
            if found:
                thresholds[int(found.group(1))] = float(found.group(2))
                continue
            found = re.match(r"SDC\s*=\s*([\d.]+)", text)
            if found:
                sdc_value = float(found.group(1))
    return thresholds, sdc_value
279
+
280
+
281
+ # 2. MS-GSP FILTERS (applied to gsppy output)
282
+
283
+
284
def min_mis(pattern_items, mis_map):
    """Return the lowest MIS among ``pattern_items`` (a flat iterable of items).

    Items missing from ``mis_map`` count as 1.0. Raises ValueError when
    ``pattern_items`` is empty.
    """
    thresholds = [mis_map.get(item, 1.0) for item in pattern_items]
    return min(thresholds)
286
+
287
+
288
def check_sdc(pattern_items, mis_map, sdc):
    """Check the support-difference constraint for a flat pattern.

    Args:
        pattern_items: flat iterable of items looked up in mis_map.
        mis_map: dict item -> MIS; items not present count as 1.0.
        sdc: maximum allowed difference between the largest and smallest MIS
            of the pattern's items, or None for "no constraint".

    Returns:
        True when ``max(MIS) - min(MIS) <= sdc``.
    """
    # parse_params leaves sdc as None when the parameter file has no SDC
    # line; `float <= None` would raise TypeError, so treat it as unbounded.
    if sdc is None:
        return True

    vals = [mis_map.get(item, 1.0) for item in pattern_items]
    # No items means no spread to violate (and max([]) would raise).
    if not vals:
        return True
    return (max(vals) - min(vals)) <= sdc
291
+
292
+
293
def count_support_struct(pattern_flat, sequences_struct):
    """Count the structured sequences that contain ``pattern_flat`` in order.

    Each sequence is read as its flattened item stream (itemsets in order,
    items of an itemset ascending). The pattern is matched greedily against
    that stream as an ordered, not necessarily contiguous, subsequence. An
    empty pattern matches every sequence.
    """
    needed = len(pattern_flat)
    total = 0
    for seq in sequences_struct:
        matched = 0
        for itemset in seq:
            for item in sorted(itemset):
                if matched < needed and item == pattern_flat[matched]:
                    matched += 1
        if matched == needed:
            total += 1
    return total
306
+
307
+
308
+ # 3. RUN GSPPY + APPLY MS-GSP FILTERING
309
+
310
+
311
def run_msgsp(data_file, param_file):
    """Mine patterns with gsppy, then keep those passing the MIS/SDC filters.

    gsppy is run once with the smallest MIS value from the parameter file as
    its support threshold. Each returned pattern is then kept only if it
    passes check_sdc and its relative support (gsppy count / number of
    sequences) is not below the minimum MIS of its items.

    Returns:
        A list of ``(pattern_tuple, count)`` in the order gsppy reported them.
    """
    sequences_flat, sequences_struct = parse_data(data_file)
    mis_map, sdc = parse_params(param_file)
    n = len(sequences_flat)

    print(f"Loaded {n} sequences | SDC = {sdc}")
    print(f"MIS values: {mis_map}\n")

    # The lowest MIS is the loosest threshold any pattern may be held to, so
    # it is used for the library run; stricter per-pattern limits follow below.
    global_min_mis = min(mis_map.values())
    print(f"Running gsppy with global min_support = {global_min_mis} (lowest MIS)")
    print("─" * 50)

    # Iterated below as a sequence of mappings {pattern_tuple: count}
    # (per the original author's note: one mapping per pattern length).
    gsp_results = GSP(sequences_flat).search(global_min_mis)

    def passes_filters(items, count):
        # Filter 1: SDC check; Filter 2: support vs. the pattern's min MIS.
        if not check_sdc(items, mis_map, sdc):
            return False
        threshold = min_mis(items, mis_map)
        return not (count / n < threshold)

    return [
        (pattern_tuple, gsppy_count)
        for level_dict in gsp_results
        for pattern_tuple, gsppy_count in level_dict.items()
        if passes_filters(list(pattern_tuple), gsppy_count)
    ]
349
+
350
+
351
+ # 4. OUTPUT
352
+
353
+
354
def format_pattern(pattern_tuple):
    """Render a flat pattern with every item as its own itemset.

    Example: ``(10, 40)`` becomes ``<{10}{40}>``.
    """
    pieces = ["<"]
    for item in pattern_tuple:
        pieces.extend(("{", str(item), "}"))
    pieces.append(">")
    return "".join(pieces)
357
+
358
+
359
def print_results(ms_gsp_patterns):
    """Print a header, then one line per ``(pattern, count)`` entry.

    When ``ms_gsp_patterns`` is empty a single "no patterns" line is printed
    instead.
    """
    print("\n=== MS-GSP Frequent Sequential Patterns ===")
    if ms_gsp_patterns:
        for pattern, count in ms_gsp_patterns:
            print(f"Pattern :{format_pattern(pattern)} count: {count}")
    else:
        print("No frequent patterns found.")
366
+
367
+
368
+ # 5. RUN
369
+
370
if __name__ == "__main__":
    # Script entry point: the inputs are hard-coded Colab-style Google Drive
    # paths, so this only runs where that directory layout exists.
    data_txt = "/content/drive/MyDrive/Academic_Resources/Semester 08/20XW87/PS15/data.txt"
    para_txt = "/content/drive/MyDrive/Academic_Resources/Semester 08/20XW87/PS15/para.txt"

    patterns = run_msgsp(data_txt, para_txt)
    print_results(patterns)
@@ -0,0 +1,19 @@
1
+ """Data-mining Test quick reference.
2
+
3
+ Printing bm_preprocessing.DM.dm_doc displays this source file.
4
+ """
5
+
6
from pandas import DataFrame

# Build the frame directly from the column data. Keeping `df` a real
# DataFrame (rather than rebinding it to a plain dict) is what makes
# .mean(), .unique() and .head() below available.
df = DataFrame(
    {
        "col1": [1, 2, 3],
        "col2": ["a", "b", "c"],
    }
)

df["col1"].mean()

print(df["col2"].unique())

# pandas does not export `display` (it is an IPython/notebook helper), so
# print() is used to keep this snippet importable as a plain module.
print(df.head())
@@ -0,0 +1 @@
1
+ """Finals source snippets."""