bm-preprocessing 0.2.1__tar.gz → 0.4.0__tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. bm_preprocessing-0.4.0/.gitignore +24 -0
  2. {bm_preprocessing-0.2.1 → bm_preprocessing-0.4.0}/PKG-INFO +4 -1
  3. bm_preprocessing-0.4.0/USAGE.md +112 -0
  4. {bm_preprocessing-0.2.1 → bm_preprocessing-0.4.0}/pyproject.toml +4 -1
  5. bm_preprocessing-0.4.0/src/bm_preprocessing/DM/__init__.py +13 -0
  6. {bm_preprocessing-0.2.1 → bm_preprocessing-0.4.0}/src/bm_preprocessing/DM/all.py +5 -5
  7. {bm_preprocessing-0.2.1 → bm_preprocessing-0.4.0}/src/bm_preprocessing/DM/apriori.py +5 -5
  8. {bm_preprocessing-0.2.1 → bm_preprocessing-0.4.0}/src/bm_preprocessing/DM/hash.py +5 -5
  9. bm_preprocessing-0.4.0/src/bm_preprocessing/DM/hunts.py +30 -0
  10. bm_preprocessing-0.4.0/src/bm_preprocessing/DM/hunts_test.py +30 -0
  11. bm_preprocessing-0.4.0/src/bm_preprocessing/DM/id3.py +30 -0
  12. bm_preprocessing-0.4.0/src/bm_preprocessing/DM/id3_test.py +30 -0
  13. {bm_preprocessing-0.2.1 → bm_preprocessing-0.4.0}/src/bm_preprocessing/DM/preprocessing.py +5 -5
  14. {bm_preprocessing-0.2.1 → bm_preprocessing-0.4.0}/src/bm_preprocessing/DM/sources/all.py +16 -14
  15. {bm_preprocessing-0.2.1 → bm_preprocessing-0.4.0}/src/bm_preprocessing/DM/sources/apriori.py +18 -12
  16. bm_preprocessing-0.4.0/src/bm_preprocessing/DM/sources/data.csv +11 -0
  17. {bm_preprocessing-0.2.1 → bm_preprocessing-0.4.0}/src/bm_preprocessing/DM/sources/hash.py +33 -17
  18. bm_preprocessing-0.4.0/src/bm_preprocessing/DM/sources/hunts.py +96 -0
  19. bm_preprocessing-0.4.0/src/bm_preprocessing/DM/sources/hunts_test.py +101 -0
  20. bm_preprocessing-0.4.0/src/bm_preprocessing/DM/sources/id3.py +134 -0
  21. bm_preprocessing-0.4.0/src/bm_preprocessing/DM/sources/id3_test.py +148 -0
  22. {bm_preprocessing-0.2.1 → bm_preprocessing-0.4.0}/src/bm_preprocessing/DM/sources/preprocessing.py +12 -5
  23. bm_preprocessing-0.4.0/src/bm_preprocessing/DM/sources/tennis.csv +15 -0
  24. {bm_preprocessing-0.2.1 → bm_preprocessing-0.4.0}/src/bm_preprocessing/IR/all.py +5 -5
  25. {bm_preprocessing-0.2.1 → bm_preprocessing-0.4.0}/src/bm_preprocessing/IR/sources/all.py +32 -20
  26. {bm_preprocessing-0.2.1 → bm_preprocessing-0.4.0}/src/bm_preprocessing/__init__.py +1 -2
  27. bm_preprocessing-0.2.1/.gitignore +0 -10
  28. bm_preprocessing-0.2.1/src/bm_preprocessing/DM/__init__.py +0 -8
  29. bm_preprocessing-0.2.1/uv.lock +0 -1100
  30. {bm_preprocessing-0.2.1 → bm_preprocessing-0.4.0}/README.md +0 -0
  31. {bm_preprocessing-0.2.1 → bm_preprocessing-0.4.0}/src/bm_preprocessing/IR/__init__.py +0 -0
@@ -0,0 +1,24 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ *.egg
8
+
9
+ # Virtual environment
10
+ .venv/
11
+
12
+ # IDE
13
+ .vscode/
14
+ .idea/
15
+
16
+ # Generated images
17
+ *.png
18
+
19
+ # OS files
20
+ Thumbs.db
21
+ .DS_Store
22
+
23
+ # UV
24
+ uv.lock
@@ -1,9 +1,12 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bm-preprocessing
3
- Version: 0.2.1
3
+ Version: 0.4.0
4
4
  Summary: A package to preprocess text data
5
5
  Requires-Python: >=3.8
6
6
  Requires-Dist: build>=1.2.2.post1
7
+ Requires-Dist: graphviz>=0.20.3
8
+ Requires-Dist: matplotlib>=3.7.5
9
+ Requires-Dist: pandas>=2.0.3
7
10
  Requires-Dist: twine>=6.1.0
8
11
  Description-Content-Type: text/markdown
9
12
 
@@ -0,0 +1,112 @@
1
+ # bm-preprocessing Usage Guide
2
+
3
+ ## Installation
4
+
5
+ ```bash
6
+ pip install bm-preprocessing
7
+ ```
8
+
9
+ ---
10
+
11
+ ## Usage in Python File
12
+
13
+ Create a file `example.py`:
14
+
15
+ ```python
16
+ # Import modules
17
+ from bm_preprocessing.IR import all
18
+ from bm_preprocessing.DM import apriori, hash, hunts, hunts_test, id3, id3_test, preprocessing
19
+
20
+ # Print the source code
21
+ print("=== IR All Module ===")
22
+ print(all)
23
+
24
+ print("\n=== DM Apriori Module ===")
25
+ print(apriori)
26
+
27
+ print("\n=== DM Hash Module ===")
28
+ print(hash)
29
+
30
+ print("\n=== DM Hunts Module ===")
31
+ print(hunts)
32
+
33
+ print("\n=== DM Hunts Test Module ===")
34
+ print(hunts_test)
35
+
36
+ print("\n=== DM ID3 Module ===")
37
+ print(id3)
38
+
39
+ print("\n=== DM ID3 Test Module ===")
40
+ print(id3_test)
41
+
42
+ print("\n=== DM Preprocessing Module ===")
43
+ print(preprocessing)
44
+ ```
45
+
46
+ Run it:
47
+ ```bash
48
+ python example.py
49
+ ```
50
+
51
+ ---
52
+
53
+ ## Usage in Terminal (Interactive Python)
54
+
55
+ ```bash
56
+ python
57
+ ```
58
+
59
+ Then in the Python REPL:
60
+
61
+ ```python
62
+ >>> from bm_preprocessing.IR import all
63
+ >>> print(all)
64
+ # Prints entire IR/all.py source code
65
+
66
+ >>> from bm_preprocessing.DM import apriori
67
+ >>> print(apriori)
68
+ # Prints entire DM/apriori.py source code
69
+
70
+ >>> from bm_preprocessing.DM import hunts, hunts_test
71
+ >>> print(hunts)
72
+ # Prints entire DM/hunts.py source code
73
+ >>> print(hunts_test)
74
+ # Prints entire DM/hunts_test.py source code
75
+
76
+ >>> from bm_preprocessing.DM import id3, id3_test
77
+ >>> print(id3)
78
+ # Prints entire DM/id3.py source code
79
+ >>> print(id3_test)
80
+ # Prints entire DM/id3_test.py source code
81
+ ```
82
+
83
+ ---
84
+
85
+ ## One-liner in Terminal
86
+
87
+ ```bash
88
+ python -c "from bm_preprocessing.IR import all; print(all)"
89
+ python -c "from bm_preprocessing.DM import apriori; print(apriori)"
90
+ python -c "from bm_preprocessing.DM import hash; print(hash)"
91
+ python -c "from bm_preprocessing.DM import hunts; print(hunts)"
92
+ python -c "from bm_preprocessing.DM import hunts_test; print(hunts_test)"
93
+ python -c "from bm_preprocessing.DM import id3; print(id3)"
94
+ python -c "from bm_preprocessing.DM import id3_test; print(id3_test)"
95
+ python -c "from bm_preprocessing.DM import preprocessing; print(preprocessing)"
96
+ ```
97
+
98
+ ---
99
+
100
+ ## Available Modules
101
+
102
+ | Import | Description |
103
+ |--------|-------------|
104
+ | `from bm_preprocessing.IR import all` | Information Retrieval (BM25, TF-IDF, Boolean) |
105
+ | `from bm_preprocessing.DM import all` | Data Mining algorithms |
106
+ | `from bm_preprocessing.DM import apriori` | Apriori algorithm |
107
+ | `from bm_preprocessing.DM import hash` | Hash-based mining |
108
+ | `from bm_preprocessing.DM import hunts` | Hunt's decision tree algorithm |
109
+ | `from bm_preprocessing.DM import hunts_test` | Hunt's decision tree with visualization |
110
+ | `from bm_preprocessing.DM import id3` | ID3 decision tree algorithm |
111
+ | `from bm_preprocessing.DM import id3_test` | ID3 decision tree with visualization |
112
+ | `from bm_preprocessing.DM import preprocessing` | Data preprocessing utilities |
@@ -4,12 +4,15 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "bm-preprocessing"
7
- version = "0.2.1"
7
+ version = "0.4.0"
8
8
  description = "A package to preprocess text data"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
11
11
  dependencies = [
12
12
  "build>=1.2.2.post1",
13
+ "graphviz>=0.20.3",
14
+ "matplotlib>=3.7.5",
15
+ "pandas>=2.0.3",
13
16
  "twine>=6.1.0",
14
17
  ]
15
18
 
@@ -0,0 +1,13 @@
1
+ """DM subpackage - Data Mining source code."""
2
+
3
+ from .all import all
4
+ from .apriori import apriori
5
+ from .hash import hash
6
+ from .hunts import hunts
7
+ from .hunts_test import hunts_test
8
+ from .id3 import id3
9
+ from .id3_test import id3_test
10
+ from .preprocessing import preprocessing
11
+
12
+ __all__ = ["all", "apriori", "hash", "hunts", "hunts_test", "id3", "id3_test", "preprocessing"]
13
+
@@ -5,22 +5,22 @@ from pathlib import Path
5
5
 
6
6
  class SourceCodeModule:
7
7
  """A class that displays source code when printed."""
8
-
8
+
9
9
  def __init__(self, name: str, source_path: Path):
10
10
  self.name = name
11
11
  self._source_path = source_path
12
12
  self._source_code = None
13
-
13
+
14
14
  @property
15
15
  def source_code(self) -> str:
16
16
  """Lazily load source code."""
17
17
  if self._source_code is None:
18
- self._source_code = self._source_path.read_text(encoding='utf-8')
18
+ self._source_code = self._source_path.read_text(encoding="utf-8")
19
19
  return self._source_code
20
-
20
+
21
21
  def __repr__(self) -> str:
22
22
  return self.source_code
23
-
23
+
24
24
  def __str__(self) -> str:
25
25
  return self.source_code
26
26
 
@@ -5,22 +5,22 @@ from pathlib import Path
5
5
 
6
6
  class SourceCodeModule:
7
7
  """A class that displays source code when printed."""
8
-
8
+
9
9
  def __init__(self, name: str, source_path: Path):
10
10
  self.name = name
11
11
  self._source_path = source_path
12
12
  self._source_code = None
13
-
13
+
14
14
  @property
15
15
  def source_code(self) -> str:
16
16
  """Lazily load source code."""
17
17
  if self._source_code is None:
18
- self._source_code = self._source_path.read_text(encoding='utf-8')
18
+ self._source_code = self._source_path.read_text(encoding="utf-8")
19
19
  return self._source_code
20
-
20
+
21
21
  def __repr__(self) -> str:
22
22
  return self.source_code
23
-
23
+
24
24
  def __str__(self) -> str:
25
25
  return self.source_code
26
26
 
@@ -5,22 +5,22 @@ from pathlib import Path
5
5
 
6
6
  class SourceCodeModule:
7
7
  """A class that displays source code when printed."""
8
-
8
+
9
9
  def __init__(self, name: str, source_path: Path):
10
10
  self.name = name
11
11
  self._source_path = source_path
12
12
  self._source_code = None
13
-
13
+
14
14
  @property
15
15
  def source_code(self) -> str:
16
16
  """Lazily load source code."""
17
17
  if self._source_code is None:
18
- self._source_code = self._source_path.read_text(encoding='utf-8')
18
+ self._source_code = self._source_path.read_text(encoding="utf-8")
19
19
  return self._source_code
20
-
20
+
21
21
  def __repr__(self) -> str:
22
22
  return self.source_code
23
-
23
+
24
24
  def __str__(self) -> str:
25
25
  return self.source_code
26
26
 
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/hunts.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class SourceCodeModule:
7
+ """A class that displays source code when printed."""
8
+
9
+ def __init__(self, name: str, source_path: Path):
10
+ self.name = name
11
+ self._source_path = source_path
12
+ self._source_code = None
13
+
14
+ @property
15
+ def source_code(self) -> str:
16
+ """Lazily load source code."""
17
+ if self._source_code is None:
18
+ self._source_code = self._source_path.read_text(encoding="utf-8")
19
+ return self._source_code
20
+
21
+ def __repr__(self) -> str:
22
+ return self.source_code
23
+
24
+ def __str__(self) -> str:
25
+ return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "hunts.py"
30
+ hunts = SourceCodeModule("DM.hunts", _source_file)
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/hunts_test.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class SourceCodeModule:
7
+ """A class that displays source code when printed."""
8
+
9
+ def __init__(self, name: str, source_path: Path):
10
+ self.name = name
11
+ self._source_path = source_path
12
+ self._source_code = None
13
+
14
+ @property
15
+ def source_code(self) -> str:
16
+ """Lazily load source code."""
17
+ if self._source_code is None:
18
+ self._source_code = self._source_path.read_text(encoding="utf-8")
19
+ return self._source_code
20
+
21
+ def __repr__(self) -> str:
22
+ return self.source_code
23
+
24
+ def __str__(self) -> str:
25
+ return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "hunts_test.py"
30
+ hunts_test = SourceCodeModule("DM.hunts_test", _source_file)
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/id3.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class SourceCodeModule:
7
+ """A class that displays source code when printed."""
8
+
9
+ def __init__(self, name: str, source_path: Path):
10
+ self.name = name
11
+ self._source_path = source_path
12
+ self._source_code = None
13
+
14
+ @property
15
+ def source_code(self) -> str:
16
+ """Lazily load source code."""
17
+ if self._source_code is None:
18
+ self._source_code = self._source_path.read_text(encoding="utf-8")
19
+ return self._source_code
20
+
21
+ def __repr__(self) -> str:
22
+ return self.source_code
23
+
24
+ def __str__(self) -> str:
25
+ return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "id3.py"
30
+ id3 = SourceCodeModule("DM.id3", _source_file)
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/id3_test.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class SourceCodeModule:
7
+ """A class that displays source code when printed."""
8
+
9
+ def __init__(self, name: str, source_path: Path):
10
+ self.name = name
11
+ self._source_path = source_path
12
+ self._source_code = None
13
+
14
+ @property
15
+ def source_code(self) -> str:
16
+ """Lazily load source code."""
17
+ if self._source_code is None:
18
+ self._source_code = self._source_path.read_text(encoding="utf-8")
19
+ return self._source_code
20
+
21
+ def __repr__(self) -> str:
22
+ return self.source_code
23
+
24
+ def __str__(self) -> str:
25
+ return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "id3_test.py"
30
+ id3_test = SourceCodeModule("DM.id3_test", _source_file)
@@ -5,22 +5,22 @@ from pathlib import Path
5
5
 
6
6
  class SourceCodeModule:
7
7
  """A class that displays source code when printed."""
8
-
8
+
9
9
  def __init__(self, name: str, source_path: Path):
10
10
  self.name = name
11
11
  self._source_path = source_path
12
12
  self._source_code = None
13
-
13
+
14
14
  @property
15
15
  def source_code(self) -> str:
16
16
  """Lazily load source code."""
17
17
  if self._source_code is None:
18
- self._source_code = self._source_path.read_text(encoding='utf-8')
18
+ self._source_code = self._source_path.read_text(encoding="utf-8")
19
19
  return self._source_code
20
-
20
+
21
21
  def __repr__(self) -> str:
22
22
  return self.source_code
23
-
23
+
24
24
  def __str__(self) -> str:
25
25
  return self.source_code
26
26
 
@@ -1,21 +1,21 @@
1
- from itertools import combinations, chain
2
1
  from collections import defaultdict
2
+ from itertools import chain, combinations
3
3
 
4
4
  min_support = 2
5
5
  min_conf = 0.7
6
6
 
7
7
  transactions = {
8
- "T1": {"I1","I2","I4","I5","I6"},
9
- "T2": {"I2","I4","I6"},
10
- "T3": {"I2","I3"},
11
- "T4": {"I1","I2","I4"},
12
- "T5": {"I1","I2","I3"},
13
- "T6": {"I2","I3"},
14
- "T7": {"I1","I3"},
15
- "T8": {"I1","I2","I3","I5"},
16
- "T9": {"I1","I2","I3"},
17
- "T10": {"I1","I2","I4","I5"},
18
- "T11": {"I5","I6"}
8
+ "T1": {"I1", "I2", "I4", "I5", "I6"},
9
+ "T2": {"I2", "I4", "I6"},
10
+ "T3": {"I2", "I3"},
11
+ "T4": {"I1", "I2", "I4"},
12
+ "T5": {"I1", "I2", "I3"},
13
+ "T6": {"I2", "I3"},
14
+ "T7": {"I1", "I3"},
15
+ "T8": {"I1", "I2", "I3", "I5"},
16
+ "T9": {"I1", "I2", "I3"},
17
+ "T10": {"I1", "I2", "I4", "I5"},
18
+ "T11": {"I5", "I6"},
19
19
  }
20
20
 
21
21
  genL = lambda C: {k: v for k, v in C.items() if v >= min_support}
@@ -85,13 +85,15 @@ for k, v in C.items():
85
85
  "Empty" if not v else "\n".join(f"{set(x)} : {y}" for x, y in v.items()),
86
86
  )
87
87
  for k, v in L.items():
88
- print(f"\nL{k}:\n", "Empty" if not v else "\n".join(f"{set(x)} : {v[x]}" for x in v))
88
+ print(
89
+ f"\nL{k}:\n", "Empty" if not v else "\n".join(f"{set(x)} : {v[x]}" for x in v)
90
+ )
89
91
 
90
92
  print(f"\nAssociation Rules (conf >= {min_conf:.0%}):")
91
93
  for a, c, s, conf in rules:
92
94
  print(f"{set(a)} => {set(c)} | support: {s:.2f}, confidence: {conf:.2f}")
93
95
 
94
- from itertools import combinations, chain
96
+ from itertools import chain, combinations
95
97
 
96
98
  transactions = {
97
99
  "10": {"A", "C", "D"},
@@ -1,11 +1,13 @@
1
1
  from collections import defaultdict
2
2
  from itertools import combinations
3
3
 
4
+
4
5
  def print_table(data, title):
5
6
  print(f"\n--- {title} ---")
6
7
  for itemset, count in data.items():
7
8
  print(f"{itemset}: {count}")
8
9
 
10
+
9
11
  C = {}
10
12
  L = {}
11
13
 
@@ -18,26 +20,26 @@ def generate_candidates(prev_frequent_itemsets, k):
18
20
  union_set = set(itemset1).union(set(itemset2))
19
21
  if len(union_set) == k:
20
22
  candidates.add(tuple(sorted(union_set)))
21
-
23
+
22
24
  return sorted(list(candidates))
23
25
 
24
26
 
25
27
  def count_candidates(candidates, transactions):
26
-
28
+
27
29
  candidate_count = defaultdict(int)
28
-
30
+
29
31
  for candidate in candidates:
30
32
  for transaction in transactions.values():
31
33
  if all(item in transaction for item in candidate):
32
34
  candidate_count[candidate] += 1
33
-
35
+
34
36
  return candidate_count
35
37
 
36
38
 
37
39
  def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):
38
40
 
39
41
  filtered_candidates = {}
40
-
42
+
41
43
  for itemset, count in candidate_count.items():
42
44
 
43
45
  if count >= min_support:
@@ -45,7 +47,9 @@ def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):
45
47
  filtered_candidates[itemset] = count
46
48
  else:
47
49
  subsets = combinations(itemset, len(itemset) - 1)
48
- if all(tuple(sorted(subset)) in prev_freq_itemsets for subset in subsets):
50
+ if all(
51
+ tuple(sorted(subset)) in prev_freq_itemsets for subset in subsets
52
+ ):
49
53
  filtered_candidates[itemset] = count
50
54
 
51
55
  return filtered_candidates
@@ -53,12 +57,14 @@ def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):
53
57
 
54
58
  def apriori(transactions, min_support):
55
59
 
56
- items = sorted(set(item for transaction in transactions.values() for item in transaction))
60
+ items = sorted(
61
+ set(item for transaction in transactions.values() for item in transaction)
62
+ )
57
63
  c1_list = [(item,) for item in items]
58
-
64
+
59
65
  C[1] = count_candidates(c1_list, transactions)
60
66
  L[1] = prune_candidates(C[1], min_support)
61
-
67
+
62
68
  print_table(C[1], "Candidate 1-itemsets (C1)")
63
69
  print_table(L[1], "Frequent 1-itemsets (L1)")
64
70
 
@@ -66,12 +72,12 @@ def apriori(transactions, min_support):
66
72
 
67
73
  while True:
68
74
 
69
- candidates = generate_candidates(L[k-1].keys(), k)
75
+ candidates = generate_candidates(L[k - 1].keys(), k)
70
76
  if not candidates:
71
77
  break
72
78
 
73
79
  C[k] = count_candidates(candidates, transactions)
74
- L[k] = prune_candidates(C[k], min_support, L[k-1].keys())
80
+ L[k] = prune_candidates(C[k], min_support, L[k - 1].keys())
75
81
 
76
82
  if not L[k]:
77
83
  print_table(C[k], f"Candidate {k}-itemsets (C{k})")
@@ -99,7 +105,7 @@ def main():
99
105
  }
100
106
 
101
107
  min_support = 2
102
-
108
+
103
109
  apriori(transactions, min_support)
104
110
 
105
111
 
@@ -0,0 +1,11 @@
1
+ Tid,Home Owner,Marital Status,Annual Income,Default id
2
+ 1,Yes,Single,125K,No
3
+ 2,No,Married,100K,No
4
+ 3,No,Single,70K,No
5
+ 4,Yes,Married,120K,No
6
+ 5,No,Divorced,95K,Yes
7
+ 6,No,Married,60K,No
8
+ 7,Yes,Divorced,220K,No
9
+ 8,No,Single,85K,Yes
10
+ 9,No,Married,75K,No
11
+ 10,No,Single,90K,Yes