PyPI - bm-preprocessing - Versions diffs - 0.2.1__tar.gz → 0.3.0__tar.gz - Mend

bm-preprocessing 0.2.1__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

bm_preprocessing-0.3.0/.gitignore ADDED Viewed

@@ -0,0 +1,24 @@
+# Python
+__pycache__/
+*.py[cod]
+*.egg-info/
+dist/
+build/
+*.egg
+# Virtual environment
+.venv/
+# IDE
+.vscode/
+.idea/
+# Generated images
+*.png
+# OS files
+Thumbs.db
+.DS_Store
+# UV
+uv.lock

{bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/PKG-INFO RENAMED Viewed

@@ -1,9 +1,12 @@
 Metadata-Version: 2.4
 Name: bm-preprocessing
-Version: 0.2.1
+Version: 0.3.0
 Summary: A package to preprocess text data
 Requires-Python: >=3.8
 Requires-Dist: build>=1.2.2.post1
+Requires-Dist: graphviz>=0.20.3
+Requires-Dist: matplotlib>=3.7.5
+Requires-Dist: pandas>=2.0.3
 Requires-Dist: twine>=6.1.0
 Description-Content-Type: text/markdown

bm_preprocessing-0.3.0/USAGE.md ADDED Viewed

@@ -0,0 +1,96 @@
+# bm-preprocessing Usage Guide
+## Installation
+```bash
+pip install bm-preprocessing
+```
+---
+## Usage in Python File
+Create a file `example.py`:
+```python
+# Import modules
+from bm_preprocessing.IR import all
+from bm_preprocessing.DM import apriori, hash, hunts, hunts_test, preprocessing
+# Print the source code
+print("=== IR All Module ===")
+print(all)
+print("\n=== DM Apriori Module ===")
+print(apriori)
+print("\n=== DM Hash Module ===")
+print(hash)
+print("\n=== DM Hunts Module ===")
+print(hunts)
+print("\n=== DM Hunts Test Module ===")
+print(hunts_test)
+print("\n=== DM Preprocessing Module ===")
+print(preprocessing)
+```
+Run it:
+```bash
+python example.py
+```
+---
+## Usage in Terminal (Interactive Python)
+```bash
+python
+```
+Then in the Python REPL:
+```python
+>>> from bm_preprocessing.IR import all
+>>> print(all)
+# Prints entire IR/all.py source code
+>>> from bm_preprocessing.DM import apriori
+>>> print(apriori)
+# Prints entire DM/apriori.py source code
+>>> from bm_preprocessing.DM import hunts, hunts_test
+>>> print(hunts)
+# Prints entire DM/hunts.py source code
+>>> print(hunts_test)
+# Prints entire DM/hunts_test.py source code
+```
+---
+## One-liner in Terminal
+```bash
+python -c "from bm_preprocessing.IR import all; print(all)"
+python -c "from bm_preprocessing.DM import apriori; print(apriori)"
+python -c "from bm_preprocessing.DM import hash; print(hash)"
+python -c "from bm_preprocessing.DM import hunts; print(hunts)"
+python -c "from bm_preprocessing.DM import hunts_test; print(hunts_test)"
+python -c "from bm_preprocessing.DM import preprocessing; print(preprocessing)"
+```
+---
+## Available Modules
+| Import | Description |
+|--------|-------------|
+| `from bm_preprocessing.IR import all` | Information Retrieval (BM25, TF-IDF, Boolean) |
+| `from bm_preprocessing.DM import all` | Data Mining algorithms |
+| `from bm_preprocessing.DM import apriori` | Apriori algorithm |
+| `from bm_preprocessing.DM import hash` | Hash-based mining |
+| `from bm_preprocessing.DM import hunts` | Hunt's decision tree algorithm |
+| `from bm_preprocessing.DM import hunts_test` | Hunt's decision tree with visualization |
+| `from bm_preprocessing.DM import preprocessing` | Data preprocessing utilities |

{bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/pyproject.toml RENAMED Viewed

@@ -4,12 +4,15 @@ build-backend = "hatchling.build"
 [project]
 name = "bm-preprocessing"
-version = "0.2.1"
+version = "0.3.0"
 description = "A package to preprocess text data"
 readme = "README.md"
 requires-python = ">=3.8"
 dependencies = [
     "build>=1.2.2.post1",
+    "graphviz>=0.20.3",
+    "matplotlib>=3.7.5",
+    "pandas>=2.0.3",
     "twine>=6.1.0",
 ]

{bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/src/bm_preprocessing/DM/__init__.py RENAMED Viewed

@@ -3,6 +3,8 @@
 from .all import all
 from .apriori import apriori
 from .hash import hash
+from .hunts import hunts
+from .hunts_test import hunts_test
 from .preprocessing import preprocessing
-__all__ = ["all", "apriori", "hash", "preprocessing"]
+__all__ = ["all", "apriori", "hash", "hunts", "hunts_test", "preprocessing"]

{bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/src/bm_preprocessing/DM/all.py RENAMED Viewed

@@ -5,22 +5,22 @@ from pathlib import Path
 class SourceCodeModule:
     """A class that displays source code when printed."""
     def __init__(self, name: str, source_path: Path):
         self.name = name
         self._source_path = source_path
         self._source_code = None
     @property
     def source_code(self) -> str:
         """Lazily load source code."""
         if self._source_code is None:
-            self._source_code = self._source_path.read_text(encoding='utf-8')
+            self._source_code = self._source_path.read_text(encoding="utf-8")
         return self._source_code
     def __repr__(self) -> str:
         return self.source_code
     def __str__(self) -> str:
         return self.source_code

{bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/src/bm_preprocessing/DM/apriori.py RENAMED Viewed

@@ -5,22 +5,22 @@ from pathlib import Path
 class SourceCodeModule:
     """A class that displays source code when printed."""
     def __init__(self, name: str, source_path: Path):
         self.name = name
         self._source_path = source_path
         self._source_code = None
     @property
     def source_code(self) -> str:
         """Lazily load source code."""
         if self._source_code is None:
-            self._source_code = self._source_path.read_text(encoding='utf-8')
+            self._source_code = self._source_path.read_text(encoding="utf-8")
         return self._source_code
     def __repr__(self) -> str:
         return self.source_code
     def __str__(self) -> str:
         return self.source_code

{bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/src/bm_preprocessing/DM/hash.py RENAMED Viewed

@@ -5,22 +5,22 @@ from pathlib import Path
 class SourceCodeModule:
     """A class that displays source code when printed."""
     def __init__(self, name: str, source_path: Path):
         self.name = name
         self._source_path = source_path
         self._source_code = None
     @property
     def source_code(self) -> str:
         """Lazily load source code."""
         if self._source_code is None:
-            self._source_code = self._source_path.read_text(encoding='utf-8')
+            self._source_code = self._source_path.read_text(encoding="utf-8")
         return self._source_code
     def __repr__(self) -> str:
         return self.source_code
     def __str__(self) -> str:
         return self.source_code

bm_preprocessing-0.3.0/src/bm_preprocessing/DM/hunts.py ADDED Viewed

@@ -0,0 +1,30 @@
+"""Source code loader for DM/hunts.py"""
+from pathlib import Path
+class SourceCodeModule:
+    """A class that displays source code when printed."""
+    def __init__(self, name: str, source_path: Path):
+        self.name = name
+        self._source_path = source_path
+        self._source_code = None
+    @property
+    def source_code(self) -> str:
+        """Lazily load source code."""
+        if self._source_code is None:
+            self._source_code = self._source_path.read_text(encoding="utf-8")
+        return self._source_code
+    def __repr__(self) -> str:
+        return self.source_code
+    def __str__(self) -> str:
+        return self.source_code
+# Get the path to the source file
+_source_file = Path(__file__).parent / "sources" / "hunts.py"
+hunts = SourceCodeModule("DM.hunts", _source_file)

bm_preprocessing-0.3.0/src/bm_preprocessing/DM/hunts_test.py ADDED Viewed

@@ -0,0 +1,30 @@
+"""Source code loader for DM/hunts_test.py"""
+from pathlib import Path
+class SourceCodeModule:
+    """A class that displays source code when printed."""
+    def __init__(self, name: str, source_path: Path):
+        self.name = name
+        self._source_path = source_path
+        self._source_code = None
+    @property
+    def source_code(self) -> str:
+        """Lazily load source code."""
+        if self._source_code is None:
+            self._source_code = self._source_path.read_text(encoding="utf-8")
+        return self._source_code
+    def __repr__(self) -> str:
+        return self.source_code
+    def __str__(self) -> str:
+        return self.source_code
+# Get the path to the source file
+_source_file = Path(__file__).parent / "sources" / "hunts_test.py"
+hunts_test = SourceCodeModule("DM.hunts_test", _source_file)

{bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/src/bm_preprocessing/DM/preprocessing.py RENAMED Viewed

@@ -5,22 +5,22 @@ from pathlib import Path
 class SourceCodeModule:
     """A class that displays source code when printed."""
     def __init__(self, name: str, source_path: Path):
         self.name = name
         self._source_path = source_path
         self._source_code = None
     @property
     def source_code(self) -> str:
         """Lazily load source code."""
         if self._source_code is None:
-            self._source_code = self._source_path.read_text(encoding='utf-8')
+            self._source_code = self._source_path.read_text(encoding="utf-8")
         return self._source_code
     def __repr__(self) -> str:
         return self.source_code
     def __str__(self) -> str:
         return self.source_code

{bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/src/bm_preprocessing/DM/sources/all.py RENAMED Viewed

@@ -1,21 +1,21 @@
-from itertools import combinations, chain
 from collections import defaultdict
+from itertools import chain, combinations
 min_support = 2
 min_conf = 0.7
 transactions = {
-    "T1": {"I1","I2","I4","I5","I6"},
-    "T2": {"I2","I4","I6"},
-    "T3": {"I2","I3"},
-    "T4": {"I1","I2","I4"},
-    "T5": {"I1","I2","I3"},
-    "T6": {"I2","I3"},
-    "T7": {"I1","I3"},
-    "T8": {"I1","I2","I3","I5"},
-    "T9": {"I1","I2","I3"},
-    "T10": {"I1","I2","I4","I5"},
-    "T11": {"I5","I6"}
+    "T1": {"I1", "I2", "I4", "I5", "I6"},
+    "T2": {"I2", "I4", "I6"},
+    "T3": {"I2", "I3"},
+    "T4": {"I1", "I2", "I4"},
+    "T5": {"I1", "I2", "I3"},
+    "T6": {"I2", "I3"},
+    "T7": {"I1", "I3"},
+    "T8": {"I1", "I2", "I3", "I5"},
+    "T9": {"I1", "I2", "I3"},
+    "T10": {"I1", "I2", "I4", "I5"},
+    "T11": {"I5", "I6"},
 }
 genL = lambda C: {k: v for k, v in C.items() if v >= min_support}
@@ -85,13 +85,15 @@ for k, v in C.items():
         "Empty" if not v else "\n".join(f"{set(x)} : {y}" for x, y in v.items()),
     )
 for k, v in L.items():
-    print(f"\nL{k}:\n", "Empty" if not v else "\n".join(f"{set(x)} : {v[x]}" for x in v))
+    print(
+        f"\nL{k}:\n", "Empty" if not v else "\n".join(f"{set(x)} : {v[x]}" for x in v)
+    )
 print(f"\nAssociation Rules (conf >= {min_conf:.0%}):")
 for a, c, s, conf in rules:
     print(f"{set(a)} => {set(c)} | support: {s:.2f}, confidence: {conf:.2f}")
-from itertools import combinations, chain
+from itertools import chain, combinations
 transactions = {
     "10": {"A", "C", "D"},

{bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/src/bm_preprocessing/DM/sources/apriori.py RENAMED Viewed

@@ -1,11 +1,13 @@
 from collections import defaultdict
 from itertools import combinations
 def print_table(data, title):
     print(f"\n--- {title} ---")
     for itemset, count in data.items():
         print(f"{itemset}: {count}")
 C = {}
 L = {}
@@ -18,26 +20,26 @@ def generate_candidates(prev_frequent_itemsets, k):
         union_set = set(itemset1).union(set(itemset2))
         if len(union_set) == k:
             candidates.add(tuple(sorted(union_set)))
     return sorted(list(candidates))
 def count_candidates(candidates, transactions):
     candidate_count = defaultdict(int)
     for candidate in candidates:
         for transaction in transactions.values():
             if all(item in transaction for item in candidate):
                 candidate_count[candidate] += 1
     return candidate_count
 def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):
     filtered_candidates = {}
     for itemset, count in candidate_count.items():
         if count >= min_support:
@@ -45,7 +47,9 @@ def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):
                 filtered_candidates[itemset] = count
             else:
                 subsets = combinations(itemset, len(itemset) - 1)
-                if all(tuple(sorted(subset)) in prev_freq_itemsets for subset in subsets):
+                if all(
+                    tuple(sorted(subset)) in prev_freq_itemsets for subset in subsets
+                ):
                     filtered_candidates[itemset] = count
     return filtered_candidates
@@ -53,12 +57,14 @@ def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):
 def apriori(transactions, min_support):
-    items = sorted(set(item for transaction in transactions.values() for item in transaction))
+    items = sorted(
+        set(item for transaction in transactions.values() for item in transaction)
+    )
     c1_list = [(item,) for item in items]
     C[1] = count_candidates(c1_list, transactions)
     L[1] = prune_candidates(C[1], min_support)
     print_table(C[1], "Candidate 1-itemsets (C1)")
     print_table(L[1], "Frequent 1-itemsets (L1)")
@@ -66,12 +72,12 @@ def apriori(transactions, min_support):
     while True:
-        candidates = generate_candidates(L[k-1].keys(), k)
+        candidates = generate_candidates(L[k - 1].keys(), k)
         if not candidates:
             break
         C[k] = count_candidates(candidates, transactions)
-        L[k] = prune_candidates(C[k], min_support, L[k-1].keys())
+        L[k] = prune_candidates(C[k], min_support, L[k - 1].keys())
         if not L[k]:
             print_table(C[k], f"Candidate {k}-itemsets (C{k})")
@@ -99,7 +105,7 @@ def main():
     }
     min_support = 2
     apriori(transactions, min_support)

bm_preprocessing-0.3.0/src/bm_preprocessing/DM/sources/data.csv ADDED Viewed

@@ -0,0 +1,11 @@
+Tid,Home Owner,Marital Status,Annual Income,Default id
+1,Yes,Single,125K,No
+2,No,Married,100K,No
+3,No,Single,70K,No
+4,Yes,Married,120K,No
+5,No,Divorced,95K,Yes
+6,No,Married,60K,No
+7,Yes,Divorced,220K,No
+8,No,Single,85K,Yes
+9,No,Married,75K,No
+10,No,Single,90K,Yes

{bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/src/bm_preprocessing/DM/sources/hash.py RENAMED Viewed

@@ -1,14 +1,17 @@
 from collections import defaultdict
 from itertools import combinations
 def print_table(data, title):
     print(f"\n--- {title} ---")
     for itemset, count in data.items():
         print(f"{itemset}: {count}")
 C = {}
 L = {}
 class Bucket:
     def __init__(self):
         self.address: int
@@ -24,26 +27,26 @@ def generate_candidates(prev_frequent_itemsets, k):
         union_set = set(itemset1).union(set(itemset2))
         if len(union_set) == k:
             candidates.add(tuple(sorted(union_set)))
     return sorted(list(candidates))
 def count_candidates(candidates, transactions):
     candidate_count = defaultdict(int)
     for candidate in candidates:
         for transaction in transactions.values():
             if all(item in transaction for item in candidate):
                 candidate_count[candidate] += 1
     return candidate_count
 def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):
     filtered_candidates = {}
     for itemset, count in candidate_count.items():
         if count >= min_support:
@@ -51,7 +54,9 @@ def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):
                 filtered_candidates[itemset] = count
             else:
                 subsets = combinations(itemset, len(itemset) - 1)
-                if all(tuple(sorted(subset)) in prev_freq_itemsets for subset in subsets):
+                if all(
+                    tuple(sorted(subset)) in prev_freq_itemsets for subset in subsets
+                ):
                     filtered_candidates[itemset] = count
     return filtered_candidates
@@ -59,12 +64,14 @@ def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):
 def apriori(transactions, min_support):
-    items = sorted(set(item for transaction in transactions.values() for item in transaction))
+    items = sorted(
+        set(item for transaction in transactions.values() for item in transaction)
+    )
     c1_list = [(item,) for item in items]
     C[1] = count_candidates(c1_list, transactions)
     L[1] = prune_candidates(C[1], min_support)
     print_table(C[1], "Candidate 1-itemsets (C1)")
     print_table(L[1], "Frequent 1-itemsets (L1)")
@@ -77,7 +84,9 @@ def apriori(transactions, min_support):
     buckets = [Bucket(addr) for addr in range(7)]
-    items_list = sorted(set(item for transaction in transactions.values() for item in transaction))
+    items_list = sorted(
+        set(item for transaction in transactions.values() for item in transaction)
+    )
     ranks = {item: idx + 1 for idx, item in enumerate(items_list)}
     hash_fn = lambda item1, item2: (ranks[item1] * 10 + ranks[item2]) % 7
@@ -91,25 +100,32 @@ def apriori(transactions, min_support):
     print("\n--- Hash Table Buckets ---")
     for bucket in buckets:
-        print(f"Address: {bucket.address}, Count: {bucket.count}, Itemsets: {bucket.itemsets}")
+        print(
+            f"Address: {bucket.address}, Count: {bucket.count}, Itemsets: {bucket.itemsets}"
+        )
     # Filter
-    L2 = {itemset: bucket.count for bucket in buckets for itemset in bucket.itemsets if bucket.count >= min_support}
+    L2 = {
+        itemset: bucket.count
+        for bucket in buckets
+        for itemset in bucket.itemsets
+        if bucket.count >= min_support
+    }
     print_table(L2, "Frequent 2-itemsets after Hashing (L2)")
-    C["2"] = generate_candidates(L[k-1].keys(), k)
+    C["2"] = generate_candidates(L[k - 1].keys(), k)
     L["2"] = L2
     k = 3
     while True:
-        candidates = generate_candidates(L[k-1].keys(), k)
+        candidates = generate_candidates(L[k - 1].keys(), k)
         if not candidates:
             break
         C[k] = count_candidates(candidates, transactions)
-        L[k] = prune_candidates(C[k], min_support, L[k-1].keys())
+        L[k] = prune_candidates(C[k], min_support, L[k - 1].keys())
         if not L[k]:
             print_table(C[k], f"Candidate {k}-itemsets (C{k})")
@@ -137,7 +153,7 @@ def main():
     }
     min_support = 2
     apriori(transactions, min_support)

bm-preprocessing 0.2.1__tar.gz → 0.3.0__tar.gz

bm-preprocessing 0.2.1__tar.gz → 0.3.0__tar.gz