PyPI - itertoolkit - Versions diffs - 1.5.4__tar.gz → 1.5.9__tar.gz - Mend

itertoolkit 1.5.4__tar.gz → 1.5.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77)

itertoolkit-1.5.9/EVAL_README.md ADDED Viewed

@@ -0,0 +1,214 @@
+# bm-eval-metrics
+bm-eval-metrics is a Python package providing easy-to-use evaluation metrics and utilities for machine learning workflows.
+## Features
+- Text cleaning and normalization
+- Tokenization and stopword removal
+- Lemmatization
+- TF-IDF and Bag-of-Words vectorization
+- Pipeline-based preprocessing
+- Built on NLTK and pandas
+- Scikit-learn style API
+## Installation
+Install from PyPI:
+```bash
+pip install bm-eval-metrics
+```
+## Quick Start
+### Basic Usage With Pipeline
+```python
+from bm_eval_metrics import (
+    TextCleaner,
+    Tokenizer,
+    Normalizer,
+    StopwordFilter,
+    Lemmatizer,
+    Vectorizer,
+    Pipeline,
+)
+# Sample documents
+documents = [
+    "This is an example document! It has punctuation and numbers: 123.",
+    "Natural Language Processing is AMAZING!!!",
+    "Preprocessing text is very important for NLP tasks.",
+]
+# Create preprocessing components
+cleaner = TextCleaner(
+    lowercase=True,
+    remove_punctuation=True,
+    remove_numbers=True,
+    strip_whitespace=True,
+)
+tokenizer = Tokenizer(method="word")
+normalizer = Normalizer(
+    expand_contractions=True,
+    fix_unicode=True,
+)
+stopword_filter = StopwordFilter(language="english")
+lemmatizer = Lemmatizer(method="wordnet")
+vectorizer = Vectorizer(
+    method="tfidf",
+    max_features=5000,
+    ngram_range=(1, 2),
+)
+# Build pipeline
+preprocessing_pipeline = Pipeline(
+    [
+        cleaner,
+        normalizer,
+        tokenizer,
+        stopword_filter,
+        lemmatizer,
+        vectorizer,
+    ]
+)
+# Run preprocessing
+processed_data = preprocessing_pipeline.fit_transform(documents)
+# Inspect output
+print("Processed features shape:", processed_data.shape)
+print("Sample vector:", processed_data[0])
+```
+### Step-by-Step Processing Without Pipeline
+```python
+from bm_eval_metrics import (
+    TextCleaner,
+    Tokenizer,
+    StopwordFilter,
+    Lemmatizer,
+    Vectorizer,
+)
+docs = [
+    "Machine learning is fun!",
+    "Text preprocessing improves results.",
+]
+# Initialize tools
+cleaner = TextCleaner(lowercase=True)
+tokenizer = Tokenizer()
+stopwords = StopwordFilter("english")
+lemmatizer = Lemmatizer()
+vectorizer = Vectorizer(method="bow")
+# Process
+cleaned = [cleaner.clean(d) for d in docs]
+tokens = [tokenizer.tokenize(d) for d in cleaned]
+filtered = [stopwords.remove(t) for t in tokens]
+lemmatized = [lemmatizer.lemmatize(t) for t in filtered]
+vectors = vectorizer.fit_transform(lemmatized)
+print(vectors)
+```
+## Components Overview
+| Component | Description |
+| --- | --- |
+| TextCleaner | Removes noise and formats text |
+| Tokenizer | Splits text into tokens |
+| Normalizer | Standardizes text |
+| StopwordFilter | Removes common filler words |
+| Lemmatizer | Converts words to base form |
+| Vectorizer | Converts text to numeric features |
+| Pipeline | Chains components into a workflow |
+## Deep Learning Preparation Example
+```python
+from bm_eval_metrics import (
+    TextCleaner,
+    Tokenizer,
+    SequencePadder,
+    VocabularyBuilder,
+)
+texts = [
+    "Deep learning for NLP",
+    "Transformers are powerful",
+]
+cleaner = TextCleaner(lowercase=True)
+tokenizer = Tokenizer()
+vocab = VocabularyBuilder(max_size=10000)
+padder = SequencePadder(max_length=50)
+# Clean
+cleaned = [cleaner.clean(t) for t in texts]
+# Tokenize
+tokens = [tokenizer.tokenize(t) for t in cleaned]
+# Build vocabulary
+vocab.fit(tokens)
+# Encode
+encoded = [vocab.encode(t) for t in tokens]
+# Pad
+padded = padder.pad(encoded)
+print(padded)
+```
+## Requirements
+- Python 3.11+
+- nltk
+- pandas
+- scikit-learn
+Install dependencies automatically with:
+```bash
+pip install bm-eval-metrics
+```
+## Project Structure
+```text
+bm-eval-metrics/
+├── cleaning.py
+├── tokenization.py
+├── normalization.py
+├── filtering.py
+├── lemmatization.py
+├── vectorization.py
+├── pipeline.py
+└── __init__.py
+```
+## Contributing
+Contributions are welcome.
+1. Fork the repository.
+2. Create a new branch.
+3. Commit your changes.
+4. Open a pull request.
+## License
+This project is licensed under the MIT License.
+## Support
+If you encounter issues or have feature requests, open an issue on GitHub.

itertoolkit-1.5.9/IMPORTS.md ADDED Viewed

@@ -0,0 +1,48 @@
+# Imports Guide
+```python
+# Top-level section access
+from bm_preprocessing import DM
+from bm_preprocessing import IR
+from bm_preprocessing import PY
+from bm_preprocessing import Finals
+from bm_preprocessing import KALKI
+# Finals exports
+from bm_preprocessing.Finals import kaadhal
+from bm_preprocessing.Finals import raaka
+from bm_preprocessing.Finals import seedan
+from bm_preprocessing.Finals import vikram
+# DM exports
+from bm_preprocessing.DM import agg
+from bm_preprocessing.DM import dbscan
+from bm_preprocessing.DM import finals
+from bm_preprocessing.DM import gsp
+from bm_preprocessing.DM import test
+# IR exports
+from bm_preprocessing.IR import finals
+from bm_preprocessing.IR import pagerank
+from bm_preprocessing.IR import recommenders_pca
+from bm_preprocessing.IR import test
+# PY exports
+from bm_preprocessing.PY import lib_doc
+from bm_preprocessing.PY import python_doc
+# KALKI exports
+from bm_preprocessing.KALKI import collaborative_filtering
+from bm_preprocessing.KALKI import content_based_filtering
+from bm_preprocessing.KALKI import pagerank
+from bm_preprocessing.KALKI import pca
+from bm_preprocessing.KALKI import pca_svd
+from bm_preprocessing.KALKI import svd
+# Importer-layer access
+from bm_preprocessing.importer.DM import agg, dbscan, finals, gsp, test
+from bm_preprocessing.importer.IR import finals, pagerank, recommenders_pca, test
+from bm_preprocessing.importer.PY import lib_doc, python_doc
+from bm_preprocessing.importer.Finals import kaadhal, raaka, seedan, vikram
+from bm_preprocessing.importer.KALKI import collaborative_filtering, content_based_filtering, pagerank, pca, pca_svd, svd
+```

{itertoolkit-1.5.4 → itertoolkit-1.5.9}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: itertoolkit
-Version: 1.5.4
+Version: 1.5.9
 Summary: An itertools-inspired toolkit for cached iterator and data-structure processing
 Requires-Python: >=3.11
 Requires-Dist: gsppy>=5.3.0

itertoolkit-1.5.9/README.md ADDED Viewed

@@ -0,0 +1,214 @@
+# bm-eval-metrics
+bm-eval-metrics is a Python package providing easy-to-use evaluation metrics and utilities for machine learning workflows.
+## Features
+- Text cleaning and normalization
+- Tokenization and stopword removal
+- Lemmatization
+- TF-IDF and Bag-of-Words vectorization
+- Pipeline-based preprocessing
+- Built on NLTK and pandas
+- Scikit-learn style API
+## Installation
+Install from PyPI:
+```bash
+pip install bm-eval-metrics
+```
+## Quick Start
+### Basic Usage With Pipeline
+```python
+from bm_eval_metrics import (
+    TextCleaner,
+    Tokenizer,
+    Normalizer,
+    StopwordFilter,
+    Lemmatizer,
+    Vectorizer,
+    Pipeline,
+)
+# Sample documents
+documents = [
+    "This is an example document! It has punctuation and numbers: 123.",
+    "Natural Language Processing is AMAZING!!!",
+    "Preprocessing text is very important for NLP tasks.",
+]
+# Create preprocessing components
+cleaner = TextCleaner(
+    lowercase=True,
+    remove_punctuation=True,
+    remove_numbers=True,
+    strip_whitespace=True,
+)
+tokenizer = Tokenizer(method="word")
+normalizer = Normalizer(
+    expand_contractions=True,
+    fix_unicode=True,
+)
+stopword_filter = StopwordFilter(language="english")
+lemmatizer = Lemmatizer(method="wordnet")
+vectorizer = Vectorizer(
+    method="tfidf",
+    max_features=5000,
+    ngram_range=(1, 2),
+)
+# Build pipeline
+preprocessing_pipeline = Pipeline(
+    [
+        cleaner,
+        normalizer,
+        tokenizer,
+        stopword_filter,
+        lemmatizer,
+        vectorizer,
+    ]
+)
+# Run preprocessing
+processed_data = preprocessing_pipeline.fit_transform(documents)
+# Inspect output
+print("Processed features shape:", processed_data.shape)
+print("Sample vector:", processed_data[0])
+```
+### Step-by-Step Processing Without Pipeline
+```python
+from bm_eval_metrics import (
+    TextCleaner,
+    Tokenizer,
+    StopwordFilter,
+    Lemmatizer,
+    Vectorizer,
+)
+docs = [
+    "Machine learning is fun!",
+    "Text preprocessing improves results.",
+]
+# Initialize tools
+cleaner = TextCleaner(lowercase=True)
+tokenizer = Tokenizer()
+stopwords = StopwordFilter("english")
+lemmatizer = Lemmatizer()
+vectorizer = Vectorizer(method="bow")
+# Process
+cleaned = [cleaner.clean(d) for d in docs]
+tokens = [tokenizer.tokenize(d) for d in cleaned]
+filtered = [stopwords.remove(t) for t in tokens]
+lemmatized = [lemmatizer.lemmatize(t) for t in filtered]
+vectors = vectorizer.fit_transform(lemmatized)
+print(vectors)
+```
+## Components Overview
+| Component | Description |
+| --- | --- |
+| TextCleaner | Removes noise and formats text |
+| Tokenizer | Splits text into tokens |
+| Normalizer | Standardizes text |
+| StopwordFilter | Removes common filler words |
+| Lemmatizer | Converts words to base form |
+| Vectorizer | Converts text to numeric features |
+| Pipeline | Chains components into a workflow |
+## Deep Learning Preparation Example
+```python
+from bm_eval_metrics import (
+    TextCleaner,
+    Tokenizer,
+    SequencePadder,
+    VocabularyBuilder,
+)
+texts = [
+    "Deep learning for NLP",
+    "Transformers are powerful",
+]
+cleaner = TextCleaner(lowercase=True)
+tokenizer = Tokenizer()
+vocab = VocabularyBuilder(max_size=10000)
+padder = SequencePadder(max_length=50)
+# Clean
+cleaned = [cleaner.clean(t) for t in texts]
+# Tokenize
+tokens = [tokenizer.tokenize(t) for t in cleaned]
+# Build vocabulary
+vocab.fit(tokens)
+# Encode
+encoded = [vocab.encode(t) for t in tokens]
+# Pad
+padded = padder.pad(encoded)
+print(padded)
+```
+## Requirements
+- Python 3.11+
+- nltk
+- pandas
+- scikit-learn
+Install dependencies automatically with:
+```bash
+pip install bm-eval-metrics
+```
+## Project Structure
+```text
+bm-eval-metrics/
+├── cleaning.py
+├── tokenization.py
+├── normalization.py
+├── filtering.py
+├── lemmatization.py
+├── vectorization.py
+├── pipeline.py
+└── __init__.py
+```
+## Contributing
+Contributions are welcome.
+1. Fork the repository.
+2. Create a new branch.
+3. Commit your changes.
+4. Open a pull request.
+## License
+This project is licensed under the MIT License.
+## Support
+If you encounter issues or have feature requests, open an issue on GitHub.

{itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm-eval-metrics.pyproject.toml RENAMED Viewed

@@ -1,10 +1,11 @@
 [project]
 name = "bm-eval-metrics"
-version = "1.5.4"
+version = "1.5.8"
 description = "Python package providing easy-to-use evaluation metrics and utilities for Machine Learning"
-readme = "README.md"
+readme = "EVAL_README.md"
 requires-python = ">=3.11"
 dependencies = [
+    "groq>=1.1.2",
     "gsppy>=5.3.0",
     "matplotlib>=3.10.8",
     "networkx>=3.6.1",

{itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/IR/__init__.py RENAMED Viewed

@@ -2,5 +2,6 @@ from .finals import finals
 from .pagerank import pagerank
 from .recommenders_pca import recommenders_pca
 from .test import test
+from .pagerank_mat import pagerank_mat
-__all__ = ["finals", "test", "pagerank", "recommenders_pca"]
+__all__ = ["finals", "test", "pagerank", "recommenders_pca", "pagerank_mat"]

itertoolkit-1.5.9/bm_preprocessing/importer/IR/pagerank_mat.py ADDED Viewed

@@ -0,0 +1,6 @@
+from pathlib import Path
+from .._module_printer import SourceCodeModule
+_source_file = Path(__file__).parents[2] / "src" / "IR" / "pagerank_mat.py"
+pagerank_mat = SourceCodeModule("bm_preprocessing.IR.pagerank_mat", _source_file)

{itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/KALKI/__init__.py RENAMED Viewed

@@ -4,6 +4,7 @@ from .pagerank import pagerank
 from .pca import pca
 from .pca_svd import pca_svd
 from .svd import svd
+from .pagerank_mat import pagerank_mat
 __all__ = [
     "collaborative_filtering",
@@ -12,4 +13,5 @@ __all__ = [
     "pca",
     "pca_svd",
     "svd",
+    "pagerank_mat",
 ]

itertoolkit-1.5.9/bm_preprocessing/importer/KALKI/pagerank_mat.py ADDED Viewed

@@ -0,0 +1,6 @@
+from pathlib import Path
+from .._module_printer import SourceCodeModule
+_source_file = Path(__file__).parents[2] / "src" / "KALKI" / "pagerank_mat.py"
+pagerank_mat = SourceCodeModule("bm_preprocessing.KALKI.pagerank_mat", _source_file)

itertoolkit-1.5.9/bm_preprocessing/importer/PY/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from .lib_doc import lib_doc
+from .python_doc import python_doc
+from .vis_doc import vis_doc
+__all__ = ["lib_doc", "python_doc", "vis_doc"]

itertoolkit-1.5.9/bm_preprocessing/importer/PY/vis_doc.py ADDED Viewed

@@ -0,0 +1,6 @@
+from pathlib import Path
+from .._module_printer import SourceCodeModule
+_source_file = Path(__file__).parents[2] / "src" / "PY" / "vis_doc.py"
+vis_doc = SourceCodeModule("bm_preprocessing.PY.vis_doc", _source_file)

itertoolkit-1.5.9/bm_preprocessing/src/IR/pagerank_mat.py ADDED Viewed

@@ -0,0 +1,114 @@
+import matplotlib.pyplot as plt
+import networkx as nx
+import numpy as np
+import pandas as pd
+# ===== INPUT =====
+pages = ["A", "B", "C", "D"]
+A = np.array(
+    [
+        [0, 1, 1, 0],  # A -> B,C
+        [0, 0, 1, 0],  # B -> C
+        [1, 0, 0, 0],  # C -> A
+        [0, 0, 1, 0],  # D -> C
+    ],
+    dtype=float,
+)
+d = 0.85
+max_iter = 20
+tol = 1e-8
+n = len(pages)
+# ===== STEP 1: TRANSITION MATRIX (COLUMN STOCHASTIC) =====
+S = np.zeros_like(A)
+for j in range(n):
+    out_degree = A[j].sum()
+    if out_degree > 0:
+        S[j] = A[j] / out_degree
+    else:
+        S[j] = np.ones(n) / n  # dangling node
+S = S.T  # convert to column-stochastic
+print("Transition Matrix S:")
+print(pd.DataFrame(S, index=pages, columns=pages))
+# ===== STEP 2: GOOGLE MATRIX =====
+M = d * S + (1 - d) / n * np.ones((n, n))
+print("\nGoogle Matrix M:")
+print(pd.DataFrame(M, index=pages, columns=pages))
+# ===== STEP 3: INITIAL RANK =====
+r = np.ones(n) / n
+history = [r.copy()]
+print("\nInitial PageRank:")
+print(pd.DataFrame({"Page": pages, "PR": r}))
+# ===== STEP 4: ITERATIONS =====
+for it in range(1, max_iter + 1):
+    r_new = M @ r
+    history.append(r_new.copy())
+    print(f"\nIteration {it}")
+    print(pd.DataFrame({"Page": pages, "PR": r_new.round(6)}))
+    diff = np.linalg.norm(r_new - r, 1)
+    r = r_new
+    if diff < tol:
+        break
+# ===== FINAL RESULT =====
+final_df = pd.DataFrame({"Page": pages, "Final PR": r})
+final_df = final_df.sort_values("Final PR", ascending=False)
+print("\nFinal PageRank:")
+print(final_df.round(6))
+# ===== GRAPH VISUALIZATION (NODE SIZE ∝ PageRank) =====
+G = nx.DiGraph()
+for i, src in enumerate(pages):
+    for j, dst in enumerate(pages):
+        if A[i, j] == 1:
+            G.add_edge(src, dst)
+plt.figure(figsize=(6, 4))
+pos = nx.spring_layout(G, seed=42)
+# --- Min-Max scaling for node sizes ---
+pr_dict = {pages[i]: r[i] for i in range(n)}
+pr_values = np.array([pr_dict[p] for p in G.nodes()])
+min_size, max_size = 500, 5000
+sizes = min_size + (pr_values - pr_values.min()) / (pr_values.max() - pr_values.min()) * (max_size - min_size)
+nx.draw(G, pos, with_labels=True, node_size=sizes, arrows=True)
+plt.title("Graph Visualization (node size ∝ PageRank)")
+plt.show()
+# ===== VISUALIZATION 2: CONVERGENCE =====
+history_arr = np.array(history)
+plt.figure()
+for i, p in enumerate(pages):
+    plt.plot(history_arr[:, i], label=p)
+plt.xlabel("Iteration")
+plt.ylabel("PageRank")
+plt.title("PageRank Convergence")
+plt.legend()
+plt.grid()
+plt.show()
+# ===== VISUALIZATION 3: FINAL SCORES =====
+plt.figure()
+plt.bar(final_df["Page"], final_df["Final PR"])
+plt.xlabel("Page")
+plt.ylabel("PageRank Score")
+plt.title("Final PageRank Ranking")
+plt.grid()
+plt.show()

{itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/KALKI/collaborative_filtering.py RENAMED Viewed

@@ -224,7 +224,7 @@ for i in range(df_iu.shape[0]):
         if i == ti and j == tu:
             ax.add_patch(
                 plt.Rectangle(
-                    (j - 0.5, i - 0.5), 1, 1, fill=True, color="#4fc3f7", zorder=2
+                    (j - 0.5, i - 0.5), 1, 1, fill=True, color="lightblue", zorder=2
                 )
             )
             ax.text(

{itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/KALKI/content_based_filtering.py RENAMED Viewed

@@ -369,12 +369,12 @@ def set_tick_labels(ax, df):
 ax = axes[0, 0]
 ti = list(df_iu.index).index(target_item)
 tu = list(df_iu.columns).index(target_user)
-im = ax.imshow(df_iu.values.astype(float), cmap="YlGn", aspect="auto", vmin=1, vmax=5)
+im = ax.imshow(df_iu.values.astype(float), cmap="viridis", aspect="auto", vmin=1, vmax=5)
 for i in range(df_iu.shape[0]):
     for j in range(df_iu.shape[1]):
         val = df_iu.iloc[i, j]
         if i == ti and j == tu:
-            ax.add_patch(plt.Rectangle((j-.5, i-.5), 1, 1, fill=True, color="#4fc3f7", zorder=2))
+            ax.add_patch(plt.Rectangle((j-.5, i-.5), 1, 1, fill=True, color="lightblue", zorder=2))
             ax.text(j, i, "?", ha="center", va="center", fontsize=9, fontweight="bold", color="navy", zorder=3)
         elif pd.notna(val):
             ax.text(j, i, int(val), ha="center", va="center", fontsize=8)

itertoolkit 1.5.4__tar.gz → 1.5.9__tar.gz

itertoolkit 1.5.4__tar.gz → 1.5.9__tar.gz