itertoolkit 1.5.4__tar.gz → 1.5.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- itertoolkit-1.5.9/EVAL_README.md +214 -0
- itertoolkit-1.5.9/IMPORTS.md +48 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/PKG-INFO +1 -1
- itertoolkit-1.5.9/README.md +214 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm-eval-metrics.pyproject.toml +3 -2
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/IR/__init__.py +2 -1
- itertoolkit-1.5.9/bm_preprocessing/importer/IR/pagerank_mat.py +6 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/KALKI/__init__.py +2 -0
- itertoolkit-1.5.9/bm_preprocessing/importer/KALKI/pagerank_mat.py +6 -0
- itertoolkit-1.5.9/bm_preprocessing/importer/PY/__init__.py +5 -0
- itertoolkit-1.5.9/bm_preprocessing/importer/PY/vis_doc.py +6 -0
- itertoolkit-1.5.9/bm_preprocessing/src/IR/pagerank_mat.py +114 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/KALKI/collaborative_filtering.py +1 -1
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/KALKI/content_based_filtering.py +2 -2
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/KALKI/pagerank.py +3 -3
- itertoolkit-1.5.9/bm_preprocessing/src/KALKI/pagerank_mat.py +164 -0
- itertoolkit-1.5.9/bm_preprocessing/src/PY/vis_doc.py +247 -0
- itertoolkit-1.5.9/bm_preprocessing/vsk.py +66 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/itertoolkit.pyproject.toml +3 -3
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/pyproject.toml +3 -3
- itertoolkit-1.5.4/bm_preprocessing/importer/PY/__init__.py +0 -4
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/.gitignore +0 -0
- /itertoolkit-1.5.4/README.md → /itertoolkit-1.5.9/ITER_README.md +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/__init__.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/__init__.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/DM/__init__.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/DM/agg.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/DM/dbscan.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/DM/finals.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/DM/gsp.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/DM/test.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/Finals/__init__.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/Finals/kaadhal.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/Finals/raaka.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/Finals/seedan.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/Finals/vikram.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/IR/finals.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/IR/pagerank.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/IR/recommenders_pca.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/IR/test.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/KALKI/collaborative_filtering.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/KALKI/content_based_filtering.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/KALKI/pagerank.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/KALKI/pca.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/KALKI/pca_svd.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/KALKI/svd.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/PY/lib_doc.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/PY/python_doc.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/__init__.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/importer/_module_printer.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/DM/__init__.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/DM/agg.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/DM/dbscan.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/DM/finals.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/DM/gsp.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/DM/test.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/Finals/__init__.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/Finals/kaadhal.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/Finals/raaka.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/Finals/seedan.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/Finals/vikram.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/IR/__init__.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/IR/finals.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/IR/pagerank.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/IR/recommenders_pca.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/IR/test.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/KALKI/__init__.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/KALKI/item_features.csv +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/KALKI/pca.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/KALKI/pca_svd.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/KALKI/svd.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/KALKI/user_items.csv +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/PY/__init__.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/PY/lib_doc.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/PY/python_doc.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/__init__.py +0 -0
- {itertoolkit-1.5.4 → itertoolkit-1.5.9}/main.py +0 -0
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
# bm-eval-metrics
|
|
2
|
+
|
|
3
|
+
bm-eval-metrics is a Python package providing easy-to-use evaluation metrics and utilities for machine learning workflows.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- Text cleaning and normalization
|
|
8
|
+
- Tokenization and stopword removal
|
|
9
|
+
- Lemmatization
|
|
10
|
+
- TF-IDF and Bag-of-Words vectorization
|
|
11
|
+
- Pipeline-based preprocessing
|
|
12
|
+
- Built on NLTK and pandas
|
|
13
|
+
- Scikit-learn style API
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
Install from PyPI:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install bm-eval-metrics
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
### Basic Usage With Pipeline
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
from bm_eval_metrics import (
|
|
29
|
+
TextCleaner,
|
|
30
|
+
Tokenizer,
|
|
31
|
+
Normalizer,
|
|
32
|
+
StopwordFilter,
|
|
33
|
+
Lemmatizer,
|
|
34
|
+
Vectorizer,
|
|
35
|
+
Pipeline,
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
# Sample documents
|
|
39
|
+
documents = [
|
|
40
|
+
"This is an example document! It has punctuation and numbers: 123.",
|
|
41
|
+
"Natural Language Processing is AMAZING!!!",
|
|
42
|
+
"Preprocessing text is very important for NLP tasks.",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
# Create preprocessing components
|
|
46
|
+
cleaner = TextCleaner(
|
|
47
|
+
lowercase=True,
|
|
48
|
+
remove_punctuation=True,
|
|
49
|
+
remove_numbers=True,
|
|
50
|
+
strip_whitespace=True,
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
tokenizer = Tokenizer(method="word")
|
|
54
|
+
|
|
55
|
+
normalizer = Normalizer(
|
|
56
|
+
expand_contractions=True,
|
|
57
|
+
fix_unicode=True,
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
stopword_filter = StopwordFilter(language="english")
|
|
61
|
+
lemmatizer = Lemmatizer(method="wordnet")
|
|
62
|
+
|
|
63
|
+
vectorizer = Vectorizer(
|
|
64
|
+
method="tfidf",
|
|
65
|
+
max_features=5000,
|
|
66
|
+
ngram_range=(1, 2),
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
# Build pipeline
|
|
70
|
+
preprocessing_pipeline = Pipeline(
|
|
71
|
+
[
|
|
72
|
+
cleaner,
|
|
73
|
+
normalizer,
|
|
74
|
+
tokenizer,
|
|
75
|
+
stopword_filter,
|
|
76
|
+
lemmatizer,
|
|
77
|
+
vectorizer,
|
|
78
|
+
]
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
# Run preprocessing
|
|
82
|
+
processed_data = preprocessing_pipeline.fit_transform(documents)
|
|
83
|
+
|
|
84
|
+
# Inspect output
|
|
85
|
+
print("Processed features shape:", processed_data.shape)
|
|
86
|
+
print("Sample vector:", processed_data[0])
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Step-by-Step Processing Without Pipeline
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
from bm_eval_metrics import (
|
|
93
|
+
TextCleaner,
|
|
94
|
+
Tokenizer,
|
|
95
|
+
StopwordFilter,
|
|
96
|
+
Lemmatizer,
|
|
97
|
+
Vectorizer,
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
docs = [
|
|
101
|
+
"Machine learning is fun!",
|
|
102
|
+
"Text preprocessing improves results.",
|
|
103
|
+
]
|
|
104
|
+
|
|
105
|
+
# Initialize tools
|
|
106
|
+
cleaner = TextCleaner(lowercase=True)
|
|
107
|
+
tokenizer = Tokenizer()
|
|
108
|
+
stopwords = StopwordFilter("english")
|
|
109
|
+
lemmatizer = Lemmatizer()
|
|
110
|
+
vectorizer = Vectorizer(method="bow")
|
|
111
|
+
|
|
112
|
+
# Process
|
|
113
|
+
cleaned = [cleaner.clean(d) for d in docs]
|
|
114
|
+
tokens = [tokenizer.tokenize(d) for d in cleaned]
|
|
115
|
+
filtered = [stopwords.remove(t) for t in tokens]
|
|
116
|
+
lemmatized = [lemmatizer.lemmatize(t) for t in filtered]
|
|
117
|
+
|
|
118
|
+
vectors = vectorizer.fit_transform(lemmatized)
|
|
119
|
+
print(vectors)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## Components Overview
|
|
123
|
+
|
|
124
|
+
| Component | Description |
|
|
125
|
+
| --- | --- |
|
|
126
|
+
| TextCleaner | Removes noise and formats text |
|
|
127
|
+
| Tokenizer | Splits text into tokens |
|
|
128
|
+
| Normalizer | Standardizes text |
|
|
129
|
+
| StopwordFilter | Removes common filler words |
|
|
130
|
+
| Lemmatizer | Converts words to base form |
|
|
131
|
+
| Vectorizer | Converts text to numeric features |
|
|
132
|
+
| Pipeline | Chains components into a workflow |
|
|
133
|
+
|
|
134
|
+
## Deep Learning Preparation Example
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from bm_eval_metrics import (
|
|
138
|
+
TextCleaner,
|
|
139
|
+
Tokenizer,
|
|
140
|
+
SequencePadder,
|
|
141
|
+
VocabularyBuilder,
|
|
142
|
+
)
|
|
143
|
+
|
|
144
|
+
texts = [
|
|
145
|
+
"Deep learning for NLP",
|
|
146
|
+
"Transformers are powerful",
|
|
147
|
+
]
|
|
148
|
+
|
|
149
|
+
cleaner = TextCleaner(lowercase=True)
|
|
150
|
+
tokenizer = Tokenizer()
|
|
151
|
+
vocab = VocabularyBuilder(max_size=10000)
|
|
152
|
+
padder = SequencePadder(max_length=50)
|
|
153
|
+
|
|
154
|
+
# Clean
|
|
155
|
+
cleaned = [cleaner.clean(t) for t in texts]
|
|
156
|
+
|
|
157
|
+
# Tokenize
|
|
158
|
+
tokens = [tokenizer.tokenize(t) for t in cleaned]
|
|
159
|
+
|
|
160
|
+
# Build vocabulary
|
|
161
|
+
vocab.fit(tokens)
|
|
162
|
+
|
|
163
|
+
# Encode
|
|
164
|
+
encoded = [vocab.encode(t) for t in tokens]
|
|
165
|
+
|
|
166
|
+
# Pad
|
|
167
|
+
padded = padder.pad(encoded)
|
|
168
|
+
|
|
169
|
+
print(padded)
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## Requirements
|
|
173
|
+
|
|
174
|
+
- Python 3.11+
|
|
175
|
+
- nltk
|
|
176
|
+
- pandas
|
|
177
|
+
- scikit-learn
|
|
178
|
+
|
|
179
|
+
Dependencies are installed automatically when you install the package:
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
pip install bm-eval-metrics
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
## Project Structure
|
|
186
|
+
|
|
187
|
+
```text
|
|
188
|
+
bm-eval-metrics/
|
|
189
|
+
├── cleaning.py
|
|
190
|
+
├── tokenization.py
|
|
191
|
+
├── normalization.py
|
|
192
|
+
├── filtering.py
|
|
193
|
+
├── lemmatization.py
|
|
194
|
+
├── vectorization.py
|
|
195
|
+
├── pipeline.py
|
|
196
|
+
└── __init__.py
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
## Contributing
|
|
200
|
+
|
|
201
|
+
Contributions are welcome.
|
|
202
|
+
|
|
203
|
+
1. Fork the repository.
|
|
204
|
+
2. Create a new branch.
|
|
205
|
+
3. Commit your changes.
|
|
206
|
+
4. Open a pull request.
|
|
207
|
+
|
|
208
|
+
## License
|
|
209
|
+
|
|
210
|
+
This project is licensed under the MIT License.
|
|
211
|
+
|
|
212
|
+
## Support
|
|
213
|
+
|
|
214
|
+
If you encounter issues or have feature requests, open an issue on GitHub.
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# Imports Guide
|
|
2
|
+
|
|
3
|
+
```python
|
|
4
|
+
# Top-level section access
|
|
5
|
+
from bm_preprocessing import DM
|
|
6
|
+
from bm_preprocessing import IR
|
|
7
|
+
from bm_preprocessing import PY
|
|
8
|
+
from bm_preprocessing import Finals
|
|
9
|
+
from bm_preprocessing import KALKI
|
|
10
|
+
|
|
11
|
+
# Finals exports
|
|
12
|
+
from bm_preprocessing.Finals import kaadhal
|
|
13
|
+
from bm_preprocessing.Finals import raaka
|
|
14
|
+
from bm_preprocessing.Finals import seedan
|
|
15
|
+
from bm_preprocessing.Finals import vikram
|
|
16
|
+
|
|
17
|
+
# DM exports
|
|
18
|
+
from bm_preprocessing.DM import agg
|
|
19
|
+
from bm_preprocessing.DM import dbscan
|
|
20
|
+
from bm_preprocessing.DM import finals
|
|
21
|
+
from bm_preprocessing.DM import gsp
|
|
22
|
+
from bm_preprocessing.DM import test
|
|
23
|
+
|
|
24
|
+
# IR exports
|
|
25
|
+
from bm_preprocessing.IR import finals
|
|
26
|
+
from bm_preprocessing.IR import pagerank
|
|
27
|
+
from bm_preprocessing.IR import recommenders_pca
|
|
28
|
+
from bm_preprocessing.IR import test
|
|
29
|
+
|
|
30
|
+
# PY exports
|
|
31
|
+
from bm_preprocessing.PY import lib_doc
|
|
32
|
+
from bm_preprocessing.PY import python_doc
|
|
33
|
+
|
|
34
|
+
# KALKI exports
|
|
35
|
+
from bm_preprocessing.KALKI import collaborative_filtering
|
|
36
|
+
from bm_preprocessing.KALKI import content_based_filtering
|
|
37
|
+
from bm_preprocessing.KALKI import pagerank
|
|
38
|
+
from bm_preprocessing.KALKI import pca
|
|
39
|
+
from bm_preprocessing.KALKI import pca_svd
|
|
40
|
+
from bm_preprocessing.KALKI import svd
|
|
41
|
+
|
|
42
|
+
# Importer-layer access
|
|
43
|
+
from bm_preprocessing.importer.DM import agg, dbscan, finals, gsp, test
|
|
44
|
+
from bm_preprocessing.importer.IR import finals, pagerank, pagerank_mat, recommenders_pca, test
|
|
45
|
+
from bm_preprocessing.importer.PY import lib_doc, python_doc
|
|
46
|
+
from bm_preprocessing.importer.Finals import kaadhal, raaka, seedan, vikram
|
|
47
|
+
from bm_preprocessing.importer.KALKI import collaborative_filtering, content_based_filtering, pagerank, pagerank_mat, pca, pca_svd, svd
|
|
48
|
+
```
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
# bm-eval-metrics
|
|
2
|
+
|
|
3
|
+
bm-eval-metrics is a Python package providing easy-to-use evaluation metrics and utilities for machine learning workflows.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- Text cleaning and normalization
|
|
8
|
+
- Tokenization and stopword removal
|
|
9
|
+
- Lemmatization
|
|
10
|
+
- TF-IDF and Bag-of-Words vectorization
|
|
11
|
+
- Pipeline-based preprocessing
|
|
12
|
+
- Built on NLTK and pandas
|
|
13
|
+
- Scikit-learn style API
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
Install from PyPI:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install bm-eval-metrics
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
### Basic Usage With Pipeline
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
from bm_eval_metrics import (
|
|
29
|
+
TextCleaner,
|
|
30
|
+
Tokenizer,
|
|
31
|
+
Normalizer,
|
|
32
|
+
StopwordFilter,
|
|
33
|
+
Lemmatizer,
|
|
34
|
+
Vectorizer,
|
|
35
|
+
Pipeline,
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
# Sample documents
|
|
39
|
+
documents = [
|
|
40
|
+
"This is an example document! It has punctuation and numbers: 123.",
|
|
41
|
+
"Natural Language Processing is AMAZING!!!",
|
|
42
|
+
"Preprocessing text is very important for NLP tasks.",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
# Create preprocessing components
|
|
46
|
+
cleaner = TextCleaner(
|
|
47
|
+
lowercase=True,
|
|
48
|
+
remove_punctuation=True,
|
|
49
|
+
remove_numbers=True,
|
|
50
|
+
strip_whitespace=True,
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
tokenizer = Tokenizer(method="word")
|
|
54
|
+
|
|
55
|
+
normalizer = Normalizer(
|
|
56
|
+
expand_contractions=True,
|
|
57
|
+
fix_unicode=True,
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
stopword_filter = StopwordFilter(language="english")
|
|
61
|
+
lemmatizer = Lemmatizer(method="wordnet")
|
|
62
|
+
|
|
63
|
+
vectorizer = Vectorizer(
|
|
64
|
+
method="tfidf",
|
|
65
|
+
max_features=5000,
|
|
66
|
+
ngram_range=(1, 2),
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
# Build pipeline
|
|
70
|
+
preprocessing_pipeline = Pipeline(
|
|
71
|
+
[
|
|
72
|
+
cleaner,
|
|
73
|
+
normalizer,
|
|
74
|
+
tokenizer,
|
|
75
|
+
stopword_filter,
|
|
76
|
+
lemmatizer,
|
|
77
|
+
vectorizer,
|
|
78
|
+
]
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
# Run preprocessing
|
|
82
|
+
processed_data = preprocessing_pipeline.fit_transform(documents)
|
|
83
|
+
|
|
84
|
+
# Inspect output
|
|
85
|
+
print("Processed features shape:", processed_data.shape)
|
|
86
|
+
print("Sample vector:", processed_data[0])
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Step-by-Step Processing Without Pipeline
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
from bm_eval_metrics import (
|
|
93
|
+
TextCleaner,
|
|
94
|
+
Tokenizer,
|
|
95
|
+
StopwordFilter,
|
|
96
|
+
Lemmatizer,
|
|
97
|
+
Vectorizer,
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
docs = [
|
|
101
|
+
"Machine learning is fun!",
|
|
102
|
+
"Text preprocessing improves results.",
|
|
103
|
+
]
|
|
104
|
+
|
|
105
|
+
# Initialize tools
|
|
106
|
+
cleaner = TextCleaner(lowercase=True)
|
|
107
|
+
tokenizer = Tokenizer()
|
|
108
|
+
stopwords = StopwordFilter("english")
|
|
109
|
+
lemmatizer = Lemmatizer()
|
|
110
|
+
vectorizer = Vectorizer(method="bow")
|
|
111
|
+
|
|
112
|
+
# Process
|
|
113
|
+
cleaned = [cleaner.clean(d) for d in docs]
|
|
114
|
+
tokens = [tokenizer.tokenize(d) for d in cleaned]
|
|
115
|
+
filtered = [stopwords.remove(t) for t in tokens]
|
|
116
|
+
lemmatized = [lemmatizer.lemmatize(t) for t in filtered]
|
|
117
|
+
|
|
118
|
+
vectors = vectorizer.fit_transform(lemmatized)
|
|
119
|
+
print(vectors)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## Components Overview
|
|
123
|
+
|
|
124
|
+
| Component | Description |
|
|
125
|
+
| --- | --- |
|
|
126
|
+
| TextCleaner | Removes noise and formats text |
|
|
127
|
+
| Tokenizer | Splits text into tokens |
|
|
128
|
+
| Normalizer | Standardizes text |
|
|
129
|
+
| StopwordFilter | Removes common filler words |
|
|
130
|
+
| Lemmatizer | Converts words to base form |
|
|
131
|
+
| Vectorizer | Converts text to numeric features |
|
|
132
|
+
| Pipeline | Chains components into a workflow |
|
|
133
|
+
|
|
134
|
+
## Deep Learning Preparation Example
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from bm_eval_metrics import (
|
|
138
|
+
TextCleaner,
|
|
139
|
+
Tokenizer,
|
|
140
|
+
SequencePadder,
|
|
141
|
+
VocabularyBuilder,
|
|
142
|
+
)
|
|
143
|
+
|
|
144
|
+
texts = [
|
|
145
|
+
"Deep learning for NLP",
|
|
146
|
+
"Transformers are powerful",
|
|
147
|
+
]
|
|
148
|
+
|
|
149
|
+
cleaner = TextCleaner(lowercase=True)
|
|
150
|
+
tokenizer = Tokenizer()
|
|
151
|
+
vocab = VocabularyBuilder(max_size=10000)
|
|
152
|
+
padder = SequencePadder(max_length=50)
|
|
153
|
+
|
|
154
|
+
# Clean
|
|
155
|
+
cleaned = [cleaner.clean(t) for t in texts]
|
|
156
|
+
|
|
157
|
+
# Tokenize
|
|
158
|
+
tokens = [tokenizer.tokenize(t) for t in cleaned]
|
|
159
|
+
|
|
160
|
+
# Build vocabulary
|
|
161
|
+
vocab.fit(tokens)
|
|
162
|
+
|
|
163
|
+
# Encode
|
|
164
|
+
encoded = [vocab.encode(t) for t in tokens]
|
|
165
|
+
|
|
166
|
+
# Pad
|
|
167
|
+
padded = padder.pad(encoded)
|
|
168
|
+
|
|
169
|
+
print(padded)
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## Requirements
|
|
173
|
+
|
|
174
|
+
- Python 3.11+
|
|
175
|
+
- nltk
|
|
176
|
+
- pandas
|
|
177
|
+
- scikit-learn
|
|
178
|
+
|
|
179
|
+
Dependencies are installed automatically when you install the package:
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
pip install bm-eval-metrics
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
## Project Structure
|
|
186
|
+
|
|
187
|
+
```text
|
|
188
|
+
bm-eval-metrics/
|
|
189
|
+
├── cleaning.py
|
|
190
|
+
├── tokenization.py
|
|
191
|
+
├── normalization.py
|
|
192
|
+
├── filtering.py
|
|
193
|
+
├── lemmatization.py
|
|
194
|
+
├── vectorization.py
|
|
195
|
+
├── pipeline.py
|
|
196
|
+
└── __init__.py
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
## Contributing
|
|
200
|
+
|
|
201
|
+
Contributions are welcome.
|
|
202
|
+
|
|
203
|
+
1. Fork the repository.
|
|
204
|
+
2. Create a new branch.
|
|
205
|
+
3. Commit your changes.
|
|
206
|
+
4. Open a pull request.
|
|
207
|
+
|
|
208
|
+
## License
|
|
209
|
+
|
|
210
|
+
This project is licensed under the MIT License.
|
|
211
|
+
|
|
212
|
+
## Support
|
|
213
|
+
|
|
214
|
+
If you encounter issues or have feature requests, open an issue on GitHub.
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "bm-eval-metrics"
|
|
3
|
-
version = "1.5.
|
|
3
|
+
version = "1.5.8"
|
|
4
4
|
description = "Python package providing easy-to-use evaluation metrics and utilities for Machine Learning"
|
|
5
|
-
readme = "
|
|
5
|
+
readme = "EVAL_README.md"
|
|
6
6
|
requires-python = ">=3.11"
|
|
7
7
|
dependencies = [
|
|
8
|
+
"groq>=1.1.2",
|
|
8
9
|
"gsppy>=5.3.0",
|
|
9
10
|
"matplotlib>=3.10.8",
|
|
10
11
|
"networkx>=3.6.1",
|
|
@@ -2,5 +2,6 @@ from .finals import finals
|
|
|
2
2
|
from .pagerank import pagerank
|
|
3
3
|
from .recommenders_pca import recommenders_pca
|
|
4
4
|
from .test import test
|
|
5
|
+
from .pagerank_mat import pagerank_mat
|
|
5
6
|
|
|
6
|
-
__all__ = ["finals", "test", "pagerank", "recommenders_pca"]
|
|
7
|
+
__all__ = ["finals", "test", "pagerank", "recommenders_pca", "pagerank_mat"]
|
|
@@ -4,6 +4,7 @@ from .pagerank import pagerank
|
|
|
4
4
|
from .pca import pca
|
|
5
5
|
from .pca_svd import pca_svd
|
|
6
6
|
from .svd import svd
|
|
7
|
+
from .pagerank_mat import pagerank_mat
|
|
7
8
|
|
|
8
9
|
__all__ = [
|
|
9
10
|
"collaborative_filtering",
|
|
@@ -12,4 +13,5 @@ __all__ = [
|
|
|
12
13
|
"pca",
|
|
13
14
|
"pca_svd",
|
|
14
15
|
"svd",
|
|
16
|
+
"pagerank_mat",
|
|
15
17
|
]
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
import matplotlib.pyplot as plt
|
|
2
|
+
import networkx as nx
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
# ===== INPUT =====
|
|
7
|
+
pages = ["A", "B", "C", "D"]
|
|
8
|
+
A = np.array(
|
|
9
|
+
[
|
|
10
|
+
[0, 1, 1, 0], # A -> B,C
|
|
11
|
+
[0, 0, 1, 0], # B -> C
|
|
12
|
+
[1, 0, 0, 0], # C -> A
|
|
13
|
+
[0, 0, 1, 0], # D -> C
|
|
14
|
+
],
|
|
15
|
+
dtype=float,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
d = 0.85
|
|
19
|
+
max_iter = 20
|
|
20
|
+
tol = 1e-8
|
|
21
|
+
n = len(pages)
|
|
22
|
+
|
|
23
|
+
# ===== STEP 1: TRANSITION MATRIX (COLUMN STOCHASTIC) =====
|
|
24
|
+
S = np.zeros_like(A)
|
|
25
|
+
|
|
26
|
+
for j in range(n):
|
|
27
|
+
out_degree = A[j].sum()
|
|
28
|
+
if out_degree > 0:
|
|
29
|
+
S[j] = A[j] / out_degree
|
|
30
|
+
else:
|
|
31
|
+
S[j] = np.ones(n) / n # dangling node
|
|
32
|
+
|
|
33
|
+
S = S.T # convert to column-stochastic
|
|
34
|
+
|
|
35
|
+
print("Transition Matrix S:")
|
|
36
|
+
print(pd.DataFrame(S, index=pages, columns=pages))
|
|
37
|
+
|
|
38
|
+
# ===== STEP 2: GOOGLE MATRIX =====
|
|
39
|
+
M = d * S + (1 - d) / n * np.ones((n, n))
|
|
40
|
+
|
|
41
|
+
print("\nGoogle Matrix M:")
|
|
42
|
+
print(pd.DataFrame(M, index=pages, columns=pages))
|
|
43
|
+
|
|
44
|
+
# ===== STEP 3: INITIAL RANK =====
|
|
45
|
+
r = np.ones(n) / n
|
|
46
|
+
history = [r.copy()]
|
|
47
|
+
|
|
48
|
+
print("\nInitial PageRank:")
|
|
49
|
+
print(pd.DataFrame({"Page": pages, "PR": r}))
|
|
50
|
+
|
|
51
|
+
# ===== STEP 4: ITERATIONS =====
|
|
52
|
+
for it in range(1, max_iter + 1):
|
|
53
|
+
r_new = M @ r
|
|
54
|
+
history.append(r_new.copy())
|
|
55
|
+
|
|
56
|
+
print(f"\nIteration {it}")
|
|
57
|
+
print(pd.DataFrame({"Page": pages, "PR": r_new.round(6)}))
|
|
58
|
+
|
|
59
|
+
diff = np.linalg.norm(r_new - r, 1)
|
|
60
|
+
r = r_new
|
|
61
|
+
|
|
62
|
+
if diff < tol:
|
|
63
|
+
break
|
|
64
|
+
|
|
65
|
+
# ===== FINAL RESULT =====
|
|
66
|
+
final_df = pd.DataFrame({"Page": pages, "Final PR": r})
|
|
67
|
+
final_df = final_df.sort_values("Final PR", ascending=False)
|
|
68
|
+
|
|
69
|
+
print("\nFinal PageRank:")
|
|
70
|
+
print(final_df.round(6))
|
|
71
|
+
|
|
72
|
+
# ===== VISUALIZATION 1: GRAPH (NODE SIZE ∝ PageRank) =====
|
|
73
|
+
G = nx.DiGraph()
|
|
74
|
+
for i, src in enumerate(pages):
|
|
75
|
+
for j, dst in enumerate(pages):
|
|
76
|
+
if A[i, j] == 1:
|
|
77
|
+
G.add_edge(src, dst)
|
|
78
|
+
|
|
79
|
+
plt.figure(figsize=(6, 4))
|
|
80
|
+
pos = nx.spring_layout(G, seed=42)
|
|
81
|
+
|
|
82
|
+
# --- Min-Max scaling for node sizes ---
|
|
83
|
+
pr_dict = {pages[i]: r[i] for i in range(n)}
|
|
84
|
+
pr_values = np.array([pr_dict[p] for p in G.nodes()])
|
|
85
|
+
|
|
86
|
+
min_size, max_size = 500, 5000
|
|
87
|
+
sizes = min_size + (pr_values - pr_values.min()) / (pr_values.max() - pr_values.min()) * (max_size - min_size)
|
|
88
|
+
|
|
89
|
+
nx.draw(G, pos, with_labels=True, node_size=sizes, arrows=True)
|
|
90
|
+
plt.title("Graph Visualization (node size ∝ PageRank)")
|
|
91
|
+
plt.show()
|
|
92
|
+
|
|
93
|
+
# ===== VISUALIZATION 2: CONVERGENCE =====
|
|
94
|
+
history_arr = np.array(history)
|
|
95
|
+
|
|
96
|
+
plt.figure()
|
|
97
|
+
for i, p in enumerate(pages):
|
|
98
|
+
plt.plot(history_arr[:, i], label=p)
|
|
99
|
+
|
|
100
|
+
plt.xlabel("Iteration")
|
|
101
|
+
plt.ylabel("PageRank")
|
|
102
|
+
plt.title("PageRank Convergence")
|
|
103
|
+
plt.legend()
|
|
104
|
+
plt.grid()
|
|
105
|
+
plt.show()
|
|
106
|
+
|
|
107
|
+
# ===== VISUALIZATION 3: FINAL SCORES =====
|
|
108
|
+
plt.figure()
|
|
109
|
+
plt.bar(final_df["Page"], final_df["Final PR"])
|
|
110
|
+
plt.xlabel("Page")
|
|
111
|
+
plt.ylabel("PageRank Score")
|
|
112
|
+
plt.title("Final PageRank Ranking")
|
|
113
|
+
plt.grid()
|
|
114
|
+
plt.show()
|
{itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/KALKI/collaborative_filtering.py
RENAMED
|
@@ -224,7 +224,7 @@ for i in range(df_iu.shape[0]):
|
|
|
224
224
|
if i == ti and j == tu:
|
|
225
225
|
ax.add_patch(
|
|
226
226
|
plt.Rectangle(
|
|
227
|
-
(j - 0.5, i - 0.5), 1, 1, fill=True, color="
|
|
227
|
+
(j - 0.5, i - 0.5), 1, 1, fill=True, color="lightblue", zorder=2
|
|
228
228
|
)
|
|
229
229
|
)
|
|
230
230
|
ax.text(
|
{itertoolkit-1.5.4 → itertoolkit-1.5.9}/bm_preprocessing/src/KALKI/content_based_filtering.py
RENAMED
|
@@ -369,12 +369,12 @@ def set_tick_labels(ax, df):
|
|
|
369
369
|
ax = axes[0, 0]
|
|
370
370
|
ti = list(df_iu.index).index(target_item)
|
|
371
371
|
tu = list(df_iu.columns).index(target_user)
|
|
372
|
-
im = ax.imshow(df_iu.values.astype(float), cmap="
|
|
372
|
+
im = ax.imshow(df_iu.values.astype(float), cmap="viridis", aspect="auto", vmin=1, vmax=5)
|
|
373
373
|
for i in range(df_iu.shape[0]):
|
|
374
374
|
for j in range(df_iu.shape[1]):
|
|
375
375
|
val = df_iu.iloc[i, j]
|
|
376
376
|
if i == ti and j == tu:
|
|
377
|
-
ax.add_patch(plt.Rectangle((j-.5, i-.5), 1, 1, fill=True, color="
|
|
377
|
+
ax.add_patch(plt.Rectangle((j-.5, i-.5), 1, 1, fill=True, color="lightblue", zorder=2))
|
|
378
378
|
ax.text(j, i, "?", ha="center", va="center", fontsize=9, fontweight="bold", color="navy", zorder=3)
|
|
379
379
|
elif pd.notna(val):
|
|
380
380
|
ax.text(j, i, int(val), ha="center", va="center", fontsize=8)
|