dataset-complexity-profiler 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
"""Package dataset_complexity_profiler.

Provides the main `DatasetProfiler` class for analyzing text datasets
and recommending an optimal embedding dimensionality.
"""

# Re-export the profiler so callers can write
# `from dataset_complexity_profiler import DatasetProfiler`.
from .dataset_adapter import DatasetProfiler

__all__ = ["DatasetProfiler"]
@@ -0,0 +1,662 @@
1
+ import warnings
2
+ import signal
3
+ import joblib
4
+ from pathlib import Path
5
+ from typing import Dict, List, Optional, Tuple
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ from pymfe.mfe import MFE
10
+ from sklearn.decomposition import PCA
11
+ from sklearn.ensemble import RandomForestRegressor
12
+ from sklearn.impute import SimpleImputer
13
+ from sklearn.linear_model import LogisticRegression
14
+ from sklearn.model_selection import train_test_split
15
+ from sklearn.pipeline import Pipeline
16
+ from sklearn.preprocessing import StandardScaler
17
+
18
# NOTE(review): this installs an "ignore" filter for every warning category at
# import time, so it affects the whole process, not only this module.
warnings.filterwarnings("ignore")

# Default value of the `timeout_seconds` constructor argument of DatasetProfiler;
# that value is later passed to `signal.alarm` around meta-feature extraction.
_DEFAULT_TIMEOUT_SECONDS = 20
21
+
22
+
23
class TimeoutException(Exception):
    """Raised when meta-feature extraction takes too long."""
26
+
27
+
28
def _timeout_handler(signum, frame):
    """Timeout handler for meta-feature extraction.

    Both arguments are accepted only to satisfy the signal-handler calling
    convention; neither is used.

    Raises:
    - TimeoutException: always.
    """
    message = "Meta-feature extraction timed out"
    raise TimeoutException(message)
31
+
32
+
33
+ class DatasetProfiler:
34
+ """Инструмент для анализа текстовых датасетов и рекомендации оптимального размера эмбеддингов.
35
+
36
+ Основные возможности:
37
+ - Быстро проанализировать датасет и получить рекомендацию по размерности
38
+ - Использовать встроенную мета-модель или обучить свою на ваших данных
39
+ - Сжимать эмбеддинги с помощью PCA при необходимости
40
+ - Получать рекомендации по архитектуре нейросети
41
+
42
+ Примеры использования:
43
+ 1. Быстрый старт с встроенной моделью: `profiler = DatasetProfiler()`
44
+ 2. Анализ датасета: `report = profiler.analyze_text_dataset(texts, labels, "MyDataset")`
45
+ 3. Полный конвейер: `X_compressed = profiler.fit_transform(texts, labels)`
46
+ 4. Дообучение на своих данных: `profiler.train_custom_meta_model(my_datasets)`
47
+ """
48
+
49
def __init__(
    self,
    timeout_seconds: int = _DEFAULT_TIMEOUT_SECONDS,
    auto_load_meta_model: bool = True,
    meta_model_path: Optional[str] = None,
):
    """Create a profiler with a fresh (unfitted) meta-model pipeline.

    Parameters:
    - timeout_seconds: value passed to `signal.alarm` around meta-feature extraction
    - auto_load_meta_model: if True, try to load a saved meta-model right away
    - meta_model_path: explicit model file; if None, the `meta_model.pkl` next to
      this module is used when it exists

    Loading is best-effort: a failure never aborts construction, the profiler
    simply stays unfitted (`is_fitted` is False). The failure is logged instead
    of being silently discarded, so a wrong path or an incompatible pickle is
    visible to the user.
    """
    # Local import: the module header does not import logging.
    import logging

    self.imputer = SimpleImputer(strategy="mean")
    self.meta_model = Pipeline(
        [
            ("scaler", StandardScaler()),
            (
                "regressor",
                RandomForestRegressor(n_estimators=100, max_depth=6, random_state=42),
            ),
        ]
    )
    self.is_fitted = False
    self.timeout_seconds = timeout_seconds

    if auto_load_meta_model:
        try:
            self.load_default_meta_model(meta_model_path)
        except Exception as exc:
            # Keep the original best-effort contract (no exception escapes),
            # but leave a trace of why no model was loaded.
            logging.getLogger(__name__).warning(
                "Could not load meta-model (%s); continuing with an unfitted model: %s",
                meta_model_path or "packaged default",
                exc,
            )
73
+
74
+ def _prepare_data(self, X, y) -> Tuple[np.ndarray, np.ndarray]:
75
+ """Подготавливает данные для анализа: чистит NaN, преобразует типы.
76
+
77
+ Возвращает:
78
+ - Кортеж (очищенная матрица признаков, целые метки)
79
+ """
80
+ X_arr = np.array(X, dtype=float)
81
+ y_arr = np.array(y)
82
+
83
+ if X_arr.ndim == 1:
84
+ X_arr = X_arr.reshape(-1, 1)
85
+
86
+ X_clean = self.imputer.fit_transform(X_arr)
87
+
88
+ if y_arr.ndim != 1:
89
+ y_arr = y_arr.ravel()
90
+
91
+ if y_arr.dtype.kind not in "biufc":
92
+ y_arr = pd.factorize(y_arr)[0]
93
+
94
+ return X_clean, y_arr
95
+
96
def extract_meta_features(self, X, y, return_feature_names: bool = False):
    """Extract dataset meta-features with PyMFE.

    Parameters:
    - X: feature matrix
    - y: class labels
    - return_feature_names: if True, also return the feature names

    Returns:
    - Array of meta-features, or a tuple (names, features)

    Raises:
    - RuntimeError: if extraction times out, fails, or yields no features

    The SIGALRM-based timeout is armed only when it can work: the platform must
    provide SIGALRM and the call must come from the main thread, because
    `signal.signal` raises ValueError elsewhere. In other cases extraction runs
    without a time limit. The handler that was installed before the call is
    restored afterwards.
    """
    # Local import: the module header does not import threading.
    import threading

    X_clean, y_arr = self._prepare_data(X, y)

    use_alarm = (
        hasattr(signal, "SIGALRM")
        and threading.current_thread() is threading.main_thread()
    )
    previous_handler = None
    if use_alarm:
        previous_handler = signal.signal(signal.SIGALRM, _timeout_handler)
        signal.alarm(self.timeout_seconds)

    try:
        mfe = MFE(groups=["general", "statistical", "info-theory", "complexity"])
        mfe.fit(X_clean, y_arr)
        feature_names, ft_vals = mfe.extract()
        # Non-finite meta-feature values are replaced by 0.0.
        features = np.nan_to_num(ft_vals, nan=0.0, posinf=0.0, neginf=0.0)
        if features.size == 0:
            raise RuntimeError("PyMFE returned an empty feature vector")
        if return_feature_names:
            return list(feature_names), features
        return features
    except TimeoutException as exc:
        raise RuntimeError("Meta-feature extraction timed out") from exc
    except Exception as exc:
        raise RuntimeError("Meta-feature extraction failed") from exc
    finally:
        if use_alarm:
            signal.alarm(0)
            # signal.signal() returns None when the previous handler was not
            # installed from Python; such a handler cannot be re-installed.
            if previous_handler is not None:
                signal.signal(signal.SIGALRM, previous_handler)
130
+
131
def build_meta_feature_vector(self, X, y) -> np.ndarray:
    """Build the full meta-feature vector that the meta-model consumes.

    The vector starts with five summary values (sample count, feature count,
    class count, baseline classification quality, PCA intrinsic dimension at
    95% variance) followed by the PyMFE meta-features. Non-finite entries are
    replaced by 0.0.
    """
    X_clean, y_arr = self._prepare_data(X, y)

    n_classes = int(len(np.unique(y_arr)))
    # With a single class the baseline classifier cannot be evaluated
    # (stratify / solver requirements), so use a conservative 0.0.
    if n_classes >= 2:
        baseline_quality = float(self._evaluate_quality(X_clean, y_arr))
    else:
        baseline_quality = 0.0

    intrinsic_dim = float(self._pca_intrinsic_dim(X_clean, target_variance=0.95))
    summary_values = [
        float(X_clean.shape[0]),
        float(X_clean.shape[1]),
        float(n_classes),
        baseline_quality,
        intrinsic_dim,
    ]
    dataset_summary = np.array(summary_values, dtype=float)

    pymfe_features = self.extract_meta_features(X_clean, y_arr)
    combined = np.concatenate([dataset_summary, pymfe_features])
    return np.nan_to_num(combined, nan=0.0, posinf=0.0, neginf=0.0)
159
+
160
+ def fit_meta_model(self, meta_X: np.ndarray, meta_y: np.ndarray) -> None:
161
+ """Обучает мета-модель на уже подготовленных матрицах признаков и целевых значений.
162
+
163
+ Параметры:
164
+ - meta_X: матрица мета-признаков (n_datasets, n_features)
165
+ - meta_y: целевые размерности (n_datasets,)
166
+ """
167
+ self.meta_model.fit(meta_X, meta_y)
168
+ self.is_fitted = True
169
+
170
+ def load_meta_model(self, model_path: str) -> None:
171
+ """Загружает сохранённую мета-модель с диска."""
172
+ path = Path(model_path)
173
+ if not path.is_file():
174
+ raise FileNotFoundError(f"Meta-model file not found: {model_path}")
175
+ self.meta_model = joblib.load(path)
176
+ self.is_fitted = True
177
+
178
def save_meta_model(self, model_path: str) -> None:
    """Write the current meta-model to `model_path` using joblib."""
    destination = Path(model_path)
    joblib.dump(self.meta_model, destination)
182
+
183
+ def load_default_meta_model(self, model_path: Optional[str] = None) -> None:
184
+ """Загружает встроенную мета-модель, которая идёт в пакете. Если файл не найден, тихо игнорирует ошибку."""
185
+ if model_path is not None:
186
+ self.load_meta_model(model_path)
187
+ return
188
+
189
+ default_path = Path(__file__).resolve().parent / "meta_model.pkl"
190
+ if default_path.is_file():
191
+ self.load_meta_model(str(default_path))
192
+
193
def train_custom_meta_model(
    self,
    datasets: List[Dict],
    quality_threshold: float = 0.95,
    embedder_name: str = "all-MiniLM-L6-v2",
    batch_size: int = 64,
    show_progress: bool = True,
    cv: Optional[int] = None,
) -> None:
    """Train the meta-model on your own datasets to adapt it to a specific domain.

    Each dataset in the list is a dict with the keys `texts` (list of strings)
    and `labels` (list of labels). For every dataset the method:
    1. Embeds the texts
    2. Estimates the target dimension for the dataset
    3. Extracts the meta-feature vector
    and finally fits the meta-model on the collected vectors and targets.

    Parameters:
    - datasets: list of dicts {"texts": [...], "labels": [...]}; entries missing
      either key are skipped
    - quality_threshold: fraction of the baseline quality to reach (default 0.95)
    - embedder_name: embedding model name (default all-MiniLM-L6-v2)
    - batch_size: batch size used while embedding
    - show_progress: whether to show embedding progress
    - cv: upper bound on the number of cross-validation folds; when None, 3 is
      used. The fold count is additionally capped by the number of datasets, and
      cross-validation is skipped when fewer than 2 folds remain.

    Raises:
    - ValueError: if `datasets` is empty, or fewer than two datasets produced
      a feature vector
    """
    feature_list = []
    target_list = []

    total = len(datasets)
    if total == 0:
        raise ValueError("No datasets provided for training")

    for idx, ds in enumerate(datasets, start=1):
        print(f"Processing dataset {idx}/{total} for meta-model training...")
        texts = ds.get("texts")
        labels = ds.get("labels")
        if texts is None or labels is None:
            print(" Skipping dataset: missing 'texts' or 'labels'")
            continue

        # Embed
        print(f" Embedding {len(texts)} texts...")
        X = self.embed_texts(texts, embedder_name=embedder_name, batch_size=batch_size, show_progress=show_progress)

        # Estimate empirical target dimension. If labels lack class diversity, use PCA-based intrinsic dim.
        print(" Estimating intrinsic / recommended dimension via empirical search...")
        if len(np.unique(labels)) < 2:
            recommended = int(self._pca_intrinsic_dim(X, target_variance=0.95))
            print(" -> Single-class labels detected; using PCA intrinsic-dim fallback")
        else:
            info = self.estimate_intrinsic_dim(X, labels, quality_threshold=quality_threshold)
            recommended = int(info.get("recommended_dim", 2))
            print(f" -> Recommended dim (empirical): {recommended}")

        # Build meta-feature vector
        print(" Building meta-feature vector...")
        feats = self.build_meta_feature_vector(X, labels)
        if feats.ndim != 1:
            feats = np.ravel(feats)

        feature_list.append(feats)
        target_list.append(float(recommended))

    if len(feature_list) < 2:
        raise ValueError("Need at least two datasets with features to train meta-model")

    # One row per dataset; np.vstack requires every feature vector to have the same length.
    X_meta = np.vstack(feature_list)
    y_meta = np.array(target_list, dtype=float)

    print(f"Training meta-model on {X_meta.shape[0]} datasets with {X_meta.shape[1]} features each...")

    # Optionally evaluate via cross-validation
    if cv is None:
        cv_folds = min(3, X_meta.shape[0])
    else:
        cv_folds = min(cv, X_meta.shape[0])

    if cv_folds >= 2:
        try:
            from sklearn.model_selection import cross_val_score

            print(f" Running cross-validation (cv={cv_folds})...")
            scores = cross_val_score(self.meta_model, X_meta, y_meta, cv=cv_folds, scoring="r2")
            print(f" Cross-validation R²: {scores}")
        except Exception:
            # Cross-validation is informational only; a failure must not block the final fit.
            print(" Cross-validation failed or not available; continuing to fit")

    # Fit the meta-model
    self.meta_model.fit(X_meta, y_meta)
    self.is_fitted = True

    # Print feature importance if available
    try:
        feat_imp = self.meta_model.named_steps["regressor"].feature_importances_
        print("Meta-model trained. Feature importances (first 10):", feat_imp[:10])
    except Exception:
        print("Meta-model trained.")
291
+
292
+ def predict_embedding_dim(self, X, y, min_dim: int = 2, max_dim: Optional[int] = None) -> int:
293
+ """Предсказывает оптимальный размер эмбеддинга для датасета с помощью мета-модели.
294
+
295
+ Параметры:
296
+ - X: матрица эмбеддингов или признаков
297
+ - y: метки классов
298
+ - min_dim: минимальная допустимая размерность
299
+ - max_dim: максимальная допустимая размерность
300
+
301
+ Возвращает:
302
+ - Рекомендуемый размер эмбеддинга
303
+ """
304
+ if not self.is_fitted:
305
+ raise ValueError("Meta model is not fitted yet. Call load_meta_model() first.")
306
+
307
+ X_clean, y_arr = self._prepare_data(X, y)
308
+ max_dim = int(max_dim or X_clean.shape[1])
309
+ features = self.build_meta_feature_vector(X_clean, y_arr).reshape(1, -1)
310
+ predicted = int(round(self.meta_model.predict(features)[0]))
311
+ return max(min_dim, min(predicted, max_dim))
312
+
313
def _build_quality_curve(
    self,
    X: np.ndarray,
    y: np.ndarray,
    dims: List[int],
    baseline_score: float,
    threshold: float,
) -> Tuple[List[Dict[str, float]], int, float]:
    """Build the curve of classification quality versus embedding size.

    The data is split 70/30 (stratified when possible). For every candidate
    dimension a PCA is fitted on the training part and a LogisticRegression is
    scored on the projected test part.

    Returns:
    - Tuple (curve, recommended_dim, best_score), where curve is a list of
      {"dim", "quality"} entries, recommended_dim is the first candidate whose
      quality reaches baseline_score * threshold (the best-scoring candidate
      if none does), and best_score is the highest quality seen
    """
    split_options = {"test_size": 0.3, "random_state": 42}
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y, **split_options
        )
    except ValueError:
        # Stratification is impossible for this label distribution; split plainly.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=None, **split_options
        )

    classifier = LogisticRegression(max_iter=2000, solver="lbfgs")
    quality_curve = []
    top_dim = dims[0]
    top_score = 0.0

    for candidate in dims:
        # PCA cannot produce more components than training samples or features.
        reducer = PCA(n_components=min(candidate, X_train.shape[0], X_train.shape[1]))
        train_projected = reducer.fit_transform(X_train)
        test_projected = reducer.transform(X_test)
        classifier.fit(train_projected, y_train)
        accuracy = classifier.score(test_projected, y_test)
        quality_curve.append({"dim": candidate, "quality": float(accuracy)})

        if accuracy > top_score:
            top_score, top_dim = accuracy, candidate

    cutoff = baseline_score * threshold
    recommended = top_dim
    for point in quality_curve:
        if point["quality"] >= cutoff:
            recommended = point["dim"]
            break
    return quality_curve, recommended, top_score
357
+
358
def estimate_intrinsic_dim(
    self,
    X,
    y,
    quality_threshold: float = 0.95,
    max_dim: Optional[int] = None,
) -> Dict[str, object]:
    """Estimate the dataset's intrinsic dimension and search for an embedding size.

    Parameters:
    - X: embedding matrix
    - y: class labels
    - quality_threshold: fraction of the baseline quality to reach (0.95 = 95%)
    - max_dim: largest dimension to search; a falsy value means the original
      dimension

    Returns:
    - Dict with the original dimension, the PCA intrinsic-dimension estimate,
      the recommended dimension, baseline/best quality, thresholds, and the
      quality curve

    The candidate grid never contains a value above the effective maximum
    (min of max_dim, the original dimension and n_samples - 1) and is never
    empty: every dimension up to 25 is tried, and larger ranges add 12 evenly
    spaced values between 25 and the maximum.
    """
    X_clean, y_arr = self._prepare_data(X, y)
    original_dim = X_clean.shape[1]
    effective_max_dim = int(min(max_dim or original_dim, original_dim, X_clean.shape[0] - 1))
    # Keep at least one candidate so the quality curve is never built on an empty grid.
    effective_max_dim = max(1, effective_max_dim)

    baseline_score = self._evaluate_quality(X_clean, y_arr)

    if effective_max_dim <= 25:
        # Dense grid; starts at 1 only when 2 dimensions are not available.
        dims = list(range(min(2, effective_max_dim), effective_max_dim + 1))
    else:
        # The coarse part is added only when the maximum exceeds 25; otherwise
        # linspace(25, maximum) would yield values above the maximum.
        dims = np.unique(
            np.concatenate(
                [
                    np.arange(2, 26, dtype=int),
                    np.linspace(25, effective_max_dim, num=12, dtype=int),
                ]
            )
        ).tolist()

    curve, recommended_dim, best_score = self._build_quality_curve(
        X_clean, y_arr, dims, baseline_score, quality_threshold
    )

    explained_dim = self._pca_intrinsic_dim(X_clean, target_variance=0.95)

    return {
        "original_dim": original_dim,
        "intrinsic_dim_estimate": int(explained_dim),
        "recommended_dim": int(recommended_dim),
        "recommended_threshold": float(round(baseline_score * quality_threshold, 4)),
        "baseline_quality": float(round(baseline_score, 4)),
        "best_dim_quality": float(round(best_score, 4)),
        "quality_threshold": float(quality_threshold),
        "quality_curve": curve,
    }
410
+
411
+ def _evaluate_quality(self, X: np.ndarray, y: np.ndarray) -> float:
412
+ """Оценивает качество классификации на полном наборе признаков (базовая метрика).
413
+
414
+ Возвращает:
415
+ - Точность LogisticRegression на тестовом наборе
416
+ """
417
+ if X.shape[1] == 0:
418
+ return 0.0
419
+
420
+ estimator = LogisticRegression(max_iter=2000, solver="lbfgs")
421
+ try:
422
+ X_train, X_test, y_train, y_test = train_test_split(
423
+ X, y, test_size=0.3, random_state=42, stratify=y
424
+ )
425
+ except ValueError:
426
+ X_train, X_test, y_train, y_test = train_test_split(
427
+ X, y, test_size=0.3, random_state=42, stratify=None
428
+ )
429
+ estimator.fit(X_train, y_train)
430
+ return float(estimator.score(X_test, y_test))
431
+
432
+ def _pca_intrinsic_dim(self, X: np.ndarray, target_variance: float = 0.95) -> int:
433
+ """Оценивает внутреннюю размерность датасета через PCA.
434
+
435
+ Вычисляет количество главных компонент, нужных для объяснения целевого процента дисперсии.
436
+ """
437
+ max_components = min(X.shape[1], X.shape[0] - 1)
438
+ if max_components < 1:
439
+ return 1
440
+ pca = PCA(n_components=max_components)
441
+ pca.fit(X)
442
+ cumulative = np.cumsum(pca.explained_variance_ratio_)
443
+ idx = int(np.searchsorted(cumulative, target_variance, side="left") + 1)
444
+ return max(1, min(idx, X.shape[1]))
445
+
446
def recommend_architecture(
    self,
    original_dim: int,
    recommended_dim: int,
    n_classes: Optional[int] = None,
) -> Dict[str, object]:
    """Suggest a network architecture from the embedding size and class count.

    Parameters:
    - original_dim: dimension of the uncompressed embeddings
    - recommended_dim: dimension chosen for the compressed embeddings
    - n_classes: number of classes; None is treated as 2

    Returns:
    - Dict with the model type, embedding dimension, linear layer width
      (1.5x the embedding dimension, at least 16), compression ratio in percent
      and a free-text note
    """
    class_total = 2 if n_classes is None else n_classes

    # A zero original dimension would divide by zero; treat it as "no compression".
    if original_dim:
        kept_fraction = recommended_dim / original_dim
    else:
        kept_fraction = 1.0

    if class_total > 10:
        model_type = "Wide embedding + transformer-style head"
    elif class_total > 2:
        model_type = "Multi-class classifier with moderate embedding size"
    else:
        model_type = "Binary classifier with a compact embedding and shallow head"

    linear_width = max(16, int(recommended_dim * 1.5))
    compression_percent = float(round((1 - kept_fraction) * 100, 2))

    return {
        "model_type": model_type,
        "embedding_dim": recommended_dim,
        "linear_layer_dim": linear_width,
        "compression_ratio": compression_percent,
        "note": (
            "Если задача сложнее, сохраняйте пространство не слишком узким. "
            "Рекомендуется сначала протестировать PCA/linear reduction на этой размерности."
        ),
    }
475
+
476
def analyze_and_adapt(
    self,
    X,
    y,
    dataset_name: str = "Unknown",
    quality_threshold: float = 0.95,
) -> Dict[str, object]:
    """Analyze precomputed embeddings and produce dimensionality recommendations.

    Parameters:
    - X: embedding matrix (n_samples, embedding_dim)
    - y: class labels
    - dataset_name: dataset name used in the report
    - quality_threshold: fraction of the baseline quality to reach

    Returns:
    - Dict with recommendations, metrics, meta-features and the quality curve;
      "meta_model_prediction" is None when no meta-model is fitted
    """
    X_clean, y_arr = self._prepare_data(X, y)
    meta_features = self.extract_meta_features(X_clean, y_arr)
    dimension_info = self.estimate_intrinsic_dim(
        X_clean, y_arr, quality_threshold=quality_threshold
    )

    n_classes = int(np.unique(y_arr).size)
    arch = self.recommend_architecture(
        dimension_info["original_dim"],
        dimension_info["recommended_dim"],
        n_classes=n_classes,
    )

    # The meta-model is consulted only when one has been fitted or loaded.
    prediction = self.predict_embedding_dim(X_clean, y_arr) if self.is_fitted else None
    meta_model_prediction = None if prediction is None else int(prediction)

    n_samples, n_features = int(X_clean.shape[0]), int(X_clean.shape[1])
    return {
        "dataset_name": dataset_name,
        "sample_count": n_samples,
        "original_dim": n_features,
        "class_count": n_classes,
        "baseline_quality": dimension_info["baseline_quality"],
        "intrinsic_dim_estimate": dimension_info["intrinsic_dim_estimate"],
        "recommended_embedding_dim": dimension_info["recommended_dim"],
        "quality_threshold": dimension_info["quality_threshold"],
        "recommended_quality_target": dimension_info["recommended_threshold"],
        "architecture_recommendation": arch,
        "meta_features": meta_features.tolist(),
        "quality_curve": dimension_info["quality_curve"],
        "meta_model_prediction": meta_model_prediction,
    }
523
+
524
def embed_texts(
    self,
    texts,
    embedder_name: str = "all-MiniLM-L6-v2",
    batch_size: int = 64,
    show_progress: bool = True,
) -> np.ndarray:
    """Convert a list of texts into embedding vectors.

    Parameters:
    - texts: list of strings
    - embedder_name: SentenceTransformer model name
    - batch_size: batch size passed to the encoder
    - show_progress: whether to show a progress bar

    Returns:
    - Embedding matrix (n_samples, embedding_dim)

    Raises:
    - ImportError: if sentence-transformers is not installed
    """
    # Imported lazily so the rest of the class works without sentence-transformers.
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError as exc:
        raise ImportError(
            "sentence-transformers is required for text embedding support. "
            "Install it with uv run python -m pip install sentence-transformers"
        ) from exc

    # A new model object is created on every call.
    model = SentenceTransformer(embedder_name)
    vectors = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=show_progress,
        convert_to_numpy=True,
    )
    return np.array(vectors)
559
+
560
def fit_transform(
    self,
    texts: List[str],
    labels,
    embedder_name: str = "all-MiniLM-L6-v2",
    batch_size: int = 64,
    show_progress: bool = True,
    return_pca: bool = False,
) -> np.ndarray:
    """Run the full pipeline from texts to compressed embeddings.

    Steps:
    1) Embed the texts
    2) Choose the target dimension: the meta-model prediction when a meta-model
       is fitted, otherwise the PCA 95%-variance estimate
    3) Fit a PCA with that many components and project the embeddings

    Parameters:
    - texts: list of texts
    - labels: class labels (used only by the meta-model prediction)
    - embedder_name: embedding model name
    - batch_size: batch size
    - show_progress: whether to show embedding progress
    - return_pca: if True, also return the fitted PCA object

    Returns:
    - Compressed embeddings of shape (n_samples, optimal_dim)
    - If return_pca=True, a tuple (X_compressed, pca) despite the annotation
    """
    print(f"Step 1/5: Embedding {len(texts)} texts using '{embedder_name}'...")
    X_raw = self.embed_texts(
        texts, embedder_name=embedder_name, batch_size=batch_size, show_progress=show_progress
    )
    print(f" -> Embedded into shape: {X_raw.shape}")

    # Determine optimal dimension
    if not self.is_fitted:
        print("Step 2/5: Meta-model not loaded — estimating intrinsic dimension via PCA variance...")
        optimal_dim = int(self._pca_intrinsic_dim(X_raw, target_variance=0.95))
    else:
        print("Step 2/5: Meta-model loaded — predicting optimal dimension using meta-model...")
        optimal_dim = int(self.predict_embedding_dim(X_raw, labels))

    # Clamp to what PCA accepts: at least 1, at most min(n_samples, n_features).
    optimal_dim = min(max(1, int(optimal_dim)), X_raw.shape[0], X_raw.shape[1])
    print(f" -> Chosen optimal dimension: {optimal_dim}")

    # PCA compression
    print("Step 3/5: Initializing PCA...")
    pca = PCA(n_components=optimal_dim)

    print("Step 4/5: Fitting PCA and transforming embeddings...")
    X_compressed = pca.fit_transform(X_raw)

    print("Step 5/5: Compression complete — returning compressed embeddings.")
    compressed = np.asarray(X_compressed)
    return (compressed, pca) if return_pca else compressed
619
+
620
def analyze_text_dataset(
    self,
    texts,
    labels,
    dataset_name: str,
    embedder_name: str = "all-MiniLM-L6-v2",
    batch_size: int = 64,
    sample_limit: Optional[int] = None,
    quality_threshold: float = 0.95,
) -> Dict[str, object]:
    """Analyze a text dataset end to end: embed the texts and build a full report.

    Parameters:
    - texts: list of texts to analyze
    - labels: class labels
    - dataset_name: dataset name (for the report)
    - embedder_name: embedding model name
    - batch_size: batch size
    - sample_limit: if given, only the first N texts and labels are analyzed
    - quality_threshold: fraction of the baseline quality to reach

    Returns:
    - The `analyze_and_adapt` report extended with "embedder_name" and
      "sample_limit"
    """
    if sample_limit is not None:
        texts, labels = texts[:sample_limit], labels[:sample_limit]

    # The progress bar is always shown here; this method has no show_progress argument.
    embeddings = self.embed_texts(
        texts,
        embedder_name=embedder_name,
        batch_size=batch_size,
        show_progress=True,
    )
    report = self.analyze_and_adapt(
        embeddings,
        labels,
        dataset_name=dataset_name,
        quality_threshold=quality_threshold,
    )

    limit_value = None if sample_limit is None else int(sample_limit)
    report["embedder_name"] = embedder_name
    report["sample_limit"] = limit_value
    return report
@@ -0,0 +1,187 @@
1
+ Metadata-Version: 2.4
2
+ Name: dataset-complexity-profiler
3
+ Version: 0.1.0
4
+ Summary: Text dataset complexity profiler with a packaged meta-model for automatic embedding dimension recommendation
5
+ License: MIT
6
+ Keywords: nlp,meta-learning,dataset-profiling,embeddings
7
+ Requires-Python: >=3.9
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: sentence-transformers>=2.2.2
11
+ Requires-Dist: datasets>=2.14.5
12
+ Requires-Dist: pymfe>=0.3.1
13
+ Requires-Dist: scikit-learn>=1.2.2
14
+ Requires-Dist: numpy>=1.24.0
15
+ Requires-Dist: pandas>=1.5.3
16
+ Requires-Dist: joblib>=1.2.0
17
+ Dynamic: license-file
18
+
19
+ # Dataset Complexity Profiling Tool
20
+
21
+ A text dataset complexity profiler with a packaged default meta-model for automatic embedding dimension recommendations.
22
+
23
+ This repository is organized as a package plus scripts, so you can use it as a library, run command-line flows with `uv`, and retrain the meta-model when needed.
24
+
25
+ ## Project structure
26
+
27
+ - `dataset_complexity_profiler/`
28
+ - package code
29
+ - contains `DatasetProfiler` and packaged `meta_model.pkl`
30
+ - `scripts/`
31
+ - helper scripts for demo, prediction, benchmark collection, training, and sanity checks
32
+ - `tests/`
33
+ - automated pytest unit tests
34
+ - `pyproject.toml`
35
+ - packaging and dependency settings for `uv`
36
+ - `MANIFEST.in`
37
+ - includes `dataset_complexity_profiler/meta_model.pkl` in the package
38
+ - `requirements.txt`
39
+ - runtime dependencies
40
+ - `requirements-dev.txt`
41
+ - test/development dependencies
42
+ - `uv.lock`
43
+ - `uv` dependency lock file
44
+
45
+ ## What each file does
46
+
47
+ - `dataset_complexity_profiler/__init__.py`
48
+ - exposes `DatasetProfiler` for `from dataset_complexity_profiler import DatasetProfiler`
49
+ - `dataset_complexity_profiler/dataset_adapter.py`
50
+ - the main implementation
51
+ - extracts meta-features, analyzes datasets, predicts embedding dimension, and trains custom meta-models
52
+ - `scripts/main.py`
53
+ - demo runner for benchmark datasets
54
+ - good for a quick functional check
55
+ - `scripts/predict.py`
56
+ - example script to predict optimal embedding dimension for one dataset and save JSON output
57
+ - `scripts/collect_benchmarks.py`
58
+ - builds `benchmarks.csv` from a set of benchmark text datasets
59
+ - used when preparing training data for a new meta-model
60
+ - `scripts/train_meta_model.py`
61
+ - trains the regression meta-model on `benchmarks.csv`
62
+ - saves a new `meta_model.pkl`
63
+ - `scripts/test.py`
64
+ - manual demo script for running a quick prediction flow
65
+ - a small convenience script, separate from automated tests
66
+ - `tests/`
67
+ - unit tests for package logic using `pytest`
68
+
69
+ ## What `scripts/main.py` does
70
+
71
+ - Runs the package pipeline on a set of preconfigured benchmark datasets.
72
+ - Uses the default packaged `meta_model.pkl`.
73
+ - Downloads datasets, computes embeddings, predicts optimal embedding dimension, and prints recommendations.
74
+ - Useful for a full end-to-end verification of the workflow.
75
+
76
+ ## How to demonstrate the model
77
+
78
+ Use one of these scripts to show the package working:
79
+
80
+ - `uv run python scripts/main.py`
81
+ - runs a complete benchmark demo on several datasets
82
+ - `uv run python scripts/predict.py`
83
+ - predicts optimal embedding dimension for a single dataset and writes `prediction_result.json`
84
+ - `uv run python scripts/test.py`
85
+ - quick manual check script for a simple prediction flow
86
+
87
+ ## Why `benchmarks.csv` exists
88
+
89
+ - `benchmarks.csv` is a training dataset for the meta-model.
90
+ - It is generated by `scripts/collect_benchmarks.py` from several benchmark text datasets.
91
+ - It is not required for normal use of the package because the repository already ships with a default `meta_model.pkl`.
92
+
93
+ In other words:
94
+
95
+ - `benchmarks.csv` is used for meta-model training and experimentation.
96
+ - `DatasetProfiler` can work immediately without it.
97
+
98
+ ## How the default model works
99
+
100
+ The package includes `dataset_complexity_profiler/meta_model.pkl`.
101
+ If you create `DatasetProfiler()` normally, it loads that model automatically.
102
+ That means users can get recommendations without training anything.
103
+
104
+ ## How to train your own meta-model
105
+
106
+ If you want a model tuned to your own data distribution, you do not need to pass a CSV file directly to `DatasetProfiler`.
107
+ Instead, use Python data structures with raw text and labels.
108
+
109
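+ Example (the two-item datasets below only illustrate the expected structure; real datasets need enough samples per class for the internal 70/30 train/test split):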
110
+
111
+ ```python
112
+ from dataset_complexity_profiler import DatasetProfiler
113
+
114
+ profiler = DatasetProfiler(auto_load_meta_model=False)
115
+
116
+ datasets = [
117
+ {"texts": ["sample 1", "sample 2"], "labels": [0, 1]},
118
+ {"texts": ["sample 3", "sample 4"], "labels": [1, 0]},
119
+ ]
120
+
121
+ profiler.train_custom_meta_model(datasets)
122
+ profiler.save_meta_model("custom_meta_model.pkl")
123
+ ```
124
+
125
+ Then later:
126
+
127
+ ```python
128
+ profiler = DatasetProfiler(auto_load_meta_model=False)
129
+ profiler.load_meta_model("custom_meta_model.pkl")
130
+ ```
131
+
132
+ If you want to generate a training dataset from benchmark datasets, use:
133
+
134
+ ```bash
135
+ uv run python scripts/collect_benchmarks.py
136
+ uv run python scripts/train_meta_model.py
137
+ ```
138
+
139
+ Then the resulting `meta_model.pkl` will be placed in `dataset_complexity_profiler/`.
140
+
141
+ ## How to use the package with `uv`
142
+
143
+ Install dependencies:
144
+
145
+ ```bash
146
+ uv sync
147
+ ```
148
+
149
+ Run the demo script:
150
+
151
+ ```bash
152
+ uv run python scripts/main.py
153
+ ```
154
+
155
+ Predict on a new dataset:
156
+
157
+ ```bash
158
+ uv run python scripts/predict.py
159
+ ```
160
+
161
+ Collect benchmark training data:
162
+
163
+ ```bash
164
+ uv run python scripts/collect_benchmarks.py
165
+ ```
166
+
167
+ Train a new meta-model:
168
+
169
+ ```bash
170
+ uv run python scripts/train_meta_model.py
171
+ ```
172
+
173
+ ## Core API
174
+
175
+ - `analyze_text_dataset(texts, labels, ...)` — full dataset analysis report
176
+ - `analyze_and_adapt(X, y, ...)` — analyze precomputed embeddings
177
+ - `fit_transform(texts, labels, ...)` — text → embeddings → compressed vectors
178
+ - `predict_embedding_dim(X, y, ...)` — recommended embedding dimension
179
+ - `train_custom_meta_model(datasets, ...)` — train a custom meta-model on user datasets
180
+ - `save_meta_model(path)` and `load_meta_model(path)`
181
+
182
+ ## Notes
183
+
184
+ - `scripts/test.py` is optional and only for manual sanity checking.
185
+ - `tests/` is the real automated test suite.
186
+ - `benchmarks.csv` is only needed when you want to retrain or expand the meta-model.
187
+ - For user-specific training, pass text/label pairs in Python, not a single CSV file.
@@ -0,0 +1,8 @@
1
+ dataset_complexity_profiler/__init__.py,sha256=y4C-que9aE4MtMWYLsRsXvwfhLaQBXqZHqTmyFk8l_w,356
2
+ dataset_complexity_profiler/dataset_adapter.py,sha256=V7v43y6c06pXgGqWKA5vwF5DNJ0Zzh4YwIVQlIETLwE,29144
3
+ dataset_complexity_profiler/meta_model.pkl,sha256=HluoOL3PievyDtXsoB0kQRE_OgryWZ5DMlkLD8KTgxw,136906
4
+ dataset_complexity_profiler-0.1.0.dist-info/licenses/LICENSE,sha256=ESYyLizI0WWtxMeS7rGVcX3ivMezm-HOd5WdeOh-9oU,1056
5
+ dataset_complexity_profiler-0.1.0.dist-info/METADATA,sha256=Zcl3ETKoP0LFXlSx-wzOjm3ziyk8piU_Lom1XIbGDHU,6094
6
+ dataset_complexity_profiler-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
7
+ dataset_complexity_profiler-0.1.0.dist-info/top_level.txt,sha256=4ZNaYI9bdmAMqiT1SgpyQzFhOr65MGBlVJnBTJRIPLw,28
8
+ dataset_complexity_profiler-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ dataset_complexity_profiler