javi-forge 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ci-local/ci-local.sh +20 -8
- package/package.json +1 -1
- package/ai-config/.skillignore +0 -15
- package/ai-config/AUTO_INVOKE.md +0 -300
- package/ai-config/agents/_TEMPLATE.md +0 -93
- package/ai-config/agents/business/api-designer.md +0 -1657
- package/ai-config/agents/business/business-analyst.md +0 -1331
- package/ai-config/agents/business/product-strategist.md +0 -206
- package/ai-config/agents/business/project-manager.md +0 -178
- package/ai-config/agents/business/requirements-analyst.md +0 -1277
- package/ai-config/agents/business/technical-writer.md +0 -1679
- package/ai-config/agents/creative/ux-designer.md +0 -205
- package/ai-config/agents/data-ai/ai-engineer.md +0 -487
- package/ai-config/agents/data-ai/analytics-engineer.md +0 -953
- package/ai-config/agents/data-ai/data-engineer.md +0 -173
- package/ai-config/agents/data-ai/data-scientist.md +0 -672
- package/ai-config/agents/data-ai/mlops-engineer.md +0 -814
- package/ai-config/agents/data-ai/prompt-engineer.md +0 -772
- package/ai-config/agents/development/angular-expert.md +0 -620
- package/ai-config/agents/development/backend-architect.md +0 -795
- package/ai-config/agents/development/database-specialist.md +0 -212
- package/ai-config/agents/development/frontend-specialist.md +0 -686
- package/ai-config/agents/development/fullstack-engineer.md +0 -668
- package/ai-config/agents/development/golang-pro.md +0 -338
- package/ai-config/agents/development/java-enterprise.md +0 -400
- package/ai-config/agents/development/javascript-pro.md +0 -422
- package/ai-config/agents/development/nextjs-pro.md +0 -474
- package/ai-config/agents/development/python-pro.md +0 -570
- package/ai-config/agents/development/react-pro.md +0 -487
- package/ai-config/agents/development/rust-pro.md +0 -246
- package/ai-config/agents/development/spring-boot-4-expert.md +0 -326
- package/ai-config/agents/development/typescript-pro.md +0 -336
- package/ai-config/agents/development/vue-specialist.md +0 -605
- package/ai-config/agents/infrastructure/cloud-architect.md +0 -472
- package/ai-config/agents/infrastructure/deployment-manager.md +0 -358
- package/ai-config/agents/infrastructure/devops-engineer.md +0 -455
- package/ai-config/agents/infrastructure/incident-responder.md +0 -519
- package/ai-config/agents/infrastructure/kubernetes-expert.md +0 -705
- package/ai-config/agents/infrastructure/monitoring-specialist.md +0 -674
- package/ai-config/agents/infrastructure/performance-engineer.md +0 -658
- package/ai-config/agents/orchestrator.md +0 -241
- package/ai-config/agents/quality/accessibility-auditor.md +0 -1204
- package/ai-config/agents/quality/code-reviewer-compact.md +0 -123
- package/ai-config/agents/quality/code-reviewer.md +0 -363
- package/ai-config/agents/quality/dependency-manager.md +0 -743
- package/ai-config/agents/quality/e2e-test-specialist.md +0 -1005
- package/ai-config/agents/quality/performance-tester.md +0 -1086
- package/ai-config/agents/quality/security-auditor.md +0 -133
- package/ai-config/agents/quality/test-engineer.md +0 -453
- package/ai-config/agents/specialists/api-designer.md +0 -87
- package/ai-config/agents/specialists/backend-architect.md +0 -73
- package/ai-config/agents/specialists/code-reviewer.md +0 -77
- package/ai-config/agents/specialists/db-optimizer.md +0 -75
- package/ai-config/agents/specialists/devops-engineer.md +0 -83
- package/ai-config/agents/specialists/documentation-writer.md +0 -78
- package/ai-config/agents/specialists/frontend-developer.md +0 -75
- package/ai-config/agents/specialists/performance-analyst.md +0 -82
- package/ai-config/agents/specialists/refactor-specialist.md +0 -74
- package/ai-config/agents/specialists/security-auditor.md +0 -74
- package/ai-config/agents/specialists/test-engineer.md +0 -81
- package/ai-config/agents/specialists/ux-consultant.md +0 -76
- package/ai-config/agents/specialized/agent-generator.md +0 -1190
- package/ai-config/agents/specialized/blockchain-developer.md +0 -149
- package/ai-config/agents/specialized/code-migrator.md +0 -892
- package/ai-config/agents/specialized/context-manager.md +0 -978
- package/ai-config/agents/specialized/documentation-writer.md +0 -1078
- package/ai-config/agents/specialized/ecommerce-expert.md +0 -1756
- package/ai-config/agents/specialized/embedded-engineer.md +0 -1714
- package/ai-config/agents/specialized/error-detective.md +0 -1034
- package/ai-config/agents/specialized/fintech-specialist.md +0 -1659
- package/ai-config/agents/specialized/freelance-project-planner-v2.md +0 -1988
- package/ai-config/agents/specialized/freelance-project-planner-v3.md +0 -2136
- package/ai-config/agents/specialized/freelance-project-planner-v4.md +0 -4503
- package/ai-config/agents/specialized/freelance-project-planner.md +0 -722
- package/ai-config/agents/specialized/game-developer.md +0 -1963
- package/ai-config/agents/specialized/healthcare-dev.md +0 -1620
- package/ai-config/agents/specialized/mobile-developer.md +0 -188
- package/ai-config/agents/specialized/parallel-plan-executor.md +0 -506
- package/ai-config/agents/specialized/plan-executor.md +0 -485
- package/ai-config/agents/specialized/solo-dev-planner-modular/00-INDEX.md +0 -485
- package/ai-config/agents/specialized/solo-dev-planner-modular/01-CORE.md +0 -3493
- package/ai-config/agents/specialized/solo-dev-planner-modular/02-SELF-CORRECTION.md +0 -778
- package/ai-config/agents/specialized/solo-dev-planner-modular/03-PROGRESSIVE-SETUP.md +0 -918
- package/ai-config/agents/specialized/solo-dev-planner-modular/04-DEPLOYMENT.md +0 -1537
- package/ai-config/agents/specialized/solo-dev-planner-modular/05-TESTING.md +0 -2633
- package/ai-config/agents/specialized/solo-dev-planner-modular/06-OPERATIONS.md +0 -5610
- package/ai-config/agents/specialized/solo-dev-planner-modular/INSTALL.md +0 -335
- package/ai-config/agents/specialized/solo-dev-planner-modular/QUICK-REFERENCE.txt +0 -215
- package/ai-config/agents/specialized/solo-dev-planner-modular/README.md +0 -260
- package/ai-config/agents/specialized/solo-dev-planner-modular/START-HERE.md +0 -379
- package/ai-config/agents/specialized/solo-dev-planner-modular/WORKFLOW-DIAGRAM.md +0 -355
- package/ai-config/agents/specialized/solo-dev-planner-modular/solo-dev-planner.md +0 -279
- package/ai-config/agents/specialized/template-writer.md +0 -347
- package/ai-config/agents/specialized/test-runner.md +0 -99
- package/ai-config/agents/specialized/vibekanban-smart-worker.md +0 -244
- package/ai-config/agents/specialized/wave-executor.md +0 -138
- package/ai-config/agents/specialized/workflow-optimizer.md +0 -1114
- package/ai-config/commands/git/changelog.md +0 -32
- package/ai-config/commands/git/ci-local.md +0 -70
- package/ai-config/commands/git/commit.md +0 -35
- package/ai-config/commands/git/fix-issue.md +0 -23
- package/ai-config/commands/git/pr-create.md +0 -42
- package/ai-config/commands/git/pr-review.md +0 -50
- package/ai-config/commands/git/worktree.md +0 -39
- package/ai-config/commands/refactoring/cleanup.md +0 -24
- package/ai-config/commands/refactoring/dead-code.md +0 -40
- package/ai-config/commands/refactoring/extract.md +0 -31
- package/ai-config/commands/testing/e2e.md +0 -30
- package/ai-config/commands/testing/tdd.md +0 -36
- package/ai-config/commands/testing/test-coverage.md +0 -30
- package/ai-config/commands/testing/test-fix.md +0 -24
- package/ai-config/commands/workflow/generate-agents-md.md +0 -85
- package/ai-config/commands/workflow/planning.md +0 -47
- package/ai-config/commands/workflows/compound.md +0 -89
- package/ai-config/commands/workflows/diagnose.md +0 -70
- package/ai-config/commands/workflows/discover.md +0 -86
- package/ai-config/commands/workflows/plan.md +0 -77
- package/ai-config/commands/workflows/review.md +0 -78
- package/ai-config/commands/workflows/work.md +0 -75
- package/ai-config/config.yaml +0 -18
- package/ai-config/hooks/_TEMPLATE.md +0 -96
- package/ai-config/hooks/block-dangerous-commands.md +0 -75
- package/ai-config/hooks/commit-guard.md +0 -90
- package/ai-config/hooks/context-loader.md +0 -73
- package/ai-config/hooks/improve-prompt.md +0 -91
- package/ai-config/hooks/learning-log.md +0 -72
- package/ai-config/hooks/model-router.md +0 -86
- package/ai-config/hooks/secret-scanner.md +0 -64
- package/ai-config/hooks/skill-validator.md +0 -102
- package/ai-config/hooks/task-artifact.md +0 -114
- package/ai-config/hooks/validate-workflow.md +0 -100
- package/ai-config/prompts/base.md +0 -71
- package/ai-config/prompts/modes/debug.md +0 -34
- package/ai-config/prompts/modes/deploy.md +0 -40
- package/ai-config/prompts/modes/research.md +0 -32
- package/ai-config/prompts/modes/review.md +0 -33
- package/ai-config/prompts/review-policy.md +0 -79
- package/ai-config/skills/_TEMPLATE.md +0 -157
- package/ai-config/skills/backend/api-gateway/SKILL.md +0 -254
- package/ai-config/skills/backend/bff-concepts/SKILL.md +0 -239
- package/ai-config/skills/backend/bff-spring/SKILL.md +0 -364
- package/ai-config/skills/backend/chi-router/SKILL.md +0 -396
- package/ai-config/skills/backend/error-handling/SKILL.md +0 -255
- package/ai-config/skills/backend/exceptions-spring/SKILL.md +0 -323
- package/ai-config/skills/backend/fastapi/SKILL.md +0 -302
- package/ai-config/skills/backend/gateway-spring/SKILL.md +0 -390
- package/ai-config/skills/backend/go-backend/SKILL.md +0 -457
- package/ai-config/skills/backend/gradle-multimodule/SKILL.md +0 -274
- package/ai-config/skills/backend/graphql-concepts/SKILL.md +0 -352
- package/ai-config/skills/backend/graphql-spring/SKILL.md +0 -398
- package/ai-config/skills/backend/grpc-concepts/SKILL.md +0 -283
- package/ai-config/skills/backend/grpc-spring/SKILL.md +0 -445
- package/ai-config/skills/backend/jwt-auth/SKILL.md +0 -412
- package/ai-config/skills/backend/notifications-concepts/SKILL.md +0 -259
- package/ai-config/skills/backend/recommendations-concepts/SKILL.md +0 -261
- package/ai-config/skills/backend/search-concepts/SKILL.md +0 -263
- package/ai-config/skills/backend/search-spring/SKILL.md +0 -375
- package/ai-config/skills/backend/spring-boot-4/SKILL.md +0 -172
- package/ai-config/skills/backend/websockets/SKILL.md +0 -532
- package/ai-config/skills/data-ai/ai-ml/SKILL.md +0 -423
- package/ai-config/skills/data-ai/analytics-concepts/SKILL.md +0 -195
- package/ai-config/skills/data-ai/analytics-spring/SKILL.md +0 -340
- package/ai-config/skills/data-ai/duckdb-analytics/SKILL.md +0 -440
- package/ai-config/skills/data-ai/langchain/SKILL.md +0 -238
- package/ai-config/skills/data-ai/mlflow/SKILL.md +0 -302
- package/ai-config/skills/data-ai/onnx-inference/SKILL.md +0 -290
- package/ai-config/skills/data-ai/powerbi/SKILL.md +0 -352
- package/ai-config/skills/data-ai/pytorch/SKILL.md +0 -274
- package/ai-config/skills/data-ai/scikit-learn/SKILL.md +0 -321
- package/ai-config/skills/data-ai/vector-db/SKILL.md +0 -301
- package/ai-config/skills/database/graph-databases/SKILL.md +0 -218
- package/ai-config/skills/database/graph-spring/SKILL.md +0 -361
- package/ai-config/skills/database/pgx-postgres/SKILL.md +0 -512
- package/ai-config/skills/database/redis-cache/SKILL.md +0 -343
- package/ai-config/skills/database/sqlite-embedded/SKILL.md +0 -388
- package/ai-config/skills/database/timescaledb/SKILL.md +0 -320
- package/ai-config/skills/docs/api-documentation/SKILL.md +0 -293
- package/ai-config/skills/docs/docs-spring/SKILL.md +0 -377
- package/ai-config/skills/docs/mustache-templates/SKILL.md +0 -190
- package/ai-config/skills/docs/technical-docs/SKILL.md +0 -447
- package/ai-config/skills/frontend/astro-ssr/SKILL.md +0 -441
- package/ai-config/skills/frontend/frontend-design/SKILL.md +0 -54
- package/ai-config/skills/frontend/frontend-web/SKILL.md +0 -368
- package/ai-config/skills/frontend/mantine-ui/SKILL.md +0 -396
- package/ai-config/skills/frontend/tanstack-query/SKILL.md +0 -439
- package/ai-config/skills/frontend/zod-validation/SKILL.md +0 -417
- package/ai-config/skills/frontend/zustand-state/SKILL.md +0 -350
- package/ai-config/skills/infrastructure/chaos-engineering/SKILL.md +0 -244
- package/ai-config/skills/infrastructure/chaos-spring/SKILL.md +0 -378
- package/ai-config/skills/infrastructure/devops-infra/SKILL.md +0 -435
- package/ai-config/skills/infrastructure/docker-containers/SKILL.md +0 -420
- package/ai-config/skills/infrastructure/kubernetes/SKILL.md +0 -456
- package/ai-config/skills/infrastructure/opentelemetry/SKILL.md +0 -546
- package/ai-config/skills/infrastructure/traefik-proxy/SKILL.md +0 -474
- package/ai-config/skills/infrastructure/woodpecker-ci/SKILL.md +0 -315
- package/ai-config/skills/mobile/ionic-capacitor/SKILL.md +0 -504
- package/ai-config/skills/mobile/mobile-ionic/SKILL.md +0 -448
- package/ai-config/skills/prompt-improver/SKILL.md +0 -125
- package/ai-config/skills/quality/ghagga-review/SKILL.md +0 -216
- package/ai-config/skills/references/hooks-patterns/SKILL.md +0 -238
- package/ai-config/skills/references/mcp-servers/SKILL.md +0 -275
- package/ai-config/skills/references/plugins-reference/SKILL.md +0 -110
- package/ai-config/skills/references/skills-reference/SKILL.md +0 -420
- package/ai-config/skills/references/subagent-templates/SKILL.md +0 -193
- package/ai-config/skills/systems-iot/modbus-protocol/SKILL.md +0 -410
- package/ai-config/skills/systems-iot/mqtt-rumqttc/SKILL.md +0 -408
- package/ai-config/skills/systems-iot/rust-systems/SKILL.md +0 -386
- package/ai-config/skills/systems-iot/tokio-async/SKILL.md +0 -324
- package/ai-config/skills/testing/playwright-e2e/SKILL.md +0 -289
- package/ai-config/skills/testing/testcontainers/SKILL.md +0 -299
- package/ai-config/skills/testing/vitest-testing/SKILL.md +0 -381
- package/ai-config/skills/workflow/ci-local-guide/SKILL.md +0 -118
- package/ai-config/skills/workflow/claude-automation-recommender/SKILL.md +0 -299
- package/ai-config/skills/workflow/claude-md-improver/SKILL.md +0 -158
- package/ai-config/skills/workflow/finishing-a-development-branch/SKILL.md +0 -117
- package/ai-config/skills/workflow/git-github/SKILL.md +0 -334
- package/ai-config/skills/workflow/git-github/references/examples.md +0 -160
- package/ai-config/skills/workflow/git-workflow/SKILL.md +0 -214
- package/ai-config/skills/workflow/ide-plugins/SKILL.md +0 -277
- package/ai-config/skills/workflow/ide-plugins-intellij/SKILL.md +0 -401
- package/ai-config/skills/workflow/obsidian-brain-workflow/SKILL.md +0 -199
- package/ai-config/skills/workflow/using-git-worktrees/SKILL.md +0 -100
- package/ai-config/skills/workflow/verification-before-completion/SKILL.md +0 -73
- package/ai-config/skills/workflow/wave-workflow/SKILL.md +0 -178
- package/schemas/agent.schema.json +0 -34
- package/schemas/ai-config.schema.json +0 -28
- package/schemas/plugin.schema.json +0 -62
- package/schemas/skill.schema.json +0 -44
|
@@ -1,321 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: scikit-learn
|
|
3
|
-
description: >
|
|
4
|
-
Classical ML with scikit-learn for anomaly detection, classification, and clustering.
|
|
5
|
-
Trigger: sklearn, scikit-learn, classical ml, anomaly detection, classification, clustering
|
|
6
|
-
tools:
|
|
7
|
-
- Read
|
|
8
|
-
- Write
|
|
9
|
-
- Bash
|
|
10
|
-
- Grep
|
|
11
|
-
metadata:
|
|
12
|
-
author: plataforma-industrial
|
|
13
|
-
version: "2.0"
|
|
14
|
-
tags: [sklearn, ml, classification, anomaly-detection, clustering]
|
|
15
|
-
updated: "2026-02"
|
|
16
|
-
---
|
|
17
|
-
|
|
18
|
-
# Scikit-learn Skill
|
|
19
|
-
|
|
20
|
-
Classical ML for anomaly detection, classification, regression, and clustering.
|
|
21
|
-
|
|
22
|
-
## Stack
|
|
23
|
-
|
|
24
|
-
```yaml
|
|
25
|
-
scikit-learn: 1.4+
|
|
26
|
-
pandas: 2.2+
|
|
27
|
-
numpy: 1.26+
|
|
28
|
-
joblib: 1.3+
|
|
29
|
-
imbalanced-learn: 0.12+
|
|
30
|
-
```
|
|
31
|
-
|
|
32
|
-
## Anomaly Detection
|
|
33
|
-
|
|
34
|
-
### Isolation Forest
|
|
35
|
-
|
|
36
|
-
```python
|
|
37
|
-
from sklearn.ensemble import IsolationForest
|
|
38
|
-
from sklearn.preprocessing import StandardScaler
|
|
39
|
-
from sklearn.pipeline import Pipeline
|
|
40
|
-
import numpy as np
|
|
41
|
-
import joblib
|
|
42
|
-
|
|
43
|
-
class AnomalyDetector:
    """Isolation Forest anomaly detector behind a standard-scaling pipeline.

    The scaler and the forest live in a single ``Pipeline`` so the same
    preprocessing is applied at fit and predict time, and so the whole
    thing can be persisted as one object.
    """

    def __init__(self, contamination: float = 0.05, n_estimators: int = 100):
        """Build the (unfitted) scaler + Isolation Forest pipeline.

        Args:
            contamination: Forwarded to ``IsolationForest(contamination=...)``.
            n_estimators: Forwarded to ``IsolationForest(n_estimators=...)``.
        """
        forest = IsolationForest(
            n_estimators=n_estimators,
            contamination=contamination,
            random_state=42,
            n_jobs=-1
        )
        steps = [
            ('scaler', StandardScaler()),
            ('detector', forest),
        ]
        self.pipeline = Pipeline(steps)

    def fit(self, X: np.ndarray):
        """Fit the pipeline on ``X`` and return ``self`` for chaining."""
        self.pipeline.fit(X)
        return self

    def predict(self, X: np.ndarray):
        """Return ``(labels, scores)`` for ``X``.

        ``labels`` come from ``pipeline.predict`` (-1 anomaly, 1 normal) and
        ``scores`` from ``pipeline.decision_function``.
        """
        predicted_labels = self.pipeline.predict(X)  # -1 anomaly, 1 normal
        decision_scores = self.pipeline.decision_function(X)
        return labels_and_scores(predicted_labels, decision_scores) if False else (predicted_labels, decision_scores)

    def predict_proba(self, X: np.ndarray):
        """Squash decision scores into (0, 1) with a logistic function.

        Lower decision scores map to values closer to 1. This is a monotone
        rescaling of the score, not a calibrated probability.
        """
        decision_scores = self.pipeline.decision_function(X)
        denominator = 1 + np.exp(decision_scores)
        return 1 / denominator

    def save(self, path: str):
        """Persist the pipeline to ``path`` with joblib."""
        joblib.dump(self.pipeline, path)

    @classmethod
    def load(cls, path: str):
        """Create a detector whose pipeline is loaded from ``path``."""
        restored = cls()
        restored.pipeline = joblib.load(path)
        return restored
|
|
76
|
-
```
|
|
77
|
-
|
|
78
|
-
### One-Class SVM & LOF
|
|
79
|
-
|
|
80
|
-
```python
|
|
81
|
-
from sklearn.svm import OneClassSVM
|
|
82
|
-
from sklearn.neighbors import LocalOutlierFactor
|
|
83
|
-
from sklearn.preprocessing import RobustScaler
|
|
84
|
-
|
|
85
|
-
# One-Class SVM with outlier-robust feature scaling (RobustScaler)
|
|
86
|
-
class RobustDetector:
    """One-Class SVM detector that scales features with ``RobustScaler``."""

    def __init__(self, nu: float = 0.05):
        """Build the (unfitted) scaler + One-Class SVM pipeline.

        Args:
            nu: Forwarded to ``OneClassSVM(nu=...)``.
        """
        svm = OneClassSVM(nu=nu, kernel='rbf', gamma='scale')
        steps = [
            ('scaler', RobustScaler()),
            ('svm', svm),
        ]
        self.pipeline = Pipeline(steps)

    def fit(self, X):
        """Fit the pipeline on ``X`` and return ``self`` for chaining."""
        self.pipeline.fit(X)
        return self

    def predict(self, X):
        """Return the pipeline's predictions for ``X``."""
        predictions = self.pipeline.predict(X)
        return predictions
|
|
99
|
-
|
|
100
|
-
# Local Outlier Factor in novelty mode (fit once, then predict on new data)
|
|
101
|
-
class StreamingLOF:
    """Local Outlier Factor configured so it can predict on unseen data."""

    def __init__(self, n_neighbors: int = 20, contamination: float = 0.05):
        """Create the (unfitted) LOF estimator.

        Args:
            n_neighbors: Forwarded to ``LocalOutlierFactor(n_neighbors=...)``.
            contamination: Forwarded to ``LocalOutlierFactor(contamination=...)``.
        """
        lof_options = {
            'n_neighbors': n_neighbors,
            'contamination': contamination,
            'novelty': True,  # Enable predict on new data
        }
        self.lof = LocalOutlierFactor(**lof_options)

    def fit(self, X):
        """Fit the estimator on ``X`` and return ``self`` for chaining."""
        self.lof.fit(X)
        return self

    def predict(self, X):
        """Return the estimator's predictions for ``X``."""
        predictions = self.lof.predict(X)
        return predictions
|
|
115
|
-
```
|
|
116
|
-
|
|
117
|
-
## Classification
|
|
118
|
-
|
|
119
|
-
```python
|
|
120
|
-
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
|
|
121
|
-
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
|
122
|
-
from sklearn.pipeline import Pipeline
|
|
123
|
-
from sklearn.metrics import classification_report
|
|
124
|
-
import pandas as pd
|
|
125
|
-
|
|
126
|
-
class EquipmentClassifier:
    """Classify equipment state from feature vectors.

    String labels are encoded with a ``LabelEncoder`` fitted on ``STATES``;
    the model itself is a scaler + tree-ensemble ``Pipeline``.
    """

    STATES = ['normal', 'degraded', 'maintenance_required', 'critical']

    def __init__(self, model_type: str = 'random_forest'):
        """Build the label encoder and the (unfitted) pipeline.

        Args:
            model_type: ``'random_forest'`` selects a ``RandomForestClassifier``;
                any other value selects a ``GradientBoostingClassifier``.
        """
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(self.STATES)

        if model_type == 'random_forest':
            classifier = RandomForestClassifier(
                n_estimators=200, max_depth=10, min_samples_split=5,
                class_weight='balanced', n_jobs=-1, random_state=42
            )
        else:
            classifier = GradientBoostingClassifier(
                n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42
            )

        self.pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', classifier)
        ])

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit the pipeline; string labels are encoded first.

        Returns:
            ``self``, for chaining.
        """
        # Normalise to an array so the first-element check is positional and
        # does not fail on empty input or on label-indexed containers.
        y = np.asarray(y)
        if y.size and isinstance(y[0], str):
            y = self.label_encoder.transform(y)
        self.pipeline.fit(X, y)
        return self

    def predict(self, X: np.ndarray):
        """Return predicted state names for ``X``."""
        y_pred = self.pipeline.predict(X)
        return self.label_encoder.inverse_transform(y_pred)

    def predict_proba(self, X: np.ndarray) -> pd.DataFrame:
        """Return class probabilities as a DataFrame with one column per state.

        Column names are derived from the fitted classifier's ``classes_``
        decoded through the label encoder. The encoder stores its classes in
        sorted order, which differs from the order of ``STATES``, so labelling
        the columns with ``STATES`` directly would attach probabilities to the
        wrong state names. Decoding ``classes_`` also keeps the column count
        correct when a state never appeared in the training labels.
        """
        proba = self.pipeline.predict_proba(X)
        encoded_classes = self.pipeline.named_steps['classifier'].classes_
        state_names = self.label_encoder.inverse_transform(encoded_classes)
        return pd.DataFrame(proba, columns=state_names)

    def feature_importance(self, feature_names: list) -> pd.DataFrame:
        """Return features ranked by the classifier's ``feature_importances_``.

        Args:
            feature_names: Names matching the feature columns used in ``fit``.

        Returns:
            DataFrame with ``feature`` and ``importance`` columns, sorted by
            importance in descending order.
        """
        importances = self.pipeline.named_steps['classifier'].feature_importances_
        ranking = pd.DataFrame({'feature': feature_names, 'importance': importances})
        return ranking.sort_values('importance', ascending=False)
|
|
162
|
-
```
|
|
163
|
-
|
|
164
|
-
## Feature Engineering
|
|
165
|
-
|
|
166
|
-
```python
|
|
167
|
-
def extract_features(df: pd.DataFrame, window_size: int = 60, columns=None) -> pd.DataFrame:
    """Compute rolling-window statistics for each sensor column.

    For every column ``c`` the result contains ``c_mean``, ``c_std``,
    ``c_min``, ``c_max``, ``c_range`` (max - min), ``c_diff`` (first
    difference) and ``c_diff_mean`` (rolling mean of the first difference).

    Args:
        df: Input frame containing the requested columns.
        window_size: Rolling window length, in rows.
        columns: Column names to process. Defaults to
            ``['temperature', 'pressure', 'vibration', 'flow_rate']``.

    Returns:
        Feature frame indexed like ``df`` with every row that contains a NaN
        dropped (the leading rows, where a window is not yet full).

    Raises:
        KeyError: If a requested column is missing from ``df``.
    """
    if columns is None:
        columns = ['temperature', 'pressure', 'vibration', 'flow_rate']

    computed = {}
    for col in columns:
        series = df[col]
        window = series.rolling(window_size)  # reused for all four statistics
        col_min = window.min()
        col_max = window.max()
        delta = series.diff()

        computed[f'{col}_mean'] = window.mean()
        computed[f'{col}_std'] = window.std()
        computed[f'{col}_min'] = col_min
        computed[f'{col}_max'] = col_max
        computed[f'{col}_range'] = col_max - col_min
        computed[f'{col}_diff'] = delta
        computed[f'{col}_diff_mean'] = delta.rolling(window_size).mean()

    # Build the frame in one step; dict insertion order fixes the column order.
    features = pd.DataFrame(computed, index=df.index)
    return features.dropna()
|
|
180
|
-
```
|
|
181
|
-
|
|
182
|
-
## Forecasting (Multi-step)
|
|
183
|
-
|
|
184
|
-
```python
|
|
185
|
-
from sklearn.multioutput import MultiOutputRegressor
|
|
186
|
-
from sklearn.ensemble import GradientBoostingRegressor
|
|
187
|
-
|
|
188
|
-
class Forecaster:
    """Multi-step forecaster built on lagged windows and gradient boosting.

    Each training sample is ``lookback`` consecutive rows flattened into one
    feature vector; the target is the following ``horizon`` rows, flattened.
    """

    def __init__(self, horizon: int = 12, lookback: int = 24):
        """Store the window sizes and build the (unfitted) pipeline.

        Args:
            horizon: Number of future rows to predict.
            lookback: Number of past rows used as input.
        """
        self.horizon = horizon
        self.lookback = lookback
        self.pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('regressor', MultiOutputRegressor(
                GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42),
                n_jobs=-1
            ))
        ])

    def create_sequences(self, data: np.ndarray):
        """Slice ``data`` into flattened (input window, target window) pairs.

        Returns:
            ``(X, y)`` arrays with one row per window position. Both are empty
            when ``data`` has fewer than ``lookback + horizon`` rows.
        """
        X, y = [], []
        n_windows = len(data) - self.lookback - self.horizon + 1
        for start in range(n_windows):
            split = start + self.lookback
            X.append(data[start:split].flatten())
            y.append(data[split:split + self.horizon].flatten())
        return np.array(X), np.array(y)

    def fit(self, data: np.ndarray):
        """Fit the pipeline on windows cut from ``data``.

        Raises:
            ValueError: If ``data`` is too short to produce a single window.
        """
        required = self.lookback + self.horizon
        if len(data) < required:
            # Fail early with an actionable message instead of passing empty
            # arrays to the pipeline.
            raise ValueError(
                f"need at least {required} rows (lookback + horizon), got {len(data)}"
            )
        X, y = self.create_sequences(data)
        self.pipeline.fit(X, y)
        return self

    def predict(self, recent_data: np.ndarray) -> np.ndarray:
        """Forecast the next ``horizon`` rows from the latest window.

        Args:
            recent_data: The most recent ``lookback`` rows, either 2-D
                (rows x features) or 1-D for a single series.

        Returns:
            Array of shape ``(horizon, n_features)`` for 2-D input, or
            ``(horizon,)`` for 1-D input.
        """
        recent = np.asarray(recent_data)
        y_pred = self.pipeline.predict(recent.reshape(1, -1))
        if recent.ndim == 1:
            # A 1-D series has no feature axis to restore.
            return y_pred.reshape(self.horizon)
        return y_pred.reshape(self.horizon, recent.shape[1])
|
|
216
|
-
```
|
|
217
|
-
|
|
218
|
-
## Clustering
|
|
219
|
-
|
|
220
|
-
```python
|
|
221
|
-
from sklearn.cluster import KMeans, DBSCAN
|
|
222
|
-
from sklearn.decomposition import PCA
|
|
223
|
-
from sklearn.metrics import silhouette_score
|
|
224
|
-
|
|
225
|
-
class ModeDetector:
    """Detect operating modes by clustering scaled, PCA-reduced features."""

    def __init__(self, n_modes: int = None, method: str = 'kmeans'):
        """Configure the detector.

        Args:
            n_modes: Number of clusters. When ``None`` it is chosen during
                ``fit`` by silhouette score.
            method: Accepted for interface compatibility and stored on the
                instance; clustering is always done with k-means.
        """
        self.n_modes = n_modes
        self.method = method
        self.scaler = StandardScaler()
        self.pca = PCA(n_components=0.95)
        self.clusterer = None

    def fit(self, X: np.ndarray):
        """Fit the scaler, PCA and k-means on ``X``; return ``self``."""
        X_scaled = self.scaler.fit_transform(X)
        X_pca = self.pca.fit_transform(X_scaled)

        if self.n_modes is None:
            self.n_modes = self._find_optimal_k(X_pca)

        self.clusterer = KMeans(n_clusters=self.n_modes, n_init=10, random_state=42)
        self.clusterer.fit(X_pca)
        return self

    def predict(self, X: np.ndarray):
        """Return the cluster index for each row of ``X``."""
        X_pca = self.pca.transform(self.scaler.transform(X))
        return self.clusterer.predict(X_pca)

    def _find_optimal_k(self, X: np.ndarray, max_k: int = 10) -> int:
        """Pick the cluster count in ``[2, max_k]`` with the best silhouette.

        Raises:
            ValueError: If ``X`` has too few samples to evaluate k = 2.
        """
        # The silhouette score needs fewer clusters than samples, so never
        # try more than len(X) - 1 clusters.
        upper = min(max_k, len(X) - 1)
        if upper < 2:
            raise ValueError(
                f"need at least 3 samples to select the number of modes, got {len(X)}"
            )
        # Seed the candidate models like the final clusterer so the selected
        # k is the same on every run.
        scores = [
            silhouette_score(
                X, KMeans(n_clusters=k, n_init=10, random_state=42).fit_predict(X)
            )
            for k in range(2, upper + 1)
        ]
        # Candidates start at k = 2; convert to a plain int to match the annotation.
        return int(np.argmax(scores)) + 2
|
|
251
|
-
```
|
|
252
|
-
|
|
253
|
-
## Hyperparameter Tuning
|
|
254
|
-
|
|
255
|
-
```python
|
|
256
|
-
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
|
|
257
|
-
|
|
258
|
-
def tune_classifier(X: np.ndarray, y: np.ndarray):
    """Randomized hyperparameter search for a random forest.

    Runs 50 sampled configurations, each scored with weighted F1 over a
    5-split ``TimeSeriesSplit``.

    Returns:
        ``(best_estimator, best_params)`` from the fitted search.
    """
    search_space = {
        'n_estimators': [100, 200, 300],
        'max_depth': [5, 10, 15, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }
    base_model = RandomForestClassifier(random_state=42, n_jobs=-1)
    splitter = TimeSeriesSplit(n_splits=5)

    search = RandomizedSearchCV(
        base_model,
        search_space,
        n_iter=50,
        cv=splitter,
        scoring='f1_weighted',
        n_jobs=-1
    )
    search.fit(X, y)

    best_model = search.best_estimator_
    best_settings = search.best_params_
    return best_model, best_settings
|
|
278
|
-
```
|
|
279
|
-
|
|
280
|
-
## Model Persistence
|
|
281
|
-
|
|
282
|
-
```python
|
|
283
|
-
import json
|
|
284
|
-
from pathlib import Path
|
|
285
|
-
from datetime import datetime
|
|
286
|
-
|
|
287
|
-
def save_model(model, path: str, metadata: dict = None, feature_names: list = None):
    """Persist ``model`` with joblib plus a JSON metadata sidecar.

    The sidecar is written next to the model with the suffix replaced by
    ``.json``. It records the save time (UTC, timezone-aware ISO 8601), the
    installed scikit-learn version, ``feature_names`` and any extra
    ``metadata`` entries (which override the built-in keys on collision).

    Args:
        model: Object to persist.
        path: Destination of the model file; parent directories are created.
        metadata: Optional extra JSON-serializable entries.
        feature_names: Optional list of feature names to record.

    Raises:
        TypeError: If the metadata cannot be serialized to JSON. Raised before
            anything is written to disk.
    """
    # Local import: only `datetime` itself is imported at the top of the snippet.
    from datetime import timezone

    path = Path(path)

    meta = {
        # Timezone-aware replacement for the deprecated datetime.utcnow().
        'saved_at': datetime.now(timezone.utc).isoformat(),
        'sklearn_version': __import__('sklearn').__version__,
        'feature_names': feature_names,
        **(metadata or {})
    }
    # Serialize first so bad metadata cannot leave a model without its sidecar.
    meta_text = json.dumps(meta, indent=2)

    path.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, path)
    path.with_suffix('.json').write_text(meta_text)
|
|
299
|
-
|
|
300
|
-
def load_model(path: str):
    """Load a joblib-persisted model and its JSON metadata sidecar.

    Returns:
        ``(model, metadata)``. ``metadata`` is an empty dict when no sidecar
        (same path with a ``.json`` suffix) exists.
    """
    model_file = Path(path)
    # NOTE: joblib files are pickle-based; only load files from trusted sources.
    model = joblib.load(model_file)

    sidecar = model_file.with_suffix('.json')
    if sidecar.exists():
        metadata = json.loads(sidecar.read_text())
    else:
        metadata = {}
    return model, metadata
|
|
306
|
-
```
|
|
307
|
-
|
|
308
|
-
## Best Practices
|
|
309
|
-
|
|
310
|
-
1. **Use pipelines** - Combine preprocessing and model for reproducibility
|
|
311
|
-
2. **Handle imbalance** - SMOTE or `class_weight='balanced'`
|
|
312
|
-
3. **Time series CV** - Use `TimeSeriesSplit` instead of random splits
|
|
313
|
-
4. **Feature importance** - Analyze with `.feature_importances_`
|
|
314
|
-
5. **Version models** - Save with metadata (version, features, metrics)
|
|
315
|
-
|
|
316
|
-
## Related Skills
|
|
317
|
-
|
|
318
|
-
- `mlflow`: Experiment tracking
|
|
319
|
-
- `pytorch`: Deep learning alternative
|
|
320
|
-
- `duckdb-analytics`: Data preprocessing
|
|
321
|
-
- `onnx-inference`: Model deployment
|
|
@@ -1,301 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: vector-db
|
|
3
|
-
description: >
|
|
4
|
-
Vector databases for RAG and semantic search with ChromaDB and pgvector.
|
|
5
|
-
Trigger: vector database, embeddings, rag, semantic search, chromadb, pgvector
|
|
6
|
-
tools:
|
|
7
|
-
- Read
|
|
8
|
-
- Write
|
|
9
|
-
- Bash
|
|
10
|
-
- Grep
|
|
11
|
-
metadata:
|
|
12
|
-
author: plataforma-industrial
|
|
13
|
-
version: "2.0"
|
|
14
|
-
tags: [vector-db, rag, embeddings, semantic-search, chromadb, pgvector]
|
|
15
|
-
updated: "2026-02"
|
|
16
|
-
---
|
|
17
|
-
|
|
18
|
-
# Vector Database Skill
|
|
19
|
-
|
|
20
|
-
Vector databases for RAG and semantic search.
|
|
21
|
-
|
|
22
|
-
## Stack
|
|
23
|
-
|
|
24
|
-
```yaml
|
|
25
|
-
# Primary
|
|
26
|
-
chromadb: 0.4+
|
|
27
|
-
pgvector: 0.6+ # PostgreSQL extension
|
|
28
|
-
|
|
29
|
-
# Alternatives
|
|
30
|
-
qdrant-client: 1.7+
|
|
31
|
-
pinecone-client: 3.0+
|
|
32
|
-
|
|
33
|
-
# Embeddings
|
|
34
|
-
openai: 1.12+
|
|
35
|
-
sentence-transformers: 2.5+
|
|
36
|
-
```
|
|
37
|
-
|
|
38
|
-
## ChromaDB
|
|
39
|
-
|
|
40
|
-
```python
|
|
41
|
-
import chromadb
|
|
42
|
-
from chromadb.config import Settings
|
|
43
|
-
from chromadb.utils import embedding_functions
|
|
44
|
-
import os
|
|
45
|
-
|
|
46
|
-
class ChromaVectorStore:
    """Thin wrapper around a persistent ChromaDB collection."""

    def __init__(self, collection_name: str = "documents", persist_dir: str = "./chroma_db"):
        """Open (or create) the collection.

        Args:
            collection_name: Name of the collection to get or create.
            persist_dir: Directory passed to ``chromadb.PersistentClient``.
        """
        self.client = chromadb.PersistentClient(
            path=persist_dir,
            settings=Settings(anonymized_telemetry=False)
        )

        # The API key is read from the OPENAI_API_KEY environment variable.
        self.embedding_fn = embedding_functions.OpenAIEmbeddingFunction(
            api_key=os.getenv("OPENAI_API_KEY"),
            model_name="text-embedding-3-small"
        )

        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            embedding_function=self.embedding_fn,
            metadata={"hnsw:space": "cosine"}
        )

    def add_documents(self, documents: list[str], metadatas: list[dict] = None, ids: list[str] = None):
        """Add documents to the collection.

        Args:
            documents: Texts to add.
            metadatas: Optional per-document metadata, parallel to ``documents``.
            ids: Optional per-document ids. When omitted, a unique id is
                generated for every document.
        """
        if ids is None:
            # Positional ids (doc_0, doc_1, ...) would repeat on every call and
            # collide with documents added earlier, so use random UUIDs.
            import uuid

            ids = [f"doc_{uuid.uuid4().hex}" for _ in documents]
        self.collection.add(documents=documents, metadatas=metadatas, ids=ids)

    def query(self, query_text: str, n_results: int = 5, where: dict = None):
        """Query the collection with one text.

        Args:
            query_text: Text to search for.
            n_results: Number of results requested.
            where: Optional metadata filter forwarded to the collection.

        Returns:
            The collection's query result, including documents, metadatas and
            distances.
        """
        return self.collection.query(
            query_texts=[query_text],
            n_results=n_results,
            where=where,
            include=["documents", "metadatas", "distances"]
        )

    def delete(self, ids: list[str] = None, where: dict = None):
        """Delete by ids, or by metadata filter when no ids are given.

        ``ids`` takes precedence when both are supplied; with neither
        argument the call does nothing.
        """
        if ids:
            self.collection.delete(ids=ids)
        elif where:
            self.collection.delete(where=where)
|
|
82
|
-
```
|
|
83
|
-
|
|
84
|
-
## pgvector (PostgreSQL)
|
|
85
|
-
|
|
86
|
-
### Schema
|
|
87
|
-
|
|
88
|
-
```sql
|
|
89
|
-
CREATE EXTENSION IF NOT EXISTS vector;
|
|
90
|
-
|
|
91
|
-
CREATE TABLE documents (
|
|
92
|
-
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
93
|
-
content TEXT NOT NULL,
|
|
94
|
-
embedding vector(1536), -- OpenAI dimension
|
|
95
|
-
metadata JSONB DEFAULT '{}',
|
|
96
|
-
tenant_id UUID NOT NULL,
|
|
97
|
-
created_at TIMESTAMPTZ DEFAULT NOW()
|
|
98
|
-
);
|
|
99
|
-
|
|
100
|
-
-- HNSW index for fast search
|
|
101
|
-
CREATE INDEX ON documents USING hnsw (embedding vector_cosine_ops) WITH (m = 16, ef_construction = 64);
|
|
102
|
-
CREATE INDEX idx_documents_tenant ON documents (tenant_id);
|
|
103
|
-
CREATE INDEX idx_documents_metadata ON documents USING gin (metadata);
|
|
104
|
-
```
|
|
105
|
-
|
|
106
|
-
### Python Client
|
|
107
|
-
|
|
108
|
-
```python
|
|
109
|
-
import asyncpg
|
|
110
|
-
from pgvector.asyncpg import register_vector
|
|
111
|
-
import numpy as np
|
|
112
|
-
from openai import AsyncOpenAI
|
|
113
|
-
|
|
114
|
-
class PgVectorStore:
    """Tenant-scoped document store on PostgreSQL + pgvector.

    Embeddings are produced with the OpenAI embeddings API and rows are
    read from / written to the ``documents`` table (``content``,
    ``embedding``, ``metadata``, ``tenant_id``).

    Metadata is sent to the server as JSON text and cast to ``jsonb`` there,
    because asyncpg's default mapping for ``json``/``jsonb`` is ``str``; on
    the way back both a JSON string and an already-decoded mapping are
    accepted, so the class works with or without a custom jsonb codec.
    """

    def __init__(self, pool: asyncpg.Pool, embedding_model: str = "text-embedding-3-small"):
        self.pool = pool
        self.openai = AsyncOpenAI()
        self.embedding_model = embedding_model

    @staticmethod
    def _encode_metadata(metadata: dict) -> str:
        """Serialize metadata to JSON text; ``None`` becomes an empty object."""
        return json.dumps(metadata if metadata is not None else {})

    @staticmethod
    def _decode_metadata(raw) -> dict:
        """Turn a fetched ``metadata`` value into a plain dict."""
        if raw is None:
            return {}
        if isinstance(raw, (str, bytes, bytearray)):
            # Default asyncpg behaviour: jsonb arrives as JSON text.
            return json.loads(raw)
        # A jsonb codec is installed on the connection: already a mapping.
        return dict(raw)

    async def setup(self):
        """Register the pgvector type codec on one connection from the pool.

        NOTE(review): type codecs are registered per connection, so this
        only affects the connection acquired here. For pools holding more
        than one connection, register the codec in the ``init`` callback
        passed to ``asyncpg.create_pool`` instead.
        """
        async with self.pool.acquire() as conn:
            await register_vector(conn)

    async def embed(self, texts: list[str]) -> list[np.ndarray]:
        """Return one embedding array per input text, in input order."""
        if not texts:
            # Avoid an API round trip (and an API error) for empty input.
            return []
        response = await self.openai.embeddings.create(model=self.embedding_model, input=texts)
        return [np.array(e.embedding) for e in response.data]

    async def add_documents(self, contents: list[str], metadatas: list[dict], tenant_id: str) -> list[str]:
        """Embed and insert documents for a tenant.

        Args:
            contents: Document texts.
            metadatas: One metadata dict per text, same order as ``contents``.
            tenant_id: Tenant the rows belong to.

        Returns:
            The generated row ids as strings, in input order.

        Raises:
            ValueError: If ``contents`` and ``metadatas`` differ in length
                (previously the extra items were silently dropped by ``zip``).
        """
        if len(contents) != len(metadatas):
            raise ValueError(
                f"contents and metadatas must have the same length "
                f"({len(contents)} != {len(metadatas)})"
            )
        if not contents:
            return []

        embeddings = await self.embed(contents)
        ids = []
        async with self.pool.acquire() as conn:
            # One transaction per call so a failure cannot leave a
            # half-inserted batch behind.
            async with conn.transaction():
                for content, embedding, metadata in zip(contents, embeddings, metadatas):
                    row = await conn.fetchrow(
                        "INSERT INTO documents (content, embedding, metadata, tenant_id) "
                        "VALUES ($1, $2, $3::text::jsonb, $4) RETURNING id",
                        content, embedding, self._encode_metadata(metadata), tenant_id
                    )
                    ids.append(str(row['id']))
        return ids

    async def similarity_search(self, query: str, tenant_id: str, k: int = 5, metadata_filter: dict = None):
        """Return the ``k`` nearest documents of a tenant by cosine distance.

        Args:
            query: Text to embed and search for.
            tenant_id: Only rows of this tenant are considered.
            k: Maximum number of rows to return.
            metadata_filter: Optional JSON containment filter (``metadata @> filter``).

        Returns:
            A list of dicts with ``id``, ``content``, ``metadata`` and
            ``similarity`` (``1 - cosine distance``), nearest first.
        """
        query_embedding = (await self.embed([query]))[0]

        sql = """
            SELECT id, content, metadata, 1 - (embedding <=> $1) AS similarity
            FROM documents WHERE tenant_id = $2
        """
        params = [query_embedding, tenant_id]

        if metadata_filter:
            params.append(self._encode_metadata(metadata_filter))
            # Placeholder index is derived from params so it cannot drift.
            sql += f" AND metadata @> ${len(params)}::text::jsonb"

        params.append(k)
        sql += f" ORDER BY embedding <=> $1 LIMIT ${len(params)}"

        async with self.pool.acquire() as conn:
            rows = await conn.fetch(sql, *params)

        return [
            {
                "id": str(r['id']),
                "content": r['content'],
                "metadata": self._decode_metadata(r['metadata']),
                "similarity": float(r['similarity']),
            }
            for r in rows
        ]

    async def hybrid_search(self, query: str, tenant_id: str, k: int = 5, keyword_weight: float = 0.3):
        """Re-rank vector search candidates with a full-text keyword score.

        The ``2 * k`` nearest rows by cosine distance are scored as
        ``(1 - keyword_weight) * vector_score + keyword_weight * keyword_score``
        and the best ``k`` are returned. Rows that match only by keyword are
        not candidates (the keyword CTE is LEFT JOINed onto the vector CTE).

        Returns:
            A list of dicts with ``id``, ``content``, ``metadata`` and ``score``.
        """
        query_embedding = (await self.embed([query]))[0]

        # $5 is cast to float8 explicitly: in ``1 - $5`` PostgreSQL would
        # otherwise infer an integer parameter and reject a float weight.
        sql = """
            WITH vector_search AS (
                SELECT id, content, metadata, 1 - (embedding <=> $1) AS vector_score
                FROM documents WHERE tenant_id = $2
                ORDER BY embedding <=> $1 LIMIT $3 * 2
            ),
            keyword_search AS (
                SELECT id, ts_rank(to_tsvector('english', content), plainto_tsquery('english', $4)) AS keyword_score
                FROM documents WHERE tenant_id = $2 AND to_tsvector('english', content) @@ plainto_tsquery('english', $4)
            )
            SELECT v.id, v.content, v.metadata,
                   (1 - $5::float8) * v.vector_score + $5::float8 * COALESCE(k.keyword_score, 0) AS combined_score
            FROM vector_search v LEFT JOIN keyword_search k ON v.id = k.id
            ORDER BY combined_score DESC LIMIT $3
        """

        async with self.pool.acquire() as conn:
            rows = await conn.fetch(sql, query_embedding, tenant_id, k, query, keyword_weight)

        return [
            {
                "id": str(r['id']),
                "content": r['content'],
                "metadata": self._decode_metadata(r['metadata']),
                "score": float(r['combined_score']),
            }
            for r in rows
        ]
|
|
184
|
-
```
|
|
185
|
-
|
|
186
|
-
## Document Chunking
|
|
187
|
-
|
|
188
|
-
```python
|
|
189
|
-
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
|
|
190
|
-
|
|
191
|
-
def chunk_documents(documents: list[dict], chunk_size: int = 1000, chunk_overlap: int = 200) -> list[dict]:
    """Split each document's ``content`` into overlapping text chunks.

    Args:
        documents: Dicts with a ``content`` key and optional ``metadata``
            and ``id`` keys.
        chunk_size: Maximum chunk size passed to the splitter.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        One dict per chunk with ``content`` and ``metadata``; the metadata is
        the document's metadata plus ``source_id`` (the document ``id``) and
        ``chunk_index`` (position of the chunk within its document).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    return [
        {
            "content": piece,
            "metadata": {**doc.get("metadata", {}), "source_id": doc.get("id"), "chunk_index": position},
        }
        for doc in documents
        for position, piece in enumerate(text_splitter.split_text(doc["content"]))
    ]
|
|
206
|
-
|
|
207
|
-
def chunk_markdown(content: str, source_metadata: dict = None, chunk_size: int = 1000, chunk_overlap: int = 100) -> list[dict]:
    """Split Markdown by headers, then by size where a section is too long.

    Args:
        content: Markdown text.
        source_metadata: Metadata merged into every chunk; header metadata
            from the splitter (``h1``/``h2``/``h3``) overrides equal keys.
        chunk_size: Sections longer than this many characters are split
            further, using this value as the splitter's chunk size.
        chunk_overlap: Overlap used when a section is split further.

    Returns:
        One dict per chunk with ``content`` and ``metadata``; chunks that come
        from an oversized section also carry a ``sub_chunk`` index.
    """
    header_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[("#", "h1"), ("##", "h2"), ("###", "h3")]
    )
    # The same chunk_size drives both the splitter and the threshold below,
    # so the two can no longer get out of sync.
    size_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    chunks = []
    for split in header_splitter.split_text(content):
        text, metadata = split.page_content, {**(source_metadata or {}), **split.metadata}
        if len(text) > chunk_size:
            chunks.extend(
                {"content": sub_chunk, "metadata": {**metadata, "sub_chunk": i}}
                for i, sub_chunk in enumerate(size_splitter.split_text(text))
            )
        else:
            chunks.append({"content": text, "metadata": metadata})
    return chunks
|
|
222
|
-
```
|
|
223
|
-
|
|
224
|
-
## RAG Chain
|
|
225
|
-
|
|
226
|
-
```python
|
|
227
|
-
from openai import AsyncOpenAI
|
|
228
|
-
|
|
229
|
-
class RAGChain:
    """Retrieval-augmented question answering over a ``PgVectorStore``."""

    def __init__(self, vector_store: PgVectorStore, model: str = "gpt-4-turbo-preview", k: int = 5):
        """Store the vector store, chat model name and retrieval depth ``k``."""
        self.vector_store = vector_store
        self.openai = AsyncOpenAI()
        self.model = model
        self.k = k

    async def query(self, question: str, tenant_id: str, system_prompt: str = None, metadata_filter: dict = None):
        """Answer ``question`` from the tenant's most similar documents.

        Retrieves up to ``self.k`` documents with ``similarity_search``,
        joins them into a context block and sends it with the question to
        the chat completions API at temperature 0.

        Returns:
            A dict with ``answer`` (the model's reply), ``sources`` (the
            ``source`` metadata value of each retrieved document, ``None``
            when absent) and ``docs`` (the retrieved documents).
        """
        retrieved = await self.vector_store.similarity_search(
            query=question,
            tenant_id=tenant_id,
            k=self.k,
            metadata_filter=metadata_filter,
        )

        # One labelled section per retrieved document.
        sections = []
        for doc in retrieved:
            label = doc['metadata'].get('source', 'Unknown')
            sections.append(f"[Source: {label}]\n{doc['content']}")
        context = "\n\n---\n\n".join(sections)

        if system_prompt is None:
            system_prompt = "Answer based on the provided context. If not in context, say so. Cite sources."

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"},
        ]
        response = await self.openai.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=0,
        )

        answer = response.choices[0].message.content
        sources = [doc['metadata'].get('source') for doc in retrieved]
        return {"answer": answer, "sources": sources, "docs": retrieved}
|
|
260
|
-
```
|
|
261
|
-
|
|
262
|
-
## Indexing Pipeline
|
|
263
|
-
|
|
264
|
-
```python
|
|
265
|
-
class IndexingPipeline:
    """Chunk documents and write them to a ``PgVectorStore`` in batches."""

    def __init__(self, vector_store: PgVectorStore, chunk_size: int = 1000, batch_size: int = 100):
        """Store the target vector store, the chunk size and the batch size."""
        self.vector_store = vector_store
        self.chunk_size = chunk_size
        self.batch_size = batch_size

    async def index_documents(self, documents: list[dict], tenant_id: str) -> int:
        """Chunk ``documents`` and add the chunks to the store for a tenant.

        Chunks are sent ``self.batch_size`` at a time and a progress line is
        printed after each batch.

        Returns:
            The number of chunks indexed.
        """
        chunks = chunk_documents(documents, chunk_size=self.chunk_size)
        indexed = 0

        for start in range(0, len(chunks), self.batch_size):
            current = chunks[start:start + self.batch_size]
            texts = [item['content'] for item in current]
            metas = [item['metadata'] for item in current]
            await self.vector_store.add_documents(contents=texts, metadatas=metas, tenant_id=tenant_id)
            indexed += len(current)
            print(f"Indexed {indexed}/{len(chunks)}")

        return indexed
|
|
286
|
-
```
|
|
287
|
-
|
|
288
|
-
## Best Practices
|
|
289
|
-
|
|
290
|
-
1. **Chunk size** - Technical docs: 1000-1500 characters, FAQs: 300-500 characters
|
|
291
|
-
2. **Include overlap** - 10-20% overlap for context continuity
|
|
292
|
-
3. **Use metadata** - Filter by type, source, date
|
|
293
|
-
4. **Hybrid search** - Combine vector + keyword for better recall
|
|
294
|
-
5. **Re-rank results** - Cross-encoder for precision: `cross-encoder/ms-marco-MiniLM-L-6-v2`
|
|
295
|
-
|
|
296
|
-
## Related Skills
|
|
297
|
-
|
|
298
|
-
- `langchain`: RAG implementation
|
|
299
|
-
- `ai-ml`: Embedding generation
|
|
300
|
-
- `fastapi`: Vector search API
|
|
301
|
-
- `redis-cache`: Hybrid search caching
|