repotoire 0.1.2-cp313-cp313-win_amd64.whl → 0.1.4-cp313-cp313-win_amd64.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as published.
- repotoire/__init__.py +2 -2
- repotoire/ai/__init__.py +23 -0
- repotoire/ai/compression.py +543 -0
- repotoire/ai/embeddings.py +315 -12
- repotoire/ai/retrieval.py +273 -56
- repotoire/ai/vector_store.py +549 -0
- repotoire/api/app.py +502 -31
- repotoire/api/auth/__init__.py +26 -6
- repotoire/api/docs/webhooks.py +2 -0
- repotoire/api/models.py +60 -4
- repotoire/api/services/__init__.py +13 -0
- repotoire/api/services/asset_storage.py +31 -38
- repotoire/api/services/cloud_storage.py +4 -50
- repotoire/api/services/github.py +8 -5
- repotoire/api/services/narrative.py +441 -0
- repotoire/api/services/notifications.py +415 -171
- repotoire/api/services/status_emails.py +1 -1
- repotoire/api/services/stripe_service.py +6 -6
- repotoire/api/shared/auth/clerk.py +90 -11
- repotoire/api/shared/auth/state_store.py +40 -13
- repotoire/api/shared/docs/webhooks.py +2 -0
- repotoire/api/shared/helpers/__init__.py +12 -0
- repotoire/api/shared/helpers/errors.py +575 -0
- repotoire/api/shared/helpers/user.py +83 -0
- repotoire/api/shared/middleware/__init__.py +70 -0
- repotoire/api/shared/middleware/csrf.py +176 -0
- repotoire/api/shared/middleware/idempotency.py +285 -0
- repotoire/api/shared/middleware/rate_limit.py +562 -0
- repotoire/api/shared/middleware/security_headers.py +184 -0
- repotoire/api/shared/middleware/tenant.py +315 -0
- repotoire/api/shared/middleware/usage.py +76 -32
- repotoire/api/shared/services/__init__.py +32 -2
- repotoire/api/shared/services/billing.py +130 -6
- repotoire/api/shared/services/circuit_breaker.py +326 -0
- repotoire/api/shared/services/github.py +829 -66
- repotoire/api/shared/services/s3_client.py +237 -0
- repotoire/api/shared/services/stripe_service.py +217 -310
- repotoire/api/v1/__init__.py +42 -0
- repotoire/api/v1/routes/__init__.py +16 -0
- repotoire/api/v1/routes/admin/overrides.py +147 -7
- repotoire/api/v1/routes/analysis.py +419 -20
- repotoire/api/v1/routes/analytics.py +311 -5
- repotoire/api/v1/routes/billing.py +77 -378
- repotoire/api/v1/routes/cli_auth.py +200 -30
- repotoire/api/v1/routes/code.py +830 -88
- repotoire/api/v1/routes/customer_webhooks.py +4 -4
- repotoire/api/v1/routes/detector_settings.py +443 -0
- repotoire/api/v1/routes/findings.py +494 -29
- repotoire/api/v1/routes/fixes.py +886 -33
- repotoire/api/v1/routes/github.py +400 -40
- repotoire/api/v1/routes/graph.py +546 -0
- repotoire/api/v1/routes/historical.py +478 -221
- repotoire/api/v1/routes/marketplace.py +36 -16
- repotoire/api/v1/routes/monorepo.py +605 -0
- repotoire/api/v1/routes/narratives.py +581 -0
- repotoire/api/v1/routes/notifications.py +295 -7
- repotoire/api/v1/routes/organizations.py +333 -9
- repotoire/api/v1/routes/preferences.py +232 -0
- repotoire/api/v1/routes/provenance_settings.py +185 -0
- repotoire/api/v1/routes/rules.py +734 -0
- repotoire/api/v1/routes/sandbox.py +156 -13
- repotoire/api/v1/routes/security.py +543 -0
- repotoire/api/v1/routes/status.py +2 -1
- repotoire/api/v1/routes/team.py +24 -11
- repotoire/api/v1/routes/webhooks.py +708 -49
- repotoire/autofix/applicator.py +94 -51
- repotoire/autofix/best_of_n.py +8 -13
- repotoire/autofix/engine.py +360 -61
- repotoire/autofix/entitlements.py +25 -90
- repotoire/autofix/learning/store.py +21 -7
- repotoire/autofix/verifier.py +4 -1
- repotoire/cache/__init__.py +76 -20
- repotoire/cache/skill.py +166 -34
- repotoire/cli/__init__.py +1027 -1241
- repotoire/cli/api_keys.py +14 -2
- repotoire/cli/auth.py +74 -5
- repotoire/cli/auth_commands.py +60 -17
- repotoire/cli/graph.py +32 -126
- repotoire/cli/historical.py +548 -0
- repotoire/cli/ml.py +28 -28
- repotoire/cli/monorepo.py +4 -16
- repotoire/cli/org_commands.py +284 -0
- repotoire/cli/security.py +7 -48
- repotoire/cli/tier_limits.py +59 -22
- repotoire/config.py +1324 -115
- repotoire/db/models/__init__.py +19 -1
- repotoire/db/models/analysis.py +6 -0
- repotoire/db/models/billing.py +59 -2
- repotoire/db/models/cli_token.py +202 -0
- repotoire/db/models/detector_settings.py +223 -0
- repotoire/db/models/email.py +5 -0
- repotoire/db/models/finding.py +64 -1
- repotoire/db/models/fix.py +8 -0
- repotoire/db/models/notification.py +113 -0
- repotoire/db/models/organization.py +20 -1
- repotoire/db/models/provenance_settings.py +77 -0
- repotoire/db/models/quota_override.py +5 -5
- repotoire/db/models/user.py +26 -0
- repotoire/db/models/user_preferences.py +94 -0
- repotoire/db/repositories/fix.py +200 -3
- repotoire/db/repositories/quota_override.py +10 -2
- repotoire/db/session.py +143 -7
- repotoire/detectors/__init__.py +19 -0
- repotoire/detectors/architectural_bottleneck.py +15 -6
- repotoire/detectors/async_antipattern.py +4 -4
- repotoire/detectors/bandit_detector.py +57 -74
- repotoire/detectors/base.py +201 -7
- repotoire/detectors/circular_dependency.py +68 -11
- repotoire/detectors/core_utility.py +9 -6
- repotoire/detectors/data_clumps.py +51 -10
- repotoire/detectors/dead_code.py +399 -138
- repotoire/detectors/deduplicator.py +13 -21
- repotoire/detectors/degree_centrality.py +9 -6
- repotoire/detectors/duplicate_rust.py +4 -4
- repotoire/detectors/engine.py +572 -80
- repotoire/detectors/eslint_detector.py +558 -0
- repotoire/detectors/external_tool_runner.py +597 -0
- repotoire/detectors/feature_envy.py +14 -17
- repotoire/detectors/generator_misuse.py +4 -4
- repotoire/detectors/god_class.py +118 -53
- repotoire/detectors/graph_algorithms.py +249 -106
- repotoire/detectors/graphsage_detector.py +2 -2
- repotoire/detectors/grouping.py +264 -0
- repotoire/detectors/health_delta.py +489 -0
- repotoire/detectors/inappropriate_intimacy.py +16 -6
- repotoire/detectors/infinite_loop_detector.py +459 -0
- repotoire/detectors/influential_code.py +25 -17
- repotoire/detectors/jscpd_detector.py +67 -39
- repotoire/detectors/lazy_class.py +13 -11
- repotoire/detectors/long_parameter_list.py +9 -6
- repotoire/detectors/message_chain.py +9 -7
- repotoire/detectors/middle_man.py +14 -10
- repotoire/detectors/ml_bug_detector.py +2 -2
- repotoire/detectors/module_cohesion.py +9 -6
- repotoire/detectors/multimodal_detector.py +6 -6
- repotoire/detectors/mypy_detector.py +64 -76
- repotoire/detectors/npm_audit_detector.py +450 -0
- repotoire/detectors/pylint_detector.py +152 -245
- repotoire/detectors/radon_detector.py +172 -17
- repotoire/detectors/refused_bequest.py +12 -9
- repotoire/detectors/ruff_import_detector.py +32 -10
- repotoire/detectors/ruff_lint_detector.py +58 -73
- repotoire/detectors/rust_graph_detectors.py +1227 -0
- repotoire/detectors/satd_detector.py +36 -4
- repotoire/detectors/semgrep_detector.py +64 -74
- repotoire/detectors/shotgun_surgery.py +9 -5
- repotoire/detectors/taint_detector.py +36 -4
- repotoire/detectors/temporal_metrics.py +3 -3
- repotoire/detectors/test_smell.py +4 -4
- repotoire/detectors/truly_unused_imports.py +151 -15
- repotoire/detectors/tsc_detector.py +488 -0
- repotoire/detectors/type_hint_coverage.py +4 -6
- repotoire/detectors/voting_engine.py +158 -83
- repotoire/detectors/vulture_detector.py +75 -66
- repotoire/github/pr_analyzer.py +11 -11
- repotoire/graph/__init__.py +7 -10
- repotoire/graph/cloud_client.py +376 -0
- repotoire/graph/enricher.py +6 -6
- repotoire/graph/factory.py +156 -64
- repotoire/graph/falkordb_client.py +769 -84
- repotoire/graph/incremental_scc.py +59 -37
- repotoire/graph/queries/builders.py +198 -4
- repotoire/graph/queries/patterns.py +52 -6
- repotoire/graph/queries/traversal.py +450 -10
- repotoire/graph/schema.py +236 -21
- repotoire/graph/tenant_factory.py +150 -193
- repotoire/historical/__init__.py +11 -5
- repotoire/historical/git_extractor.py +181 -0
- repotoire/historical/git_rag.py +1210 -0
- repotoire/historical/timescale_client.py +29 -13
- repotoire/hooks/pre_commit.py +16 -12
- repotoire/http_client.py +532 -0
- repotoire/mcp/api_server.py +342 -9
- repotoire/mcp/execution_env.py +50 -17
- repotoire/mcp/pattern_detector.py +4 -4
- repotoire/mcp/resources.py +5 -5
- repotoire/mcp/schema_generator.py +9 -9
- repotoire/mcp/server_generator.py +39 -45
- repotoire/migrations/001_initial_schema.py +59 -7
- repotoire/migrations/002_add_clue_nodes.py +4 -4
- repotoire/migrations/003_add_session_nodes.py +5 -5
- repotoire/migrations/__init__.py +1 -1
- repotoire/migrations/manager.py +3 -3
- repotoire/migrations/migration.py +7 -7
- repotoire/ml/__init__.py +4 -3
- repotoire/ml/bug_predictor.py +8 -8
- repotoire/ml/contrastive_learning.py +1 -1
- repotoire/ml/cross_project_trainer.py +1 -1
- repotoire/ml/graph_embeddings.py +63 -48
- repotoire/ml/graphsage_predictor.py +1 -1
- repotoire/ml/multimodal_analyzer.py +7 -7
- repotoire/ml/node2vec_embeddings.py +41 -22
- repotoire/ml/similarity.py +6 -5
- repotoire/ml/training_data.py +2 -2
- repotoire/models.py +386 -18
- repotoire/monorepo/analyzer.py +5 -5
- repotoire/parsers/__init__.py +38 -0
- repotoire/parsers/base_tree_sitter_parser.py +61 -5
- repotoire/parsers/generic_fallback_parser.py +777 -0
- repotoire/parsers/python_parser.py +344 -103
- repotoire/parsers/rust_parser.py +554 -0
- repotoire/parsers/tree_sitter_go.py +893 -0
- repotoire/parsers/tree_sitter_java.py +745 -0
- repotoire/parsers/tree_sitter_typescript.py +1134 -0
- repotoire/pipeline/__init__.py +2 -2
- repotoire/pipeline/ingestion.py +1153 -236
- repotoire/pipeline/temporal_ingestion.py +9 -9
- repotoire/reporters/__init__.py +14 -2
- repotoire/reporters/base_reporter.py +227 -0
- repotoire/reporters/excel_reporter.py +329 -0
- repotoire/reporters/html_reporter.py +113 -46
- repotoire/reporters/markdown_reporter.py +344 -0
- repotoire/reporters/pdf_reporter.py +432 -0
- repotoire/reporters/sarif_reporter.py +570 -0
- repotoire/rules/daemon.py +15 -9
- repotoire/rules/engine.py +2 -2
- repotoire/rules/validator.py +3 -3
- repotoire/sandbox/__init__.py +15 -0
- repotoire/sandbox/billing.py +493 -0
- repotoire/sandbox/code_validator.py +10 -2
- repotoire/sandbox/config.py +78 -8
- repotoire/sandbox/enforcement.py +40 -12
- repotoire/sandbox/metrics.py +15 -4
- repotoire/sandbox/session_tracker.py +70 -17
- repotoire/sandbox/skill_executor.py +3 -0
- repotoire/sandbox/tool_executor.py +643 -39
- repotoire/sandbox/usage.py +23 -17
- repotoire/security/dependency_scanner.py +11 -5
- repotoire/services/audit.py +4 -4
- repotoire/tenant/__init__.py +86 -0
- repotoire/tenant/context.py +288 -0
- repotoire/tenant/logging.py +132 -0
- repotoire/tenant/resolver.py +253 -0
- repotoire/utils/encryption.py +91 -0
- repotoire/validation.py +465 -88
- repotoire/workers/analytics_tasks.py +5 -5
- repotoire/workers/celery_app.py +16 -3
- repotoire/workers/cleanup.py +117 -0
- repotoire/workers/hooks.py +492 -47
- repotoire/workers/limits.py +33 -6
- repotoire/workers/progress.py +26 -3
- repotoire/workers/tasks.py +377 -44
- {repotoire-0.1.2.dist-info → repotoire-0.1.4.dist-info}/METADATA +39 -44
- repotoire-0.1.4.dist-info/RECORD +406 -0
- {repotoire-0.1.2.dist-info → repotoire-0.1.4.dist-info}/WHEEL +1 -1
- repotoire_fast/__init__.py +116 -0
- repotoire_fast/repotoire_fast.cp313-win_amd64.pyd +0 -0
- repotoire/api/auth/clerk.py +0 -192
- repotoire/api/auth/state_store.py +0 -283
- repotoire/api/middleware/usage.py +0 -165
- repotoire/api/routes/__init__.py +0 -39
- repotoire/api/routes/account.py +0 -601
- repotoire/api/routes/admin/__init__.py +0 -5
- repotoire/api/routes/admin/overrides.py +0 -376
- repotoire/api/routes/analysis.py +0 -823
- repotoire/api/routes/analytics.py +0 -563
- repotoire/api/routes/audit.py +0 -427
- repotoire/api/routes/billing.py +0 -600
- repotoire/api/routes/cli_auth.py +0 -449
- repotoire/api/routes/code.py +0 -363
- repotoire/api/routes/customer_webhooks.py +0 -733
- repotoire/api/routes/findings.py +0 -651
- repotoire/api/routes/fixes.py +0 -1389
- repotoire/api/routes/github.py +0 -1309
- repotoire/api/routes/historical.py +0 -338
- repotoire/api/routes/notifications.py +0 -203
- repotoire/api/routes/organizations.py +0 -703
- repotoire/api/routes/sandbox.py +0 -699
- repotoire/api/routes/team.py +0 -408
- repotoire/api/routes/usage.py +0 -228
- repotoire/api/routes/webhooks.py +0 -984
- repotoire/graph/client.py +0 -1009
- repotoire/graph/neo4j_multitenant.py +0 -380
- repotoire/historical/git_graphiti.py +0 -350
- repotoire-0.1.2.dist-info/RECORD +0 -376
- {repotoire-0.1.2.dist-info → repotoire-0.1.4.dist-info}/entry_points.txt +2 -2
- {repotoire-0.1.2.dist-info → repotoire-0.1.4.dist-info}/licenses/LICENSE +0 -0
repotoire/__init__.py
CHANGED
@@ -5,10 +5,10 @@ Analyzes codebases using knowledge graphs to detect code smells,
 architectural issues, and technical debt.
 """

-__version__ = "0.1.2"
+__version__ = "0.1.4"

 from repotoire.pipeline import IngestionPipeline
-from repotoire.graph import
+from repotoire.graph import FalkorDBClient
 from repotoire.detectors import AnalysisEngine
 from repotoire.models import CodebaseHealth, Finding

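With repotoire/graph/client.py and repotoire/graph/neo4j_multitenant.py removed in this release, FalkorDBClient becomes the graph client exported at the top level. A minimal import sketch for 0.1.4, restricted to names this diff confirms are importable:

    # Top-level API surface after the 0.1.2 → 0.1.4 bump (per the diff above)
    from repotoire import IngestionPipeline, AnalysisEngine, CodebaseHealth, Finding
    from repotoire.graph import FalkorDBClient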
repotoire/ai/__init__.py
CHANGED
@@ -6,6 +6,11 @@ from repotoire.ai.embeddings import (
     EmbeddingConfig,
     EmbeddingBackend,
     create_embedder,
+    # Int8 quantization for memory-efficient storage
+    quantize_embedding,
+    dequantize_embedding,
+    quantize_embeddings_batch,
+    compute_cosine_similarity_quantized,
 )
 from repotoire.ai.retrieval import (
     GraphRAGRetriever,
@@ -43,6 +48,13 @@ from repotoire.ai.contextual import (
     ContextGenerationResult,
     create_context_generator,
 )
+from repotoire.ai.compression import (
+    EmbeddingCompressor,
+    TenantCompressor,
+    create_compressor,
+    estimate_memory_savings,
+    DEFAULT_TARGET_DIMS,
+)

 __all__ = [
     # NLP
@@ -52,6 +64,11 @@ __all__ = [
     "EmbeddingConfig",
     "EmbeddingBackend",
     "create_embedder",
+    # Int8 quantization (4x memory reduction)
+    "quantize_embedding",
+    "dequantize_embedding",
+    "quantize_embeddings_batch",
+    "compute_cosine_similarity_quantized",
     # Retrieval
     "GraphRAGRetriever",
     "RetrievalResult",
@@ -83,4 +100,10 @@ __all__ = [
     "CostLimitExceeded",
     "ContextGenerationResult",
     "create_context_generator",
+    # Compression (memory optimization)
+    "EmbeddingCompressor",
+    "TenantCompressor",
+    "create_compressor",
+    "estimate_memory_savings",
+    "DEFAULT_TARGET_DIMS",
 ]
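The compression names are re-exported at the repotoire.ai package level. A short sizing sketch using estimate_memory_savings, whose signature and return keys appear in the compression.py diff below (the new quantize_* helpers live in embeddings.py, whose body is not shown in this diff, so they are left out); the printed values follow from the function's own arithmetic:

    from repotoire.ai import estimate_memory_savings

    # 100,000 embedded entities at the shipped defaults (4096 -> 1024 dims, int8)
    stats = estimate_memory_savings(num_entities=100_000)
    print(f"{stats['original_mb']:.1f} MB")      # 1562.5 MB of float32 embeddings
    print(f"{stats['compressed_mb']:.1f} MB")    # 97.7 MB after PCA + int8
    print(f"{stats['compression_ratio']:.0f}x")  # 16x at the 1024-dim default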
repotoire/ai/compression.py
ADDED
@@ -0,0 +1,543 @@
+"""Embedding compression for memory-efficient storage.
+
+Implements PCA dimensionality reduction + int8 quantization for 8x compression
+with <3% quality loss, based on research findings.
+
+Compression pipeline (example: 4096 source dims, target_dims=2048):
+1. PCA: 4096 → 2048 dimensions (2x reduction)
+2. int8 quantization: float32 → int8 (4x reduction)
+3. Combined: 8x total compression
+
+Storage: 4096 * 4 bytes = 16KB → 2048 * 1 byte = 2KB per embedding
+"""
+
+import os
+import json
+import pickle
+from pathlib import Path
+from typing import List, Optional, Tuple
+import numpy as np
+
+from repotoire.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+# Default compression settings
+DEFAULT_TARGET_DIMS = 1024  # PCA target dimensions (4x reduction from 4096)
+DEFAULT_QUANTIZATION_BITS = 8  # int8 quantization
+
+
+class EmbeddingCompressor:
+    """Compress embeddings using PCA + quantization.
+
+    Provides 8x compression with <3% quality loss on retrieval tasks.
+
+    Example:
+        >>> compressor = EmbeddingCompressor(target_dims=2048)
+        >>> # Fit on existing embeddings
+        >>> compressor.fit(existing_embeddings)
+        >>> # Compress new embeddings
+        >>> compressed = compressor.compress(new_embedding)
+        >>> # Decompress for similarity computation
+        >>> decompressed = compressor.decompress(compressed)
+    """
+
+    def __init__(
+        self,
+        target_dims: int = DEFAULT_TARGET_DIMS,
+        quantization_bits: int = DEFAULT_QUANTIZATION_BITS,
+        model_path: Optional[Path] = None,
+    ):
+        """Initialize compressor.
+
+        Args:
+            target_dims: Target dimensions after PCA (default: 1024)
+            quantization_bits: Bits for quantization (default: 8 for int8)
+            model_path: Path to save/load fitted PCA model
+        """
+        self.target_dims = target_dims
+        self.quantization_bits = quantization_bits
+        self.model_path = model_path or Path.home() / ".repotoire" / "pca_model.pkl"
+
+        # PCA components (fitted)
+        self._pca_components: Optional[np.ndarray] = None
+        self._pca_mean: Optional[np.ndarray] = None
+        self._source_dims: Optional[int] = None
+
+        # Quantization parameters (computed during fit)
+        self._scale: Optional[float] = None
+        self._zero_point: Optional[float] = None
+
+        # Try to load existing model
+        if self.model_path.exists():
+            self._load_model()
+
+    @property
+    def is_fitted(self) -> bool:
+        """Check if compressor has been fitted."""
+        return self._pca_components is not None
+
+    @property
+    def compression_ratio(self) -> float:
+        """Calculate compression ratio."""
+        if not self._source_dims:
+            return 1.0
+        # Original: source_dims * 4 bytes (float32)
+        # Compressed: target_dims * 1 byte (int8)
+        original_size = self._source_dims * 4
+        compressed_size = self.target_dims * 1
+        return original_size / compressed_size
+
+    def fit(
+        self,
+        embeddings: List[List[float]],
+        save: bool = True,
+    ) -> "EmbeddingCompressor":
+        """Fit PCA on a sample of embeddings.
+
+        Should be called with a representative sample of embeddings
+        (e.g., 1000-10000 embeddings) to learn the principal components.
+
+        Args:
+            embeddings: List of embedding vectors to fit on
+            save: Whether to save the fitted model to disk
+
+        Returns:
+            self for method chaining
+        """
+        if len(embeddings) < 100:
+            logger.warning(
+                f"Fitting PCA on only {len(embeddings)} samples. "
+                "Recommend at least 1000 for good quality."
+            )
+
+        # Convert to numpy array
+        X = np.array(embeddings, dtype=np.float32)
+        n_samples, n_features = X.shape
+        self._source_dims = n_features
+
+        # PCA requires n_components <= min(n_samples, n_features)
+        max_components = min(n_samples, n_features)
+        effective_target_dims = min(self.target_dims, max_components)
+
+        if effective_target_dims < self.target_dims:
+            logger.warning(
+                f"Reducing target_dims from {self.target_dims} to {effective_target_dims} "
+                f"(limited by {n_samples} samples)"
+            )
+            self.target_dims = effective_target_dims
+
+        logger.info(
+            f"Fitting PCA: {self._source_dims} dims → {self.target_dims} dims "
+            f"on {len(embeddings)} samples"
+        )
+
+        # Compute mean
+        self._pca_mean = np.mean(X, axis=0)
+
+        # Center the data
+        X_centered = X - self._pca_mean
+
+        # Fit PCA via SVD rather than an explicit covariance eigendecomposition
+        # (more memory-efficient for large dims and numerically stable)
+        try:
+            from sklearn.decomposition import PCA
+
+            # Use sklearn PCA for better numerical stability
+            pca = PCA(n_components=self.target_dims, svd_solver='randomized')
+            pca.fit(X)
+
+            self._pca_components = pca.components_.astype(np.float32)
+            self._pca_mean = pca.mean_.astype(np.float32)
+
+            explained_variance = sum(pca.explained_variance_ratio_)
+            logger.info(f"PCA explains {explained_variance:.1%} of variance")
+
+        except ImportError:
+            # Fallback to manual SVD if sklearn not available
+            logger.info("sklearn not available, using numpy SVD")
+            U, S, Vt = np.linalg.svd(X_centered, full_matrices=False)
+            self._pca_components = Vt[:self.target_dims].astype(np.float32)
+
+            # Compute explained variance
+            total_var = np.sum(S ** 2)
+            explained_var = np.sum(S[:self.target_dims] ** 2) / total_var
+            logger.info(f"PCA explains {explained_var:.1%} of variance")
+
+        # Compute quantization parameters from transformed data
+        X_transformed = self._pca_transform(X)
+        self._compute_quantization_params(X_transformed)
+
+        logger.info(
+            f"Compression ratio: {self.compression_ratio:.1f}x "
+            f"({self._source_dims * 4} bytes → {self.target_dims} bytes)"
+        )
+
+        if save:
+            self._save_model()
+
+        return self
+
+    def _pca_transform(self, X: np.ndarray) -> np.ndarray:
+        """Apply PCA transformation."""
+        X_centered = X - self._pca_mean
+        return X_centered @ self._pca_components.T
+
+    def _pca_inverse_transform(self, X_reduced: np.ndarray) -> np.ndarray:
+        """Inverse PCA transformation (approximate reconstruction)."""
+        return X_reduced @ self._pca_components + self._pca_mean
+
+    def _compute_quantization_params(self, X: np.ndarray) -> None:
+        """Compute scale and zero point for int8 quantization."""
+        # Use percentiles to be robust to outliers
+        min_val = np.percentile(X, 0.1)
+        max_val = np.percentile(X, 99.9)
+
+        # Affine (asymmetric) quantization: map [min_val, max_val] onto [0, 255]
+        self._scale = (max_val - min_val) / 255  # 256 levels for 8-bit codes
+        self._zero_point = min_val
+
+        logger.debug(f"Quantization params: scale={self._scale:.6f}, zero={self._zero_point:.6f}")
+
+    def _quantize(self, X: np.ndarray) -> np.ndarray:
+        """Quantize float32 to 8-bit codes (stored as uint8)."""
+        # Scale to [0, 255] range
+        X_scaled = (X - self._zero_point) / self._scale
+        # Clip and convert to uint8
+        X_quantized = np.clip(X_scaled, 0, 255).astype(np.uint8)
+        return X_quantized
+
+    def _dequantize(self, X_quantized: np.ndarray) -> np.ndarray:
+        """Dequantize 8-bit codes back to float32."""
+        return X_quantized.astype(np.float32) * self._scale + self._zero_point
+
+    def compress(self, embedding: List[float]) -> bytes:
+        """Compress a single embedding to bytes.
+
+        Args:
+            embedding: Original embedding vector (e.g., 4096 floats)
+
+        Returns:
+            Compressed embedding as bytes (target_dims bytes)
+        """
+        if not self.is_fitted:
+            raise RuntimeError("Compressor not fitted. Call fit() first.")
+
+        # Convert to numpy
+        X = np.array([embedding], dtype=np.float32)
+
+        # Apply PCA
+        X_reduced = self._pca_transform(X)
+
+        # Quantize to int8
+        X_quantized = self._quantize(X_reduced)
+
+        # Return as bytes
+        return X_quantized[0].tobytes()
+
+    def compress_batch(self, embeddings: List[List[float]]) -> List[bytes]:
+        """Compress multiple embeddings efficiently.
+
+        Args:
+            embeddings: List of embedding vectors
+
+        Returns:
+            List of compressed embeddings as bytes
+        """
+        if not self.is_fitted:
+            raise RuntimeError("Compressor not fitted. Call fit() first.")
+
+        # Convert to numpy
+        X = np.array(embeddings, dtype=np.float32)
+
+        # Apply PCA
+        X_reduced = self._pca_transform(X)
+
+        # Quantize to int8
+        X_quantized = self._quantize(X_reduced)
+
+        # Return as list of bytes
+        return [row.tobytes() for row in X_quantized]
+
+    def decompress(self, compressed: bytes) -> List[float]:
+        """Decompress bytes back to embedding vector.
+
+        Note: This is an approximate reconstruction due to PCA and quantization.
+
+        Args:
+            compressed: Compressed embedding bytes
+
+        Returns:
+            Reconstructed embedding vector (original dimensions)
+        """
+        if not self.is_fitted:
+            raise RuntimeError("Compressor not fitted. Call fit() first.")
+
+        # Convert bytes to numpy array
+        X_quantized = np.frombuffer(compressed, dtype=np.uint8).reshape(1, -1)
+
+        # Dequantize
+        X_reduced = self._dequantize(X_quantized)
+
+        # Inverse PCA (approximate reconstruction)
+        X_reconstructed = self._pca_inverse_transform(X_reduced)
+
+        return X_reconstructed[0].tolist()
+
+    def decompress_batch(self, compressed_list: List[bytes]) -> List[List[float]]:
+        """Decompress multiple embeddings efficiently.
+
+        Args:
+            compressed_list: List of compressed embedding bytes
+
+        Returns:
+            List of reconstructed embedding vectors
+        """
+        if not self.is_fitted:
+            raise RuntimeError("Compressor not fitted. Call fit() first.")
+
+        # Stack all compressed embeddings
+        X_quantized = np.array([
+            np.frombuffer(c, dtype=np.uint8) for c in compressed_list
+        ])
+
+        # Dequantize
+        X_reduced = self._dequantize(X_quantized)
+
+        # Inverse PCA
+        X_reconstructed = self._pca_inverse_transform(X_reduced)
+
+        return X_reconstructed.tolist()
+
+    def get_reduced_embedding(self, embedding: List[float]) -> List[float]:
+        """Get PCA-reduced embedding without quantization.
+
+        Useful when you want dimensionality reduction but need float precision
+        for vector similarity search.
+
+        Args:
+            embedding: Original embedding vector
+
+        Returns:
+            Reduced embedding (target_dims floats)
+        """
+        if not self.is_fitted:
+            raise RuntimeError("Compressor not fitted. Call fit() first.")
+
+        X = np.array([embedding], dtype=np.float32)
+        X_reduced = self._pca_transform(X)
+        return X_reduced[0].tolist()
+
+    def get_reduced_embeddings_batch(
+        self, embeddings: List[List[float]]
+    ) -> List[List[float]]:
+        """Get PCA-reduced embeddings for a batch.
+
+        Args:
+            embeddings: List of original embedding vectors
+
+        Returns:
+            List of reduced embeddings (target_dims floats each)
+        """
+        if not self.is_fitted:
+            raise RuntimeError("Compressor not fitted. Call fit() first.")
+
+        X = np.array(embeddings, dtype=np.float32)
+        X_reduced = self._pca_transform(X)
+        return X_reduced.tolist()
+
+    def _save_model(self) -> None:
+        """Save fitted PCA model to disk."""
+        self.model_path.parent.mkdir(parents=True, exist_ok=True)
+
+        model_data = {
+            "pca_components": self._pca_components,
+            "pca_mean": self._pca_mean,
+            "source_dims": self._source_dims,
+            "target_dims": self.target_dims,
+            "scale": self._scale,
+            "zero_point": self._zero_point,
+        }
+
+        with open(self.model_path, "wb") as f:
+            pickle.dump(model_data, f)
+
+        logger.info(f"Saved PCA model to {self.model_path}")
+
+    def _load_model(self) -> None:
+        """Load fitted PCA model from disk."""
+        try:
+            with open(self.model_path, "rb") as f:
+                model_data = pickle.load(f)
+
+            self._pca_components = model_data["pca_components"]
+            self._pca_mean = model_data["pca_mean"]
+            self._source_dims = model_data["source_dims"]
+            self.target_dims = model_data["target_dims"]
+            self._scale = model_data["scale"]
+            self._zero_point = model_data["zero_point"]
+
+            logger.info(
+                f"Loaded PCA model: {self._source_dims} → {self.target_dims} dims"
+            )
+        except Exception as e:
+            logger.warning(f"Could not load PCA model: {e}")
+
+
+class TenantCompressor:
+    """Per-tenant embedding compressor with model storage in cloud.
+
+    Each tenant can have their own PCA model fitted on their codebase,
+    allowing for better compression quality tailored to their code patterns.
+    """
+
+    def __init__(
+        self,
+        tenant_id: str,
+        storage_backend: str = "local",  # or "s3", "gcs"
+        target_dims: int = DEFAULT_TARGET_DIMS,
+    ):
+        """Initialize tenant-specific compressor.
+
+        Args:
+            tenant_id: Unique tenant identifier
+            storage_backend: Where to store PCA models
+            target_dims: Target dimensions after compression
+        """
+        self.tenant_id = tenant_id
+        self.storage_backend = storage_backend
+        self.target_dims = target_dims
+
+        # Model path includes tenant ID
+        model_dir = Path.home() / ".repotoire" / "compression_models"
+        self.model_path = model_dir / f"{tenant_id}_pca.pkl"
+
+        self._compressor = EmbeddingCompressor(
+            target_dims=target_dims,
+            model_path=self.model_path,
+        )
+
+    @property
+    def is_fitted(self) -> bool:
+        """Check if tenant compressor is fitted."""
+        return self._compressor.is_fitted
+
+    def fit_from_graph(
+        self,
+        graph_client,
+        sample_size: int = 5000,
+    ) -> "TenantCompressor":
+        """Fit compressor on embeddings from tenant's graph.
+
+        Args:
+            graph_client: FalkorDB client for the tenant
+            sample_size: Number of embeddings to sample for fitting
+
+        Returns:
+            self for method chaining
+        """
+        # Query embeddings from graph
+        # Note: FalkorDB uses labels() function for label checks instead of inline syntax
+        query = """
+        MATCH (n)
+        WHERE ('Function' IN labels(n) OR 'Class' IN labels(n) OR 'File' IN labels(n)) AND n.embedding IS NOT NULL
+        RETURN n.embedding as embedding
+        LIMIT $limit
+        """
+
+        results = graph_client.query(query, {"limit": sample_size})
+
+        if not results:
+            logger.warning(f"No embeddings found for tenant {self.tenant_id}")
+            return self
+
+        embeddings = [r["embedding"] for r in results if r.get("embedding")]
+
+        if len(embeddings) < 100:
+            logger.warning(
+                f"Only {len(embeddings)} embeddings for tenant {self.tenant_id}. "
+                "Recommend at least 100 for quality compression."
+            )
+
+        logger.info(f"Fitting compressor for tenant {self.tenant_id} on {len(embeddings)} embeddings")
+        self._compressor.fit(embeddings)
+
+        return self
+
+    def compress(self, embedding: List[float]) -> bytes:
+        """Compress embedding using tenant's model."""
+        return self._compressor.compress(embedding)
+
+    def compress_batch(self, embeddings: List[List[float]]) -> List[bytes]:
+        """Compress batch of embeddings."""
+        return self._compressor.compress_batch(embeddings)
+
+    def decompress(self, compressed: bytes) -> List[float]:
+        """Decompress embedding."""
+        return self._compressor.decompress(compressed)
+
+    def get_reduced_embedding(self, embedding: List[float]) -> List[float]:
+        """Get PCA-reduced embedding (float precision)."""
+        return self._compressor.get_reduced_embedding(embedding)
+
+    def get_reduced_embeddings_batch(
+        self, embeddings: List[List[float]]
+    ) -> List[List[float]]:
+        """Get PCA-reduced embeddings for batch."""
+        return self._compressor.get_reduced_embeddings_batch(embeddings)
+
+
+def create_compressor(
+    target_dims: int = DEFAULT_TARGET_DIMS,
+    model_path: Optional[Path] = None,
+) -> EmbeddingCompressor:
+    """Factory function to create an embedding compressor.
+
+    Args:
+        target_dims: Target dimensions after PCA
+        model_path: Path to save/load PCA model
+
+    Returns:
+        EmbeddingCompressor instance
+    """
+    return EmbeddingCompressor(
+        target_dims=target_dims,
+        model_path=model_path,
+    )
+
+
+def estimate_memory_savings(
+    num_entities: int,
+    source_dims: int = 4096,
+    target_dims: int = DEFAULT_TARGET_DIMS,
+) -> dict:
+    """Estimate memory savings from compression.
+
+    Args:
+        num_entities: Number of entities with embeddings
+        source_dims: Original embedding dimensions
+        target_dims: Target dimensions after compression
+
+    Returns:
+        Dictionary with memory estimates
+    """
+    # Original: float32 (4 bytes per dimension)
+    original_bytes = num_entities * source_dims * 4
+
+    # Compressed: int8 (1 byte per dimension)
+    compressed_bytes = num_entities * target_dims * 1
+
+    # PCA-reduced only (float32, no quantization)
+    reduced_bytes = num_entities * target_dims * 4
+
+    return {
+        "num_entities": num_entities,
+        "source_dims": source_dims,
+        "target_dims": target_dims,
+        "original_mb": original_bytes / (1024 * 1024),
+        "compressed_mb": compressed_bytes / (1024 * 1024),
+        "reduced_only_mb": reduced_bytes / (1024 * 1024),
+        "compression_ratio": original_bytes / compressed_bytes,
+        "savings_mb": (original_bytes - compressed_bytes) / (1024 * 1024),
+        "savings_percent": (1 - compressed_bytes / original_bytes) * 100,
+    }
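To see the new module end to end, here is a minimal round-trip sketch against the EmbeddingCompressor API added above. It assumes only numpy (scikit-learn is optional, matching the module's own fallback); the data is synthetic and low-rank so the PCA step is nearly lossless and the residual error comes from the uint8 quantization, and the small dimensions, model_path, and target_dims are illustrative choices, not shipped defaults:

    import numpy as np
    from pathlib import Path

    from repotoire.ai.compression import EmbeddingCompressor

    rng = np.random.default_rng(0)
    # Synthetic low-rank "embeddings": 1,000 vectors of dim 512 built from
    # 64 latent factors, so 128 PCA components capture almost all variance
    latent = rng.normal(size=(1000, 64)).astype(np.float32)
    basis = rng.normal(size=(64, 512)).astype(np.float32)
    sample = (latent @ basis).tolist()

    compressor = EmbeddingCompressor(target_dims=128, model_path=Path("demo_pca.pkl"))
    compressor.fit(sample, save=False)
    print(f"{compressor.compression_ratio:.1f}x")  # 16.0x: 512 * 4 bytes -> 128 * 1 byte

    vec = sample[0]
    blob = compressor.compress(vec)        # 128 bytes: one uint8 per reduced dimension
    approx = compressor.decompress(blob)   # back to 512 floats, approximately

    rel_err = np.linalg.norm(np.asarray(vec) - np.asarray(approx)) / np.linalg.norm(vec)
    print(len(blob), f"relative reconstruction error: {rel_err:.2%}")

Note that decompress returns an approximation: PCA discards components and the 8-bit codes discard precision, which is why the module also exposes get_reduced_embedding for float-precision similarity search on the reduced vectors.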