repotoire 0.1.2__cp313-cp313-win_amd64.whl → 0.1.4__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (277)
  1. repotoire/__init__.py +2 -2
  2. repotoire/ai/__init__.py +23 -0
  3. repotoire/ai/compression.py +543 -0
  4. repotoire/ai/embeddings.py +315 -12
  5. repotoire/ai/retrieval.py +273 -56
  6. repotoire/ai/vector_store.py +549 -0
  7. repotoire/api/app.py +502 -31
  8. repotoire/api/auth/__init__.py +26 -6
  9. repotoire/api/docs/webhooks.py +2 -0
  10. repotoire/api/models.py +60 -4
  11. repotoire/api/services/__init__.py +13 -0
  12. repotoire/api/services/asset_storage.py +31 -38
  13. repotoire/api/services/cloud_storage.py +4 -50
  14. repotoire/api/services/github.py +8 -5
  15. repotoire/api/services/narrative.py +441 -0
  16. repotoire/api/services/notifications.py +415 -171
  17. repotoire/api/services/status_emails.py +1 -1
  18. repotoire/api/services/stripe_service.py +6 -6
  19. repotoire/api/shared/auth/clerk.py +90 -11
  20. repotoire/api/shared/auth/state_store.py +40 -13
  21. repotoire/api/shared/docs/webhooks.py +2 -0
  22. repotoire/api/shared/helpers/__init__.py +12 -0
  23. repotoire/api/shared/helpers/errors.py +575 -0
  24. repotoire/api/shared/helpers/user.py +83 -0
  25. repotoire/api/shared/middleware/__init__.py +70 -0
  26. repotoire/api/shared/middleware/csrf.py +176 -0
  27. repotoire/api/shared/middleware/idempotency.py +285 -0
  28. repotoire/api/shared/middleware/rate_limit.py +562 -0
  29. repotoire/api/shared/middleware/security_headers.py +184 -0
  30. repotoire/api/shared/middleware/tenant.py +315 -0
  31. repotoire/api/shared/middleware/usage.py +76 -32
  32. repotoire/api/shared/services/__init__.py +32 -2
  33. repotoire/api/shared/services/billing.py +130 -6
  34. repotoire/api/shared/services/circuit_breaker.py +326 -0
  35. repotoire/api/shared/services/github.py +829 -66
  36. repotoire/api/shared/services/s3_client.py +237 -0
  37. repotoire/api/shared/services/stripe_service.py +217 -310
  38. repotoire/api/v1/__init__.py +42 -0
  39. repotoire/api/v1/routes/__init__.py +16 -0
  40. repotoire/api/v1/routes/admin/overrides.py +147 -7
  41. repotoire/api/v1/routes/analysis.py +419 -20
  42. repotoire/api/v1/routes/analytics.py +311 -5
  43. repotoire/api/v1/routes/billing.py +77 -378
  44. repotoire/api/v1/routes/cli_auth.py +200 -30
  45. repotoire/api/v1/routes/code.py +830 -88
  46. repotoire/api/v1/routes/customer_webhooks.py +4 -4
  47. repotoire/api/v1/routes/detector_settings.py +443 -0
  48. repotoire/api/v1/routes/findings.py +494 -29
  49. repotoire/api/v1/routes/fixes.py +886 -33
  50. repotoire/api/v1/routes/github.py +400 -40
  51. repotoire/api/v1/routes/graph.py +546 -0
  52. repotoire/api/v1/routes/historical.py +478 -221
  53. repotoire/api/v1/routes/marketplace.py +36 -16
  54. repotoire/api/v1/routes/monorepo.py +605 -0
  55. repotoire/api/v1/routes/narratives.py +581 -0
  56. repotoire/api/v1/routes/notifications.py +295 -7
  57. repotoire/api/v1/routes/organizations.py +333 -9
  58. repotoire/api/v1/routes/preferences.py +232 -0
  59. repotoire/api/v1/routes/provenance_settings.py +185 -0
  60. repotoire/api/v1/routes/rules.py +734 -0
  61. repotoire/api/v1/routes/sandbox.py +156 -13
  62. repotoire/api/v1/routes/security.py +543 -0
  63. repotoire/api/v1/routes/status.py +2 -1
  64. repotoire/api/v1/routes/team.py +24 -11
  65. repotoire/api/v1/routes/webhooks.py +708 -49
  66. repotoire/autofix/applicator.py +94 -51
  67. repotoire/autofix/best_of_n.py +8 -13
  68. repotoire/autofix/engine.py +360 -61
  69. repotoire/autofix/entitlements.py +25 -90
  70. repotoire/autofix/learning/store.py +21 -7
  71. repotoire/autofix/verifier.py +4 -1
  72. repotoire/cache/__init__.py +76 -20
  73. repotoire/cache/skill.py +166 -34
  74. repotoire/cli/__init__.py +1027 -1241
  75. repotoire/cli/api_keys.py +14 -2
  76. repotoire/cli/auth.py +74 -5
  77. repotoire/cli/auth_commands.py +60 -17
  78. repotoire/cli/graph.py +32 -126
  79. repotoire/cli/historical.py +548 -0
  80. repotoire/cli/ml.py +28 -28
  81. repotoire/cli/monorepo.py +4 -16
  82. repotoire/cli/org_commands.py +284 -0
  83. repotoire/cli/security.py +7 -48
  84. repotoire/cli/tier_limits.py +59 -22
  85. repotoire/config.py +1324 -115
  86. repotoire/db/models/__init__.py +19 -1
  87. repotoire/db/models/analysis.py +6 -0
  88. repotoire/db/models/billing.py +59 -2
  89. repotoire/db/models/cli_token.py +202 -0
  90. repotoire/db/models/detector_settings.py +223 -0
  91. repotoire/db/models/email.py +5 -0
  92. repotoire/db/models/finding.py +64 -1
  93. repotoire/db/models/fix.py +8 -0
  94. repotoire/db/models/notification.py +113 -0
  95. repotoire/db/models/organization.py +20 -1
  96. repotoire/db/models/provenance_settings.py +77 -0
  97. repotoire/db/models/quota_override.py +5 -5
  98. repotoire/db/models/user.py +26 -0
  99. repotoire/db/models/user_preferences.py +94 -0
  100. repotoire/db/repositories/fix.py +200 -3
  101. repotoire/db/repositories/quota_override.py +10 -2
  102. repotoire/db/session.py +143 -7
  103. repotoire/detectors/__init__.py +19 -0
  104. repotoire/detectors/architectural_bottleneck.py +15 -6
  105. repotoire/detectors/async_antipattern.py +4 -4
  106. repotoire/detectors/bandit_detector.py +57 -74
  107. repotoire/detectors/base.py +201 -7
  108. repotoire/detectors/circular_dependency.py +68 -11
  109. repotoire/detectors/core_utility.py +9 -6
  110. repotoire/detectors/data_clumps.py +51 -10
  111. repotoire/detectors/dead_code.py +399 -138
  112. repotoire/detectors/deduplicator.py +13 -21
  113. repotoire/detectors/degree_centrality.py +9 -6
  114. repotoire/detectors/duplicate_rust.py +4 -4
  115. repotoire/detectors/engine.py +572 -80
  116. repotoire/detectors/eslint_detector.py +558 -0
  117. repotoire/detectors/external_tool_runner.py +597 -0
  118. repotoire/detectors/feature_envy.py +14 -17
  119. repotoire/detectors/generator_misuse.py +4 -4
  120. repotoire/detectors/god_class.py +118 -53
  121. repotoire/detectors/graph_algorithms.py +249 -106
  122. repotoire/detectors/graphsage_detector.py +2 -2
  123. repotoire/detectors/grouping.py +264 -0
  124. repotoire/detectors/health_delta.py +489 -0
  125. repotoire/detectors/inappropriate_intimacy.py +16 -6
  126. repotoire/detectors/infinite_loop_detector.py +459 -0
  127. repotoire/detectors/influential_code.py +25 -17
  128. repotoire/detectors/jscpd_detector.py +67 -39
  129. repotoire/detectors/lazy_class.py +13 -11
  130. repotoire/detectors/long_parameter_list.py +9 -6
  131. repotoire/detectors/message_chain.py +9 -7
  132. repotoire/detectors/middle_man.py +14 -10
  133. repotoire/detectors/ml_bug_detector.py +2 -2
  134. repotoire/detectors/module_cohesion.py +9 -6
  135. repotoire/detectors/multimodal_detector.py +6 -6
  136. repotoire/detectors/mypy_detector.py +64 -76
  137. repotoire/detectors/npm_audit_detector.py +450 -0
  138. repotoire/detectors/pylint_detector.py +152 -245
  139. repotoire/detectors/radon_detector.py +172 -17
  140. repotoire/detectors/refused_bequest.py +12 -9
  141. repotoire/detectors/ruff_import_detector.py +32 -10
  142. repotoire/detectors/ruff_lint_detector.py +58 -73
  143. repotoire/detectors/rust_graph_detectors.py +1227 -0
  144. repotoire/detectors/satd_detector.py +36 -4
  145. repotoire/detectors/semgrep_detector.py +64 -74
  146. repotoire/detectors/shotgun_surgery.py +9 -5
  147. repotoire/detectors/taint_detector.py +36 -4
  148. repotoire/detectors/temporal_metrics.py +3 -3
  149. repotoire/detectors/test_smell.py +4 -4
  150. repotoire/detectors/truly_unused_imports.py +151 -15
  151. repotoire/detectors/tsc_detector.py +488 -0
  152. repotoire/detectors/type_hint_coverage.py +4 -6
  153. repotoire/detectors/voting_engine.py +158 -83
  154. repotoire/detectors/vulture_detector.py +75 -66
  155. repotoire/github/pr_analyzer.py +11 -11
  156. repotoire/graph/__init__.py +7 -10
  157. repotoire/graph/cloud_client.py +376 -0
  158. repotoire/graph/enricher.py +6 -6
  159. repotoire/graph/factory.py +156 -64
  160. repotoire/graph/falkordb_client.py +769 -84
  161. repotoire/graph/incremental_scc.py +59 -37
  162. repotoire/graph/queries/builders.py +198 -4
  163. repotoire/graph/queries/patterns.py +52 -6
  164. repotoire/graph/queries/traversal.py +450 -10
  165. repotoire/graph/schema.py +236 -21
  166. repotoire/graph/tenant_factory.py +150 -193
  167. repotoire/historical/__init__.py +11 -5
  168. repotoire/historical/git_extractor.py +181 -0
  169. repotoire/historical/git_rag.py +1210 -0
  170. repotoire/historical/timescale_client.py +29 -13
  171. repotoire/hooks/pre_commit.py +16 -12
  172. repotoire/http_client.py +532 -0
  173. repotoire/mcp/api_server.py +342 -9
  174. repotoire/mcp/execution_env.py +50 -17
  175. repotoire/mcp/pattern_detector.py +4 -4
  176. repotoire/mcp/resources.py +5 -5
  177. repotoire/mcp/schema_generator.py +9 -9
  178. repotoire/mcp/server_generator.py +39 -45
  179. repotoire/migrations/001_initial_schema.py +59 -7
  180. repotoire/migrations/002_add_clue_nodes.py +4 -4
  181. repotoire/migrations/003_add_session_nodes.py +5 -5
  182. repotoire/migrations/__init__.py +1 -1
  183. repotoire/migrations/manager.py +3 -3
  184. repotoire/migrations/migration.py +7 -7
  185. repotoire/ml/__init__.py +4 -3
  186. repotoire/ml/bug_predictor.py +8 -8
  187. repotoire/ml/contrastive_learning.py +1 -1
  188. repotoire/ml/cross_project_trainer.py +1 -1
  189. repotoire/ml/graph_embeddings.py +63 -48
  190. repotoire/ml/graphsage_predictor.py +1 -1
  191. repotoire/ml/multimodal_analyzer.py +7 -7
  192. repotoire/ml/node2vec_embeddings.py +41 -22
  193. repotoire/ml/similarity.py +6 -5
  194. repotoire/ml/training_data.py +2 -2
  195. repotoire/models.py +386 -18
  196. repotoire/monorepo/analyzer.py +5 -5
  197. repotoire/parsers/__init__.py +38 -0
  198. repotoire/parsers/base_tree_sitter_parser.py +61 -5
  199. repotoire/parsers/generic_fallback_parser.py +777 -0
  200. repotoire/parsers/python_parser.py +344 -103
  201. repotoire/parsers/rust_parser.py +554 -0
  202. repotoire/parsers/tree_sitter_go.py +893 -0
  203. repotoire/parsers/tree_sitter_java.py +745 -0
  204. repotoire/parsers/tree_sitter_typescript.py +1134 -0
  205. repotoire/pipeline/__init__.py +2 -2
  206. repotoire/pipeline/ingestion.py +1153 -236
  207. repotoire/pipeline/temporal_ingestion.py +9 -9
  208. repotoire/reporters/__init__.py +14 -2
  209. repotoire/reporters/base_reporter.py +227 -0
  210. repotoire/reporters/excel_reporter.py +329 -0
  211. repotoire/reporters/html_reporter.py +113 -46
  212. repotoire/reporters/markdown_reporter.py +344 -0
  213. repotoire/reporters/pdf_reporter.py +432 -0
  214. repotoire/reporters/sarif_reporter.py +570 -0
  215. repotoire/rules/daemon.py +15 -9
  216. repotoire/rules/engine.py +2 -2
  217. repotoire/rules/validator.py +3 -3
  218. repotoire/sandbox/__init__.py +15 -0
  219. repotoire/sandbox/billing.py +493 -0
  220. repotoire/sandbox/code_validator.py +10 -2
  221. repotoire/sandbox/config.py +78 -8
  222. repotoire/sandbox/enforcement.py +40 -12
  223. repotoire/sandbox/metrics.py +15 -4
  224. repotoire/sandbox/session_tracker.py +70 -17
  225. repotoire/sandbox/skill_executor.py +3 -0
  226. repotoire/sandbox/tool_executor.py +643 -39
  227. repotoire/sandbox/usage.py +23 -17
  228. repotoire/security/dependency_scanner.py +11 -5
  229. repotoire/services/audit.py +4 -4
  230. repotoire/tenant/__init__.py +86 -0
  231. repotoire/tenant/context.py +288 -0
  232. repotoire/tenant/logging.py +132 -0
  233. repotoire/tenant/resolver.py +253 -0
  234. repotoire/utils/encryption.py +91 -0
  235. repotoire/validation.py +465 -88
  236. repotoire/workers/analytics_tasks.py +5 -5
  237. repotoire/workers/celery_app.py +16 -3
  238. repotoire/workers/cleanup.py +117 -0
  239. repotoire/workers/hooks.py +492 -47
  240. repotoire/workers/limits.py +33 -6
  241. repotoire/workers/progress.py +26 -3
  242. repotoire/workers/tasks.py +377 -44
  243. {repotoire-0.1.2.dist-info → repotoire-0.1.4.dist-info}/METADATA +39 -44
  244. repotoire-0.1.4.dist-info/RECORD +406 -0
  245. {repotoire-0.1.2.dist-info → repotoire-0.1.4.dist-info}/WHEEL +1 -1
  246. repotoire_fast/__init__.py +116 -0
  247. repotoire_fast/repotoire_fast.cp313-win_amd64.pyd +0 -0
  248. repotoire/api/auth/clerk.py +0 -192
  249. repotoire/api/auth/state_store.py +0 -283
  250. repotoire/api/middleware/usage.py +0 -165
  251. repotoire/api/routes/__init__.py +0 -39
  252. repotoire/api/routes/account.py +0 -601
  253. repotoire/api/routes/admin/__init__.py +0 -5
  254. repotoire/api/routes/admin/overrides.py +0 -376
  255. repotoire/api/routes/analysis.py +0 -823
  256. repotoire/api/routes/analytics.py +0 -563
  257. repotoire/api/routes/audit.py +0 -427
  258. repotoire/api/routes/billing.py +0 -600
  259. repotoire/api/routes/cli_auth.py +0 -449
  260. repotoire/api/routes/code.py +0 -363
  261. repotoire/api/routes/customer_webhooks.py +0 -733
  262. repotoire/api/routes/findings.py +0 -651
  263. repotoire/api/routes/fixes.py +0 -1389
  264. repotoire/api/routes/github.py +0 -1309
  265. repotoire/api/routes/historical.py +0 -338
  266. repotoire/api/routes/notifications.py +0 -203
  267. repotoire/api/routes/organizations.py +0 -703
  268. repotoire/api/routes/sandbox.py +0 -699
  269. repotoire/api/routes/team.py +0 -408
  270. repotoire/api/routes/usage.py +0 -228
  271. repotoire/api/routes/webhooks.py +0 -984
  272. repotoire/graph/client.py +0 -1009
  273. repotoire/graph/neo4j_multitenant.py +0 -380
  274. repotoire/historical/git_graphiti.py +0 -350
  275. repotoire-0.1.2.dist-info/RECORD +0 -376
  276. {repotoire-0.1.2.dist-info → repotoire-0.1.4.dist-info}/entry_points.txt +2 -2
  277. {repotoire-0.1.2.dist-info → repotoire-0.1.4.dist-info}/licenses/LICENSE +0 -0
repotoire/__init__.py CHANGED
@@ -5,10 +5,10 @@ Analyzes codebases using knowledge graphs to detect code smells,
5
5
  architectural issues, and technical debt.
6
6
  """
7
7
 
8
- __version__ = "0.1.2"
8
+ __version__ = "0.1.4"
9
9
 
10
10
  from repotoire.pipeline import IngestionPipeline
11
- from repotoire.graph import Neo4jClient
11
+ from repotoire.graph import FalkorDBClient
12
12
  from repotoire.detectors import AnalysisEngine
13
13
  from repotoire.models import CodebaseHealth, Finding
14
14
 
repotoire/ai/__init__.py CHANGED
@@ -6,6 +6,11 @@ from repotoire.ai.embeddings import (
6
6
  EmbeddingConfig,
7
7
  EmbeddingBackend,
8
8
  create_embedder,
9
+ # Int8 quantization for memory-efficient storage
10
+ quantize_embedding,
11
+ dequantize_embedding,
12
+ quantize_embeddings_batch,
13
+ compute_cosine_similarity_quantized,
9
14
  )
10
15
  from repotoire.ai.retrieval import (
11
16
  GraphRAGRetriever,
@@ -43,6 +48,13 @@ from repotoire.ai.contextual import (
43
48
  ContextGenerationResult,
44
49
  create_context_generator,
45
50
  )
51
+ from repotoire.ai.compression import (
52
+ EmbeddingCompressor,
53
+ TenantCompressor,
54
+ create_compressor,
55
+ estimate_memory_savings,
56
+ DEFAULT_TARGET_DIMS,
57
+ )
46
58
 
47
59
  __all__ = [
48
60
  # NLP
@@ -52,6 +64,11 @@ __all__ = [
52
64
  "EmbeddingConfig",
53
65
  "EmbeddingBackend",
54
66
  "create_embedder",
67
+ # Int8 quantization (4x memory reduction)
68
+ "quantize_embedding",
69
+ "dequantize_embedding",
70
+ "quantize_embeddings_batch",
71
+ "compute_cosine_similarity_quantized",
55
72
  # Retrieval
56
73
  "GraphRAGRetriever",
57
74
  "RetrievalResult",
@@ -83,4 +100,10 @@ __all__ = [
83
100
  "CostLimitExceeded",
84
101
  "ContextGenerationResult",
85
102
  "create_context_generator",
103
+ # Compression (memory optimization)
104
+ "EmbeddingCompressor",
105
+ "TenantCompressor",
106
+ "create_compressor",
107
+ "estimate_memory_savings",
108
+ "DEFAULT_TARGET_DIMS",
86
109
  ]
@@ -0,0 +1,543 @@
1
"""Embedding compression for memory-efficient storage.

Implements PCA dimensionality reduction + int8 quantization for large
compression ratios with <3% quality loss, based on research findings.

Compression pipeline (with the default target of 1024 dimensions):
1. PCA: 4096 -> 1024 dimensions (4x reduction)
2. int8 quantization: float32 -> int8 (4x reduction)
3. Combined: 16x total compression

Storage: 4096 * 4 bytes = 16KB -> 1024 * 1 byte = 1KB per embedding
"""

import os
import json
import pickle
from pathlib import Path
from typing import List, Optional, Tuple
import numpy as np

from repotoire.logging_config import get_logger

logger = get_logger(__name__)

# Default compression settings.
# NOTE: the module docstring above is kept in sync with this value — a
# target of 1024 dims means 4x PCA reduction from 4096-dim embeddings.
DEFAULT_TARGET_DIMS = 1024  # PCA target dimensions (4x reduction from 4096)
DEFAULT_QUANTIZATION_BITS = 8  # int8 quantization
30
class EmbeddingCompressor:
    """Compress embeddings using PCA + int8 quantization.

    Combines PCA dimensionality reduction with 8-bit affine quantization.
    The exact compression ratio depends on ``target_dims``; with the
    default of 1024 dims, 4096-dim float32 embeddings compress 16x with
    small (<3%) quality loss on retrieval tasks.

    Example:
        >>> compressor = EmbeddingCompressor(target_dims=1024)
        >>> # Fit on existing embeddings
        >>> compressor.fit(existing_embeddings)
        >>> # Compress new embeddings
        >>> compressed = compressor.compress(new_embedding)
        >>> # Decompress for similarity computation
        >>> decompressed = compressor.decompress(compressed)
    """

    def __init__(
        self,
        target_dims: int = DEFAULT_TARGET_DIMS,
        quantization_bits: int = DEFAULT_QUANTIZATION_BITS,
        model_path: Optional[Path] = None,
    ):
        """Initialize compressor.

        Args:
            target_dims: Target dimensions after PCA (default: 1024)
            quantization_bits: Bits for quantization (default: 8 for int8)
            model_path: Path to save/load fitted PCA model
                (default: ``~/.repotoire/pca_model.pkl``)
        """
        self.target_dims = target_dims
        self.quantization_bits = quantization_bits
        self.model_path = model_path or Path.home() / ".repotoire" / "pca_model.pkl"

        # PCA components (populated by fit() or _load_model())
        self._pca_components: Optional[np.ndarray] = None
        self._pca_mean: Optional[np.ndarray] = None
        self._source_dims: Optional[int] = None

        # Quantization parameters (computed during fit)
        self._scale: Optional[float] = None
        self._zero_point: Optional[float] = None

        # Try to load a previously fitted model from disk, if present
        if self.model_path.exists():
            self._load_model()

    @property
    def is_fitted(self) -> bool:
        """Check if compressor has been fitted."""
        return self._pca_components is not None

    @property
    def compression_ratio(self) -> float:
        """Calculate compression ratio (1.0 when not yet fitted)."""
        if not self._source_dims:
            return 1.0
        # Original: source_dims * 4 bytes (float32)
        # Compressed: target_dims * 1 byte (int8)
        original_size = self._source_dims * 4
        compressed_size = self.target_dims * 1
        return original_size / compressed_size

    def fit(
        self,
        embeddings: List[List[float]],
        save: bool = True,
    ) -> "EmbeddingCompressor":
        """Fit PCA on a sample of embeddings.

        Should be called with a representative sample of embeddings
        (e.g., 1000-10000 embeddings) to learn the principal components.

        Args:
            embeddings: List of embedding vectors to fit on
            save: Whether to save the fitted model to disk

        Returns:
            self for method chaining

        Raises:
            ValueError: If ``embeddings`` is empty.
        """
        # An empty sample would previously fail deep inside numpy with an
        # opaque unpack error; fail fast with a clear message instead.
        if not embeddings:
            raise ValueError("Cannot fit EmbeddingCompressor on an empty embedding sample")

        if len(embeddings) < 100:
            logger.warning(
                f"Fitting PCA on only {len(embeddings)} samples. "
                "Recommend at least 1000 for good quality."
            )

        # Convert to numpy array
        X = np.array(embeddings, dtype=np.float32)
        n_samples, n_features = X.shape
        self._source_dims = n_features

        # PCA requires n_components <= min(n_samples, n_features)
        max_components = min(n_samples, n_features)
        effective_target_dims = min(self.target_dims, max_components)

        if effective_target_dims < self.target_dims:
            logger.warning(
                f"Reducing target_dims from {self.target_dims} to {effective_target_dims} "
                f"(limited by {n_samples} samples)"
            )
            self.target_dims = effective_target_dims

        logger.info(
            f"Fitting PCA: {self._source_dims} dims → {self.target_dims} dims "
            f"on {len(embeddings)} samples"
        )

        # Compute mean and center the data (used by the numpy fallback;
        # the sklearn path overwrites the mean with its own copy below)
        self._pca_mean = np.mean(X, axis=0)
        X_centered = X - self._pca_mean

        # Prefer sklearn's randomized-SVD PCA for numerical stability;
        # fall back to a plain numpy SVD when sklearn is not installed.
        try:
            from sklearn.decomposition import PCA

            pca = PCA(n_components=self.target_dims, svd_solver='randomized')
            pca.fit(X)

            self._pca_components = pca.components_.astype(np.float32)
            self._pca_mean = pca.mean_.astype(np.float32)

            explained_variance = sum(pca.explained_variance_ratio_)
            logger.info(f"PCA explains {explained_variance:.1%} of variance")

        except ImportError:
            logger.info("sklearn not available, using numpy SVD")
            U, S, Vt = np.linalg.svd(X_centered, full_matrices=False)
            self._pca_components = Vt[:self.target_dims].astype(np.float32)

            # Explained variance from the singular values
            total_var = np.sum(S ** 2)
            explained_var = np.sum(S[:self.target_dims] ** 2) / total_var
            logger.info(f"PCA explains {explained_var:.1%} of variance")

        # Compute quantization parameters from transformed data
        X_transformed = self._pca_transform(X)
        self._compute_quantization_params(X_transformed)

        logger.info(
            f"Compression ratio: {self.compression_ratio:.1f}x "
            f"({self._source_dims * 4} bytes → {self.target_dims} bytes)"
        )

        if save:
            self._save_model()

        return self

    def _pca_transform(self, X: np.ndarray) -> np.ndarray:
        """Apply PCA transformation (center, then project)."""
        X_centered = X - self._pca_mean
        return X_centered @ self._pca_components.T

    def _pca_inverse_transform(self, X_reduced: np.ndarray) -> np.ndarray:
        """Inverse PCA transformation (approximate reconstruction)."""
        return X_reduced @ self._pca_components + self._pca_mean

    def _compute_quantization_params(self, X: np.ndarray) -> None:
        """Compute scale and zero point for 8-bit quantization.

        Uses asymmetric (affine) quantization: values in
        [zero_point, zero_point + 255*scale] map onto the 256 uint8 codes.
        """
        # Use percentiles to be robust to outliers
        min_val = np.percentile(X, 0.1)
        max_val = np.percentile(X, 99.9)

        # 256 levels for 8-bit codes. Guard against degenerate (constant)
        # data, which would yield scale == 0 and a division-by-zero in
        # _quantize(); any positive scale is correct for constant data.
        scale = (max_val - min_val) / 255
        self._scale = float(scale) if scale > 0 else 1.0
        self._zero_point = min_val

        logger.debug(f"Quantization params: scale={self._scale:.6f}, zero={self._zero_point:.6f}")

    def _quantize(self, X: np.ndarray) -> np.ndarray:
        """Quantize float32 to 8-bit codes (stored as uint8)."""
        # Scale to [0, 255] range
        X_scaled = (X - self._zero_point) / self._scale
        # Clip and convert to uint8
        X_quantized = np.clip(X_scaled, 0, 255).astype(np.uint8)
        return X_quantized

    def _dequantize(self, X_quantized: np.ndarray) -> np.ndarray:
        """Dequantize 8-bit codes back to float32."""
        return X_quantized.astype(np.float32) * self._scale + self._zero_point

    def compress(self, embedding: List[float]) -> bytes:
        """Compress a single embedding to bytes.

        Args:
            embedding: Original embedding vector (e.g., 4096 floats)

        Returns:
            Compressed embedding as bytes (target_dims bytes)

        Raises:
            RuntimeError: If fit() has not been called.
        """
        if not self.is_fitted:
            raise RuntimeError("Compressor not fitted. Call fit() first.")

        # Convert to numpy
        X = np.array([embedding], dtype=np.float32)

        # Apply PCA
        X_reduced = self._pca_transform(X)

        # Quantize to 8-bit codes
        X_quantized = self._quantize(X_reduced)

        # Return as bytes
        return X_quantized[0].tobytes()

    def compress_batch(self, embeddings: List[List[float]]) -> List[bytes]:
        """Compress multiple embeddings efficiently.

        Args:
            embeddings: List of embedding vectors

        Returns:
            List of compressed embeddings as bytes

        Raises:
            RuntimeError: If fit() has not been called.
        """
        if not self.is_fitted:
            raise RuntimeError("Compressor not fitted. Call fit() first.")

        # One vectorized transform for the whole batch
        X = np.array(embeddings, dtype=np.float32)
        X_reduced = self._pca_transform(X)
        X_quantized = self._quantize(X_reduced)

        # Return as list of bytes
        return [row.tobytes() for row in X_quantized]

    def decompress(self, compressed: bytes) -> List[float]:
        """Decompress bytes back to embedding vector.

        Note: This is an approximate reconstruction due to PCA and quantization.

        Args:
            compressed: Compressed embedding bytes

        Returns:
            Reconstructed embedding vector (original dimensions)

        Raises:
            RuntimeError: If fit() has not been called.
        """
        if not self.is_fitted:
            raise RuntimeError("Compressor not fitted. Call fit() first.")

        # Convert bytes to numpy array of 8-bit codes
        X_quantized = np.frombuffer(compressed, dtype=np.uint8).reshape(1, -1)

        # Dequantize
        X_reduced = self._dequantize(X_quantized)

        # Inverse PCA (approximate reconstruction)
        X_reconstructed = self._pca_inverse_transform(X_reduced)

        return X_reconstructed[0].tolist()

    def decompress_batch(self, compressed_list: List[bytes]) -> List[List[float]]:
        """Decompress multiple embeddings efficiently.

        Args:
            compressed_list: List of compressed embedding bytes

        Returns:
            List of reconstructed embedding vectors

        Raises:
            RuntimeError: If fit() has not been called.
        """
        if not self.is_fitted:
            raise RuntimeError("Compressor not fitted. Call fit() first.")

        # Stack all compressed embeddings into one array
        X_quantized = np.array([
            np.frombuffer(c, dtype=np.uint8) for c in compressed_list
        ])

        # Dequantize
        X_reduced = self._dequantize(X_quantized)

        # Inverse PCA
        X_reconstructed = self._pca_inverse_transform(X_reduced)

        return X_reconstructed.tolist()

    def get_reduced_embedding(self, embedding: List[float]) -> List[float]:
        """Get PCA-reduced embedding without quantization.

        Useful when you want dimensionality reduction but need float precision
        for vector similarity search.

        Args:
            embedding: Original embedding vector

        Returns:
            Reduced embedding (target_dims floats)

        Raises:
            RuntimeError: If fit() has not been called.
        """
        if not self.is_fitted:
            raise RuntimeError("Compressor not fitted. Call fit() first.")

        X = np.array([embedding], dtype=np.float32)
        X_reduced = self._pca_transform(X)
        return X_reduced[0].tolist()

    def get_reduced_embeddings_batch(
        self, embeddings: List[List[float]]
    ) -> List[List[float]]:
        """Get PCA-reduced embeddings for a batch.

        Args:
            embeddings: List of original embedding vectors

        Returns:
            List of reduced embeddings (target_dims floats each)

        Raises:
            RuntimeError: If fit() has not been called.
        """
        if not self.is_fitted:
            raise RuntimeError("Compressor not fitted. Call fit() first.")

        X = np.array(embeddings, dtype=np.float32)
        X_reduced = self._pca_transform(X)
        return X_reduced.tolist()

    def _save_model(self) -> None:
        """Save fitted PCA model to disk (pickle under model_path)."""
        self.model_path.parent.mkdir(parents=True, exist_ok=True)

        model_data = {
            "pca_components": self._pca_components,
            "pca_mean": self._pca_mean,
            "source_dims": self._source_dims,
            "target_dims": self.target_dims,
            "scale": self._scale,
            "zero_point": self._zero_point,
        }

        with open(self.model_path, "wb") as f:
            pickle.dump(model_data, f)

        logger.info(f"Saved PCA model to {self.model_path}")

    def _load_model(self) -> None:
        """Load fitted PCA model from disk.

        Failures are logged and swallowed so a corrupt model file degrades
        to "not fitted" rather than crashing construction.
        """
        # SECURITY NOTE: pickle.load executes arbitrary code from the file.
        # This is acceptable only because the model file is written by this
        # process under the user's own directory — never point model_path
        # at an untrusted file.
        try:
            with open(self.model_path, "rb") as f:
                model_data = pickle.load(f)

            self._pca_components = model_data["pca_components"]
            self._pca_mean = model_data["pca_mean"]
            self._source_dims = model_data["source_dims"]
            self.target_dims = model_data["target_dims"]
            self._scale = model_data["scale"]
            self._zero_point = model_data["zero_point"]

            logger.info(
                f"Loaded PCA model: {self._source_dims} → {self.target_dims} dims"
            )
        except Exception as e:
            logger.warning(f"Could not load PCA model: {e}")
385
+
386
+
387
class TenantCompressor:
    """Per-tenant embedding compressor.

    Each tenant can have their own PCA model fitted on their codebase,
    allowing for better compression quality tailored to their code patterns.

    NOTE(review): ``storage_backend`` is accepted and stored but never read
    — models are always written locally under
    ``~/.repotoire/compression_models``. Cloud backends ("s3", "gcs") are
    not implemented yet; confirm before relying on them.
    """

    def __init__(
        self,
        tenant_id: str,
        storage_backend: str = "local",  # or "s3", "gcs"
        target_dims: int = DEFAULT_TARGET_DIMS,
    ):
        """Initialize tenant-specific compressor.

        Args:
            tenant_id: Unique tenant identifier
            storage_backend: Where to store PCA models (currently only
                local storage is implemented)
            target_dims: Target dimensions after compression
        """
        self.tenant_id = tenant_id
        self.storage_backend = storage_backend
        self.target_dims = target_dims

        # Model path includes tenant ID so tenants never share PCA models
        model_dir = Path.home() / ".repotoire" / "compression_models"
        self.model_path = model_dir / f"{tenant_id}_pca.pkl"

        self._compressor = EmbeddingCompressor(
            target_dims=target_dims,
            model_path=self.model_path,
        )

    @property
    def is_fitted(self) -> bool:
        """Check if tenant compressor is fitted."""
        return self._compressor.is_fitted

    def fit_from_graph(
        self,
        graph_client,
        sample_size: int = 5000,
    ) -> "TenantCompressor":
        """Fit compressor on embeddings from tenant's graph.

        Args:
            graph_client: FalkorDB client for the tenant
            sample_size: Number of embeddings to sample for fitting

        Returns:
            self for method chaining (left unfitted when the graph has no
            usable embeddings)
        """
        # Query embeddings from graph
        # Note: FalkorDB uses labels() function for label checks instead of inline syntax
        query = """
        MATCH (n)
        WHERE ('Function' IN labels(n) OR 'Class' IN labels(n) OR 'File' IN labels(n)) AND n.embedding IS NOT NULL
        RETURN n.embedding as embedding
        LIMIT $limit
        """

        results = graph_client.query(query, {"limit": sample_size})

        if not results:
            logger.warning(f"No embeddings found for tenant {self.tenant_id}")
            return self

        embeddings = [r["embedding"] for r in results if r.get("embedding")]

        # Rows can still carry null/empty embedding values after the query;
        # fitting on an empty list would raise, so bail out instead.
        if not embeddings:
            logger.warning(f"No embeddings found for tenant {self.tenant_id}")
            return self

        if len(embeddings) < 100:
            logger.warning(
                f"Only {len(embeddings)} embeddings for tenant {self.tenant_id}. "
                "Recommend at least 100 for quality compression."
            )

        logger.info(f"Fitting compressor for tenant {self.tenant_id} on {len(embeddings)} embeddings")
        self._compressor.fit(embeddings)

        return self

    def compress(self, embedding: List[float]) -> bytes:
        """Compress embedding using tenant's model."""
        return self._compressor.compress(embedding)

    def compress_batch(self, embeddings: List[List[float]]) -> List[bytes]:
        """Compress batch of embeddings."""
        return self._compressor.compress_batch(embeddings)

    def decompress(self, compressed: bytes) -> List[float]:
        """Decompress embedding."""
        return self._compressor.decompress(compressed)

    def get_reduced_embedding(self, embedding: List[float]) -> List[float]:
        """Get PCA-reduced embedding (float precision)."""
        return self._compressor.get_reduced_embedding(embedding)

    def get_reduced_embeddings_batch(
        self, embeddings: List[List[float]]
    ) -> List[List[float]]:
        """Get PCA-reduced embeddings for batch."""
        return self._compressor.get_reduced_embeddings_batch(embeddings)
488
+
489
+
490
def create_compressor(
    target_dims: int = DEFAULT_TARGET_DIMS,
    model_path: Optional[Path] = None,
) -> EmbeddingCompressor:
    """Build an :class:`EmbeddingCompressor` with the given settings.

    Args:
        target_dims: Target dimensions after PCA
        model_path: Path to save/load PCA model

    Returns:
        A freshly constructed EmbeddingCompressor
    """
    compressor = EmbeddingCompressor(
        target_dims=target_dims,
        model_path=model_path,
    )
    return compressor
507
+
508
+
509
def estimate_memory_savings(
    num_entities: int,
    source_dims: int = 4096,
    target_dims: int = DEFAULT_TARGET_DIMS,
) -> dict:
    """Estimate memory savings from compression.

    Args:
        num_entities: Number of entities with embeddings (may be 0)
        source_dims: Original embedding dimensions (stored as float32)
        target_dims: Target dimensions after compression (stored as int8)

    Returns:
        Dictionary with memory estimates (sizes in MiB, ratio, percent saved)
    """
    # Per-dimension storage: float32 is 4 bytes, int8 is 1 byte
    original_bytes = num_entities * source_dims * 4
    compressed_bytes = num_entities * target_dims * 1

    # PCA-reduced only (float32, no quantization)
    reduced_bytes = num_entities * target_dims * 4

    # The ratio depends only on the per-embedding sizes, so compute it from
    # the dimensions directly; this also avoids a ZeroDivisionError when
    # num_entities == 0 (original code divided the zero totals).
    per_entity_original = source_dims * 4
    per_entity_compressed = target_dims * 1
    compression_ratio = per_entity_original / per_entity_compressed
    savings_percent = (1 - per_entity_compressed / per_entity_original) * 100

    mib = 1024 * 1024
    return {
        "num_entities": num_entities,
        "source_dims": source_dims,
        "target_dims": target_dims,
        "original_mb": original_bytes / mib,
        "compressed_mb": compressed_bytes / mib,
        "reduced_only_mb": reduced_bytes / mib,
        "compression_ratio": compression_ratio,
        "savings_mb": (original_bytes - compressed_bytes) / mib,
        "savings_percent": savings_percent,
    }