aiecs 1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. aiecs/__init__.py +72 -0
  2. aiecs/__main__.py +41 -0
  3. aiecs/aiecs_client.py +469 -0
  4. aiecs/application/__init__.py +10 -0
  5. aiecs/application/executors/__init__.py +10 -0
  6. aiecs/application/executors/operation_executor.py +363 -0
  7. aiecs/application/knowledge_graph/__init__.py +7 -0
  8. aiecs/application/knowledge_graph/builder/__init__.py +37 -0
  9. aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
  10. aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
  11. aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
  12. aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
  13. aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
  14. aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
  15. aiecs/application/knowledge_graph/extractors/base.py +100 -0
  16. aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
  17. aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
  18. aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
  19. aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
  20. aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
  21. aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
  22. aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
  23. aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
  24. aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
  25. aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
  26. aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
  27. aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
  28. aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
  29. aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
  30. aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
  31. aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
  32. aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
  33. aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
  34. aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
  35. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
  36. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
  37. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
  38. aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
  39. aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
  40. aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
  41. aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
  42. aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
  43. aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
  44. aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
  45. aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
  46. aiecs/application/knowledge_graph/search/__init__.py +59 -0
  47. aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
  48. aiecs/application/knowledge_graph/search/reranker.py +295 -0
  49. aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
  50. aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
  51. aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
  52. aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
  53. aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
  54. aiecs/application/knowledge_graph/validators/__init__.py +13 -0
  55. aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
  56. aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
  57. aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
  58. aiecs/common/__init__.py +9 -0
  59. aiecs/common/knowledge_graph/__init__.py +17 -0
  60. aiecs/common/knowledge_graph/runnable.py +484 -0
  61. aiecs/config/__init__.py +16 -0
  62. aiecs/config/config.py +498 -0
  63. aiecs/config/graph_config.py +137 -0
  64. aiecs/config/registry.py +23 -0
  65. aiecs/core/__init__.py +46 -0
  66. aiecs/core/interface/__init__.py +34 -0
  67. aiecs/core/interface/execution_interface.py +152 -0
  68. aiecs/core/interface/storage_interface.py +171 -0
  69. aiecs/domain/__init__.py +289 -0
  70. aiecs/domain/agent/__init__.py +189 -0
  71. aiecs/domain/agent/base_agent.py +697 -0
  72. aiecs/domain/agent/exceptions.py +103 -0
  73. aiecs/domain/agent/graph_aware_mixin.py +559 -0
  74. aiecs/domain/agent/hybrid_agent.py +490 -0
  75. aiecs/domain/agent/integration/__init__.py +26 -0
  76. aiecs/domain/agent/integration/context_compressor.py +222 -0
  77. aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
  78. aiecs/domain/agent/integration/retry_policy.py +219 -0
  79. aiecs/domain/agent/integration/role_config.py +213 -0
  80. aiecs/domain/agent/knowledge_aware_agent.py +646 -0
  81. aiecs/domain/agent/lifecycle.py +296 -0
  82. aiecs/domain/agent/llm_agent.py +300 -0
  83. aiecs/domain/agent/memory/__init__.py +12 -0
  84. aiecs/domain/agent/memory/conversation.py +197 -0
  85. aiecs/domain/agent/migration/__init__.py +14 -0
  86. aiecs/domain/agent/migration/conversion.py +160 -0
  87. aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
  88. aiecs/domain/agent/models.py +317 -0
  89. aiecs/domain/agent/observability.py +407 -0
  90. aiecs/domain/agent/persistence.py +289 -0
  91. aiecs/domain/agent/prompts/__init__.py +29 -0
  92. aiecs/domain/agent/prompts/builder.py +161 -0
  93. aiecs/domain/agent/prompts/formatters.py +189 -0
  94. aiecs/domain/agent/prompts/template.py +255 -0
  95. aiecs/domain/agent/registry.py +260 -0
  96. aiecs/domain/agent/tool_agent.py +257 -0
  97. aiecs/domain/agent/tools/__init__.py +12 -0
  98. aiecs/domain/agent/tools/schema_generator.py +221 -0
  99. aiecs/domain/community/__init__.py +155 -0
  100. aiecs/domain/community/agent_adapter.py +477 -0
  101. aiecs/domain/community/analytics.py +481 -0
  102. aiecs/domain/community/collaborative_workflow.py +642 -0
  103. aiecs/domain/community/communication_hub.py +645 -0
  104. aiecs/domain/community/community_builder.py +320 -0
  105. aiecs/domain/community/community_integration.py +800 -0
  106. aiecs/domain/community/community_manager.py +813 -0
  107. aiecs/domain/community/decision_engine.py +879 -0
  108. aiecs/domain/community/exceptions.py +225 -0
  109. aiecs/domain/community/models/__init__.py +33 -0
  110. aiecs/domain/community/models/community_models.py +268 -0
  111. aiecs/domain/community/resource_manager.py +457 -0
  112. aiecs/domain/community/shared_context_manager.py +603 -0
  113. aiecs/domain/context/__init__.py +58 -0
  114. aiecs/domain/context/context_engine.py +989 -0
  115. aiecs/domain/context/conversation_models.py +354 -0
  116. aiecs/domain/context/graph_memory.py +467 -0
  117. aiecs/domain/execution/__init__.py +12 -0
  118. aiecs/domain/execution/model.py +57 -0
  119. aiecs/domain/knowledge_graph/__init__.py +19 -0
  120. aiecs/domain/knowledge_graph/models/__init__.py +52 -0
  121. aiecs/domain/knowledge_graph/models/entity.py +130 -0
  122. aiecs/domain/knowledge_graph/models/evidence.py +194 -0
  123. aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
  124. aiecs/domain/knowledge_graph/models/path.py +179 -0
  125. aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
  126. aiecs/domain/knowledge_graph/models/query.py +272 -0
  127. aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
  128. aiecs/domain/knowledge_graph/models/relation.py +136 -0
  129. aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
  130. aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
  131. aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
  132. aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
  133. aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
  134. aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
  135. aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
  136. aiecs/domain/task/__init__.py +13 -0
  137. aiecs/domain/task/dsl_processor.py +613 -0
  138. aiecs/domain/task/model.py +62 -0
  139. aiecs/domain/task/task_context.py +268 -0
  140. aiecs/infrastructure/__init__.py +24 -0
  141. aiecs/infrastructure/graph_storage/__init__.py +11 -0
  142. aiecs/infrastructure/graph_storage/base.py +601 -0
  143. aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
  144. aiecs/infrastructure/graph_storage/cache.py +429 -0
  145. aiecs/infrastructure/graph_storage/distributed.py +226 -0
  146. aiecs/infrastructure/graph_storage/error_handling.py +390 -0
  147. aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
  148. aiecs/infrastructure/graph_storage/health_checks.py +378 -0
  149. aiecs/infrastructure/graph_storage/in_memory.py +514 -0
  150. aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
  151. aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
  152. aiecs/infrastructure/graph_storage/metrics.py +357 -0
  153. aiecs/infrastructure/graph_storage/migration.py +413 -0
  154. aiecs/infrastructure/graph_storage/pagination.py +471 -0
  155. aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
  156. aiecs/infrastructure/graph_storage/postgres.py +871 -0
  157. aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
  158. aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
  159. aiecs/infrastructure/graph_storage/sqlite.py +623 -0
  160. aiecs/infrastructure/graph_storage/streaming.py +495 -0
  161. aiecs/infrastructure/messaging/__init__.py +13 -0
  162. aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
  163. aiecs/infrastructure/messaging/websocket_manager.py +298 -0
  164. aiecs/infrastructure/monitoring/__init__.py +34 -0
  165. aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
  166. aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
  167. aiecs/infrastructure/monitoring/structured_logger.py +48 -0
  168. aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
  169. aiecs/infrastructure/persistence/__init__.py +24 -0
  170. aiecs/infrastructure/persistence/context_engine_client.py +187 -0
  171. aiecs/infrastructure/persistence/database_manager.py +333 -0
  172. aiecs/infrastructure/persistence/file_storage.py +754 -0
  173. aiecs/infrastructure/persistence/redis_client.py +220 -0
  174. aiecs/llm/__init__.py +86 -0
  175. aiecs/llm/callbacks/__init__.py +11 -0
  176. aiecs/llm/callbacks/custom_callbacks.py +264 -0
  177. aiecs/llm/client_factory.py +420 -0
  178. aiecs/llm/clients/__init__.py +33 -0
  179. aiecs/llm/clients/base_client.py +193 -0
  180. aiecs/llm/clients/googleai_client.py +181 -0
  181. aiecs/llm/clients/openai_client.py +131 -0
  182. aiecs/llm/clients/vertex_client.py +437 -0
  183. aiecs/llm/clients/xai_client.py +184 -0
  184. aiecs/llm/config/__init__.py +51 -0
  185. aiecs/llm/config/config_loader.py +275 -0
  186. aiecs/llm/config/config_validator.py +236 -0
  187. aiecs/llm/config/model_config.py +151 -0
  188. aiecs/llm/utils/__init__.py +10 -0
  189. aiecs/llm/utils/validate_config.py +91 -0
  190. aiecs/main.py +363 -0
  191. aiecs/scripts/__init__.py +3 -0
  192. aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
  193. aiecs/scripts/aid/__init__.py +19 -0
  194. aiecs/scripts/aid/version_manager.py +215 -0
  195. aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
  196. aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
  197. aiecs/scripts/dependance_check/__init__.py +17 -0
  198. aiecs/scripts/dependance_check/dependency_checker.py +938 -0
  199. aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
  200. aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
  201. aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
  202. aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
  203. aiecs/scripts/dependance_patch/__init__.py +7 -0
  204. aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
  205. aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
  206. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
  207. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
  208. aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
  209. aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
  210. aiecs/scripts/tools_develop/README.md +449 -0
  211. aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
  212. aiecs/scripts/tools_develop/__init__.py +21 -0
  213. aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
  214. aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
  215. aiecs/scripts/tools_develop/verify_tools.py +356 -0
  216. aiecs/tasks/__init__.py +1 -0
  217. aiecs/tasks/worker.py +172 -0
  218. aiecs/tools/__init__.py +299 -0
  219. aiecs/tools/apisource/__init__.py +99 -0
  220. aiecs/tools/apisource/intelligence/__init__.py +19 -0
  221. aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
  222. aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
  223. aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
  224. aiecs/tools/apisource/monitoring/__init__.py +9 -0
  225. aiecs/tools/apisource/monitoring/metrics.py +303 -0
  226. aiecs/tools/apisource/providers/__init__.py +115 -0
  227. aiecs/tools/apisource/providers/base.py +664 -0
  228. aiecs/tools/apisource/providers/census.py +401 -0
  229. aiecs/tools/apisource/providers/fred.py +564 -0
  230. aiecs/tools/apisource/providers/newsapi.py +412 -0
  231. aiecs/tools/apisource/providers/worldbank.py +357 -0
  232. aiecs/tools/apisource/reliability/__init__.py +12 -0
  233. aiecs/tools/apisource/reliability/error_handler.py +375 -0
  234. aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
  235. aiecs/tools/apisource/tool.py +850 -0
  236. aiecs/tools/apisource/utils/__init__.py +9 -0
  237. aiecs/tools/apisource/utils/validators.py +338 -0
  238. aiecs/tools/base_tool.py +201 -0
  239. aiecs/tools/docs/__init__.py +121 -0
  240. aiecs/tools/docs/ai_document_orchestrator.py +599 -0
  241. aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
  242. aiecs/tools/docs/content_insertion_tool.py +1333 -0
  243. aiecs/tools/docs/document_creator_tool.py +1317 -0
  244. aiecs/tools/docs/document_layout_tool.py +1166 -0
  245. aiecs/tools/docs/document_parser_tool.py +994 -0
  246. aiecs/tools/docs/document_writer_tool.py +1818 -0
  247. aiecs/tools/knowledge_graph/__init__.py +17 -0
  248. aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
  249. aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
  250. aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
  251. aiecs/tools/langchain_adapter.py +542 -0
  252. aiecs/tools/schema_generator.py +275 -0
  253. aiecs/tools/search_tool/__init__.py +100 -0
  254. aiecs/tools/search_tool/analyzers.py +589 -0
  255. aiecs/tools/search_tool/cache.py +260 -0
  256. aiecs/tools/search_tool/constants.py +128 -0
  257. aiecs/tools/search_tool/context.py +216 -0
  258. aiecs/tools/search_tool/core.py +749 -0
  259. aiecs/tools/search_tool/deduplicator.py +123 -0
  260. aiecs/tools/search_tool/error_handler.py +271 -0
  261. aiecs/tools/search_tool/metrics.py +371 -0
  262. aiecs/tools/search_tool/rate_limiter.py +178 -0
  263. aiecs/tools/search_tool/schemas.py +277 -0
  264. aiecs/tools/statistics/__init__.py +80 -0
  265. aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
  266. aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
  267. aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
  268. aiecs/tools/statistics/data_loader_tool.py +564 -0
  269. aiecs/tools/statistics/data_profiler_tool.py +658 -0
  270. aiecs/tools/statistics/data_transformer_tool.py +573 -0
  271. aiecs/tools/statistics/data_visualizer_tool.py +495 -0
  272. aiecs/tools/statistics/model_trainer_tool.py +487 -0
  273. aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
  274. aiecs/tools/task_tools/__init__.py +86 -0
  275. aiecs/tools/task_tools/chart_tool.py +732 -0
  276. aiecs/tools/task_tools/classfire_tool.py +922 -0
  277. aiecs/tools/task_tools/image_tool.py +447 -0
  278. aiecs/tools/task_tools/office_tool.py +684 -0
  279. aiecs/tools/task_tools/pandas_tool.py +635 -0
  280. aiecs/tools/task_tools/report_tool.py +635 -0
  281. aiecs/tools/task_tools/research_tool.py +392 -0
  282. aiecs/tools/task_tools/scraper_tool.py +715 -0
  283. aiecs/tools/task_tools/stats_tool.py +688 -0
  284. aiecs/tools/temp_file_manager.py +130 -0
  285. aiecs/tools/tool_executor/__init__.py +37 -0
  286. aiecs/tools/tool_executor/tool_executor.py +881 -0
  287. aiecs/utils/LLM_output_structor.py +445 -0
  288. aiecs/utils/__init__.py +34 -0
  289. aiecs/utils/base_callback.py +47 -0
  290. aiecs/utils/cache_provider.py +695 -0
  291. aiecs/utils/execution_utils.py +184 -0
  292. aiecs/utils/logging.py +1 -0
  293. aiecs/utils/prompt_loader.py +14 -0
  294. aiecs/utils/token_usage_repository.py +323 -0
  295. aiecs/ws/__init__.py +0 -0
  296. aiecs/ws/socket_server.py +52 -0
  297. aiecs-1.5.1.dist-info/METADATA +608 -0
  298. aiecs-1.5.1.dist-info/RECORD +302 -0
  299. aiecs-1.5.1.dist-info/WHEEL +5 -0
  300. aiecs-1.5.1.dist-info/entry_points.txt +10 -0
  301. aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
  302. aiecs-1.5.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,754 @@
1
+ """
2
+ File Storage Implementation with Google Cloud Storage
3
+
4
+ Provides file storage capabilities using Google Cloud Storage as the backend,
5
+ with support for local fallback and caching.
6
+ """
7
+
8
+ import os
9
+ import json
10
+ import logging
11
+ import aiofiles
12
+ from typing import Dict, List, Any, Optional, Union
13
+ from datetime import datetime
14
+ from pathlib import Path
15
+ import gzip
16
+ import pickle
17
+
18
+ try:
19
+ from google.cloud import storage
20
+ from google.cloud.exceptions import NotFound, GoogleCloudError
21
+ from google.auth.exceptions import DefaultCredentialsError
22
+
23
+ GCS_AVAILABLE = True
24
+ except ImportError:
25
+ GCS_AVAILABLE = False
26
+ storage = None
27
+ NotFound = Exception
28
+ GoogleCloudError = Exception
29
+ DefaultCredentialsError = Exception
30
+
31
+ from ..monitoring.global_metrics_manager import get_global_metrics
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
class FileStorageError(Exception):
    """Raised when a file storage operation cannot be completed."""
38
+
39
+
40
class FileStorageConfig:
    """Attribute-style view over the raw file-storage settings mapping.

    Each option is looked up once with ``config.get``; a key that is absent
    from ``config`` takes the default written next to it below (``None``
    where no default is given).
    """

    def __init__(self, config: Dict[str, Any]):
        option = config.get  # single lookup helper for every setting below

        # --- Google Cloud Storage ---
        self.gcs_bucket_name = option("gcs_bucket_name", "multi-task-storage")
        self.gcs_project_id = option("gcs_project_id")
        self.gcs_credentials_path = option("gcs_credentials_path")
        self.gcs_location = option("gcs_location", "US")

        # --- Local filesystem fallback ---
        self.local_storage_path = option("local_storage_path", "./storage")
        self.enable_local_fallback = option("enable_local_fallback", True)

        # --- In-memory cache ---
        self.enable_cache = option("enable_cache", True)
        self.cache_ttl_seconds = option("cache_ttl_seconds", 3600)
        self.max_cache_size_mb = option("max_cache_size_mb", 100)

        # --- Performance tuning ---
        self.chunk_size = option("chunk_size", 8192)
        self.max_retries = option("max_retries", 3)
        self.timeout_seconds = option("timeout_seconds", 30)

        # --- Compression ---
        self.enable_compression = option("enable_compression", True)
        self.compression_threshold_bytes = option("compression_threshold_bytes", 1024)

        # --- Security ---
        self.enable_encryption = option("enable_encryption", False)
        self.encryption_key = option("encryption_key")
71
+
72
+
73
+ class FileStorage:
74
+ """
75
+ File storage implementation with Google Cloud Storage backend.
76
+
77
+ Features:
78
+ - Google Cloud Storage as primary backend
79
+ - Local filesystem fallback
80
+ - In-memory caching with TTL
81
+ - Automatic compression for large files
82
+ - Retry logic with exponential backoff
83
+ - Metrics collection
84
+ """
85
+
86
def __init__(self, config: Dict[str, Any]):
    """Set up a storage instance from a raw settings dictionary.

    Args:
        config: Raw settings, wrapped in a ``FileStorageConfig``.

    The GCS client and bucket handles start as ``None`` and the instance is
    marked as not yet initialized. When local fallback is enabled, the
    local storage directory is created immediately.
    """
    self.config = FileStorageConfig(config)

    # GCS handles; populated later by _init_gcs().
    self._gcs_client = None
    self._gcs_bucket = None

    # In-memory cache entries and the time each one was written, keyed alike.
    self._cache = {}
    self._cache_timestamps = {}
    self._initialized = False

    # Metrics collector obtained from the global metrics manager.
    self.metrics = get_global_metrics()

    # Make sure the fallback directory is present before any write needs it.
    settings = self.config
    if settings.enable_local_fallback:
        fallback_dir = Path(settings.local_storage_path)
        fallback_dir.mkdir(parents=True, exist_ok=True)
100
+
101
async def initialize(self) -> bool:
    """Prepare the storage backends and mark the instance as initialized.

    GCS setup is attempted only when the GCS client library was importable;
    otherwise a warning is logged and only local storage is used.

    Returns:
        True once the instance has been marked initialized.

    Raises:
        FileStorageError: If setup raised and local fallback is disabled.
    """
    try:
        if not GCS_AVAILABLE:
            logger.warning("Google Cloud Storage not available, using local storage only")
        else:
            await self._init_gcs()

        self._initialized = True
        logger.info("File storage initialized successfully")
        return True

    except Exception as e:
        logger.error(f"Failed to initialize file storage: {e}")

        # With fallback enabled the failure is tolerated: carry on local-only.
        if self.config.enable_local_fallback:
            logger.info("Falling back to local storage only")
            self._initialized = True
            return True

        raise FileStorageError(f"Storage initialization failed: {e}")
126
+
127
async def _init_gcs(self):
    """Create the GCS client and resolve (or create) the configured bucket.

    Exceptions raised while connecting are logged rather than propagated.
    On a client-level failure both ``self._gcs_client`` and
    ``self._gcs_bucket`` are reset to ``None``; when only the bucket cannot
    be found or created, ``self._gcs_bucket`` alone is set to ``None``.
    """
    try:
        settings = self.config
        project_id = settings.gcs_project_id

        # Export the credentials file location when one is configured.
        credentials_path = settings.gcs_credentials_path
        if credentials_path:
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

        # Without a project ID the client is still created (project=None),
        # but a missing bucket cannot be created further down.
        if not project_id:
            logger.warning("GCS project ID not provided. Bucket creation will be disabled.")
            logger.warning(
                "Bucket must exist and be accessible. Falling back to local storage if bucket not found."
            )

        self._gcs_client = storage.Client(project=project_id)

        try:
            self._gcs_bucket = self._gcs_client.bucket(settings.gcs_bucket_name)
            # reload() doubles as the access check for the bucket.
            self._gcs_bucket.reload()
            logger.info(f"Connected to GCS bucket: {self.config.gcs_bucket_name}")

        except NotFound:
            if not project_id:
                # No project to create the bucket in: give up on GCS.
                logger.error(
                    f"GCS bucket '{self.config.gcs_bucket_name}' not found and "
                    "project ID is not provided. Cannot create bucket without project parameter."
                )
                logger.warning(
                    "Please ensure the bucket exists or provide DOC_PARSER_GCS_PROJECT_ID in configuration."
                )
                logger.warning("Falling back to local storage only.")
                self._gcs_bucket = None
                return

            try:
                self._gcs_bucket = self._gcs_client.create_bucket(
                    settings.gcs_bucket_name,
                    project=project_id,  # passed explicitly for the create call
                    location=settings.gcs_location,
                )
                logger.info(
                    f"Created GCS bucket: {self.config.gcs_bucket_name} in project {self.config.gcs_project_id}"
                )
            except Exception as create_error:
                logger.error(
                    f"Failed to create GCS bucket {self.config.gcs_bucket_name}: {create_error}"
                )
                logger.warning("Bucket creation failed. Will use local storage fallback.")
                self._gcs_bucket = None

    except DefaultCredentialsError:
        logger.warning("GCS credentials not found, using local storage only")
        self._gcs_client = None
        self._gcs_bucket = None

    except Exception as e:
        logger.error(f"Failed to initialize GCS: {e}")
        self._gcs_client = None
        self._gcs_bucket = None
193
+
194
async def store(
    self,
    key: str,
    data: Union[str, bytes, Dict[str, Any]],
    metadata: Optional[Dict[str, Any]] = None,
) -> bool:
    """
    Store data with the given key.

    The data is serialized, gzip-compressed when compression is enabled and
    the serialized size exceeds ``compression_threshold_bytes``, then
    written to GCS when a bucket handle is set. If there is no bucket or
    the GCS write fails, local storage is tried when
    ``enable_local_fallback`` is on.

    The in-memory cache is updated only after a backend accepted the
    write, so a failed store never leaves a value visible through
    ``retrieve()``/``exists()`` that was not persisted.

    Args:
        key: Storage key
        data: Data to store
        metadata: Optional metadata

    Returns:
        True if a backend stored the data, False if every backend declined.

    Raises:
        FileStorageError: If any step raised; the original exception is
            chained as ``__cause__``.
    """
    if not self._initialized:
        await self.initialize()

    # NOTE(review): datetime.utcnow() is deprecated since Python 3.12; it is
    # kept because retrieve()/exists() subtract naive UTC cache timestamps.
    start_time = datetime.utcnow()

    try:
        # presumably bytes (len() and gzip.compress are applied) - TODO confirm
        serialized_data = await self._serialize_data(data)

        # Compress if enabled and data is large enough
        compressed = bool(
            self.config.enable_compression
            and len(serialized_data) > self.config.compression_threshold_bytes
        )
        if compressed:
            serialized_data = gzip.compress(serialized_data)

        # Primary backend first, local fallback second; remember which one
        # accepted the write so metrics carry the right prefix.
        stored_in = None
        if self._gcs_bucket and await self._store_gcs(
            key, serialized_data, metadata, compressed
        ):
            stored_in = "gcs"
        elif self.config.enable_local_fallback and await self._store_local(
            key, serialized_data, metadata, compressed
        ):
            stored_in = "local"

        if stored_in is None:
            if self.metrics:
                self.metrics.record_operation("store_failure", 1)
            return False

        # Cache the original (unserialized) value now that it is persisted.
        if self.config.enable_cache:
            self._cache[key] = {
                "data": data,
                "metadata": metadata,
                "compressed": compressed,
            }
            self._cache_timestamps[key] = datetime.utcnow()
            await self._cleanup_cache()

        if self.metrics:
            # Emits gcs_store_success/local_store_success and the matching
            # *_store_duration metric.
            self.metrics.record_operation(f"{stored_in}_store_success", 1)
            duration = (datetime.utcnow() - start_time).total_seconds()
            self.metrics.record_duration(f"{stored_in}_store_duration", duration)
        return True

    except Exception as e:
        logger.error(f"Failed to store data for key {key}: {e}")
        if self.metrics:
            self.metrics.record_operation("store_error", 1)
        raise FileStorageError(f"Storage failed: {e}") from e
269
+
270
async def retrieve(self, key: str) -> Optional[Union[str, bytes, Dict[str, Any]]]:
    """
    Look up the data stored under a key.

    The in-memory cache is consulted first; an entry older than
    ``cache_ttl_seconds`` is dropped. GCS is then queried when a bucket
    handle is set, followed by local storage when fallback is enabled. A
    value found in either backend is written back to the cache.

    Args:
        key: Storage key

    Returns:
        The stored data if found, None otherwise

    Raises:
        FileStorageError: If any lookup step raised.
    """
    if not self._initialized:
        await self.initialize()

    began = datetime.utcnow()

    def _elapsed() -> float:
        # Seconds since this call started.
        return (datetime.utcnow() - began).total_seconds()

    def _remember(payload) -> None:
        # Write-back after a backend hit; metadata is not known here.
        self._cache[key] = {"data": payload, "metadata": {}}
        self._cache_timestamps[key] = datetime.utcnow()

    try:
        settings = self.config

        # 1) In-memory cache
        if settings.enable_cache and key in self._cache:
            cached_at = self._cache_timestamps.get(key)
            still_fresh = bool(cached_at) and (
                (datetime.utcnow() - cached_at).total_seconds()
                < settings.cache_ttl_seconds
            )
            if not still_fresh:
                # Expired (or timestamp missing): forget the entry.
                self._cache.pop(key, None)
                self._cache_timestamps.pop(key, None)
            else:
                if self.metrics:
                    self.metrics.record_operation("cache_hit", 1)
                return self._cache[key]["data"]

        # 2) Google Cloud Storage
        if self._gcs_bucket:
            payload = await self._retrieve_gcs(key)
            if payload is not None:
                if self.metrics:
                    self.metrics.record_operation("gcs_retrieve_success", 1)
                    self.metrics.record_duration("gcs_retrieve_duration", _elapsed())
                if settings.enable_cache:
                    _remember(payload)
                return payload

        # 3) Local filesystem fallback
        if settings.enable_local_fallback:
            payload = await self._retrieve_local(key)
            if payload is not None:
                if self.metrics:
                    self.metrics.record_operation("local_retrieve_success", 1)
                    self.metrics.record_duration("local_retrieve_duration", _elapsed())
                if settings.enable_cache:
                    _remember(payload)
                return payload

        if self.metrics:
            self.metrics.record_operation("retrieve_not_found", 1)
        return None

    except Exception as e:
        logger.error(f"Failed to retrieve data for key {key}: {e}")
        if self.metrics:
            self.metrics.record_operation("retrieve_error", 1)
        raise FileStorageError(f"Retrieval failed: {e}")
343
+
344
async def delete(self, key: str) -> bool:
    """
    Remove the data stored under a key from every active backend.

    The cache entry is dropped first (when caching is enabled); GCS is then
    asked to delete when a bucket handle is set, and local storage when
    fallback is enabled.

    Args:
        key: Storage key

    Returns:
        True only if no consulted backend reported a failed deletion.

    Raises:
        FileStorageError: If any deletion step raised.
    """
    if not self._initialized:
        await self.initialize()

    try:
        all_removed = True

        # Cache
        if self.config.enable_cache:
            self._cache.pop(key, None)
            self._cache_timestamps.pop(key, None)

        # Google Cloud Storage
        if self._gcs_bucket:
            if not await self._delete_gcs(key):
                all_removed = False
            elif self.metrics:
                self.metrics.record_operation("gcs_delete_success", 1)

        # Local filesystem
        if self.config.enable_local_fallback:
            if not await self._delete_local(key):
                all_removed = False
            elif self.metrics:
                self.metrics.record_operation("local_delete_success", 1)

        # Overall outcome metric
        if self.metrics:
            outcome = "delete_success" if all_removed else "delete_failure"
            self.metrics.record_operation(outcome, 1)

        return all_removed

    except Exception as e:
        logger.error(f"Failed to delete data for key {key}: {e}")
        if self.metrics:
            self.metrics.record_operation("delete_error", 1)
        raise FileStorageError(f"Deletion failed: {e}")
396
+
397
async def exists(self, key: str) -> bool:
    """
    Check if data exists for the given key.

    A cache entry younger than ``cache_ttl_seconds`` answers immediately.
    An expired entry (or one without a timestamp) is evicted, the same way
    ``retrieve()`` treats it, before the backends are consulted: GCS when
    a bucket handle is set, then local storage when fallback is enabled.

    Args:
        key: Storage key

    Returns:
        True if data exists

    Raises:
        FileStorageError: If any check raised; the original exception is
            chained as ``__cause__``.
    """
    if not self._initialized:
        await self.initialize()

    try:
        # Check cache first
        if self.config.enable_cache and key in self._cache:
            cache_time = self._cache_timestamps.get(key)
            if (
                cache_time is not None
                and (datetime.utcnow() - cache_time).total_seconds()
                < self.config.cache_ttl_seconds
            ):
                return True

            # Stale entry: drop it so it does not linger in memory.
            self._cache.pop(key, None)
            self._cache_timestamps.pop(key, None)

        # Check GCS
        if self._gcs_bucket and await self._exists_gcs(key):
            return True

        # Check local storage
        if self.config.enable_local_fallback:
            return await self._exists_local(key)

        return False

    except Exception as e:
        logger.error(f"Failed to check existence for key {key}: {e}")
        raise FileStorageError(f"Existence check failed: {e}") from e
435
+
436
async def list_keys(
    self, prefix: Optional[str] = None, limit: Optional[int] = None
) -> List[str]:
    """
    List storage keys with optional prefix filtering.

    Keys from GCS (when a bucket is configured) and from local storage
    (when local fallback is enabled) are merged and de-duplicated. The
    merged keys are returned in sorted order so that the result, and in
    particular the subset kept when ``limit`` truncates it, is
    deterministic.

    Args:
        prefix: Optional key prefix filter
        limit: Maximum number of keys to return; a falsy value
            (``None`` or ``0``) means no limit

    Returns:
        Sorted list of storage keys

    Raises:
        FileStorageError: If listing fails. The original exception is
            chained as ``__cause__``.
    """
    if not self._initialized:
        await self.initialize()

    try:
        keys = set()

        # Get keys from GCS
        if self._gcs_bucket:
            keys.update(await self._list_keys_gcs(prefix, limit))

        # Get keys from local storage
        if self.config.enable_local_fallback:
            keys.update(await self._list_keys_local(prefix, limit))

        # Sort before truncating: slicing a list built straight from a set
        # would keep an arbitrary subset that can differ between runs.
        keys_list = sorted(keys)
        if limit:
            keys_list = keys_list[:limit]

        return keys_list

    except Exception as e:
        logger.error(f"Failed to list keys: {e}")
        raise FileStorageError(f"Key listing failed: {e}") from e
475
+
476
+ # GCS implementation methods
477
+
478
+ async def _store_gcs(
479
+ self,
480
+ key: str,
481
+ data: bytes,
482
+ metadata: Optional[Dict[str, Any]],
483
+ compressed: bool,
484
+ ) -> bool:
485
+ """Store data in Google Cloud Storage."""
486
+ try:
487
+ blob = self._gcs_bucket.blob(key)
488
+
489
+ # Set metadata
490
+ if metadata:
491
+ blob.metadata = metadata
492
+ if compressed:
493
+ blob.content_encoding = "gzip"
494
+
495
+ # Upload data
496
+ blob.upload_from_string(data)
497
+ return True
498
+
499
+ except Exception as e:
500
+ logger.error(f"GCS store failed for key {key}: {e}")
501
+ return False
502
+
503
+ async def _retrieve_gcs(self, key: str) -> Optional[Any]:
504
+ """Retrieve data from Google Cloud Storage."""
505
+ try:
506
+ blob = self._gcs_bucket.blob(key)
507
+
508
+ if not blob.exists():
509
+ return None
510
+
511
+ # Download data
512
+ data = blob.download_as_bytes()
513
+
514
+ # Decompress if needed
515
+ if blob.content_encoding == "gzip":
516
+ data = gzip.decompress(data)
517
+
518
+ # Deserialize data
519
+ return await self._deserialize_data(data)
520
+
521
+ except NotFound:
522
+ return None
523
+ except Exception as e:
524
+ logger.error(f"GCS retrieve failed for key {key}: {e}")
525
+ return None
526
+
527
+ async def _delete_gcs(self, key: str) -> bool:
528
+ """Delete data from Google Cloud Storage."""
529
+ try:
530
+ blob = self._gcs_bucket.blob(key)
531
+ blob.delete()
532
+ return True
533
+
534
+ except NotFound:
535
+ return True # Already deleted
536
+ except Exception as e:
537
+ logger.error(f"GCS delete failed for key {key}: {e}")
538
+ return False
539
+
540
+ async def _exists_gcs(self, key: str) -> bool:
541
+ """Check if data exists in Google Cloud Storage."""
542
+ try:
543
+ blob = self._gcs_bucket.blob(key)
544
+ return blob.exists()
545
+
546
+ except Exception as e:
547
+ logger.error(f"GCS exists check failed for key {key}: {e}")
548
+ return False
549
+
550
+ async def _list_keys_gcs(self, prefix: Optional[str], limit: Optional[int]) -> List[str]:
551
+ """List keys from Google Cloud Storage."""
552
+ try:
553
+ blobs = self._gcs_bucket.list_blobs(prefix=prefix, max_results=limit)
554
+ return [blob.name for blob in blobs]
555
+
556
+ except Exception as e:
557
+ logger.error(f"GCS list keys failed: {e}")
558
+ return []
559
+
560
+ # Local storage implementation methods
561
+
562
async def _store_local(
    self,
    key: str,
    data: bytes,
    metadata: Optional[Dict[str, Any]],
    compressed: bool,
) -> bool:
    """
    Store data in local filesystem.

    The payload is written to ``<local_storage_path>/<key>``. A JSON
    sidecar (the same path with its suffix replaced by ``.metadata``)
    records the caller's metadata plus a ``"compressed"`` flag, which the
    local retrieve path reads to decide whether to gunzip the payload.

    Args:
        key: Storage key, used as a path relative to the storage root
        data: Bytes to write
        metadata: Optional JSON-serializable metadata
        compressed: Whether ``data`` is gzip-compressed

    Returns:
        True on success, False if anything raised (the error is logged)
    """
    try:
        file_path = Path(self.config.local_storage_path) / key
        file_path.parent.mkdir(parents=True, exist_ok=True)

        async with aiofiles.open(file_path, "wb") as f:
            await f.write(data)

        # Store metadata separately.
        # The sidecar is written whenever there is something to record:
        #   * metadata was supplied (original behaviour),
        #   * the payload is compressed - otherwise the flag would be lost
        #     and the data could not be decompressed on read,
        #   * a sidecar already exists - otherwise a stale "compressed"
        #     flag from an earlier write would survive this overwrite.
        # NOTE(review): with_suffix replaces an existing extension, so keys
        # that differ only by extension (e.g. "a.json" / "a.txt") map to
        # the same sidecar - confirm whether that is intended.
        metadata_path = file_path.with_suffix(".metadata")
        if metadata or compressed or metadata_path.exists():
            metadata_with_compression = {
                **(metadata or {}),
                "compressed": compressed,
            }
            async with aiofiles.open(metadata_path, "w") as f:
                await f.write(json.dumps(metadata_with_compression))

        return True

    except Exception as e:
        logger.error(f"Local store failed for key {key}: {e}")
        return False
592
+
593
+ async def _retrieve_local(self, key: str) -> Optional[Any]:
594
+ """Retrieve data from local filesystem."""
595
+ try:
596
+ file_path = Path(self.config.local_storage_path) / key
597
+
598
+ if not file_path.exists():
599
+ return None
600
+
601
+ async with aiofiles.open(file_path, "rb") as f:
602
+ data = await f.read()
603
+
604
+ # Check for compression metadata
605
+ metadata_path = file_path.with_suffix(".metadata")
606
+ compressed = False
607
+ if metadata_path.exists():
608
+ async with aiofiles.open(metadata_path, "r") as f:
609
+ metadata = json.loads(await f.read())
610
+ compressed = metadata.get("compressed", False)
611
+
612
+ # Decompress if needed
613
+ if compressed:
614
+ data = gzip.decompress(data)
615
+
616
+ # Deserialize data
617
+ return await self._deserialize_data(data)
618
+
619
+ except Exception as e:
620
+ logger.error(f"Local retrieve failed for key {key}: {e}")
621
+ return None
622
+
623
+ async def _delete_local(self, key: str) -> bool:
624
+ """Delete data from local filesystem."""
625
+ try:
626
+ file_path = Path(self.config.local_storage_path) / key
627
+ metadata_path = file_path.with_suffix(".metadata")
628
+
629
+ success = True
630
+ if file_path.exists():
631
+ file_path.unlink()
632
+
633
+ if metadata_path.exists():
634
+ metadata_path.unlink()
635
+
636
+ return success
637
+
638
+ except Exception as e:
639
+ logger.error(f"Local delete failed for key {key}: {e}")
640
+ return False
641
+
642
+ async def _exists_local(self, key: str) -> bool:
643
+ """Check if data exists in local filesystem."""
644
+ try:
645
+ file_path = Path(self.config.local_storage_path) / key
646
+ return file_path.exists()
647
+
648
+ except Exception as e:
649
+ logger.error(f"Local exists check failed for key {key}: {e}")
650
+ return False
651
+
652
+ async def _list_keys_local(self, prefix: Optional[str], limit: Optional[int]) -> List[str]:
653
+ """List keys from local filesystem."""
654
+ try:
655
+ storage_path = Path(self.config.local_storage_path)
656
+ if not storage_path.exists():
657
+ return []
658
+
659
+ keys = []
660
+ for file_path in storage_path.rglob("*"):
661
+ if file_path.is_file() and not file_path.name.endswith(".metadata"):
662
+ key = str(file_path.relative_to(storage_path))
663
+ if not prefix or key.startswith(prefix):
664
+ keys.append(key)
665
+ if limit and len(keys) >= limit:
666
+ break
667
+
668
+ return keys
669
+
670
+ except Exception as e:
671
+ logger.error(f"Local list keys failed: {e}")
672
+ return []
673
+
674
+ # Utility methods
675
+
676
+ async def _serialize_data(self, data: Union[str, bytes, Dict[str, Any]]) -> bytes:
677
+ """Serialize data for storage."""
678
+ if isinstance(data, bytes):
679
+ return data
680
+ elif isinstance(data, str):
681
+ return data.encode("utf-8")
682
+ else:
683
+ # Use pickle for complex objects
684
+ return pickle.dumps(data)
685
+
686
+ async def _deserialize_data(self, data: bytes) -> Any:
687
+ """Deserialize data from storage."""
688
+ try:
689
+ # Try to deserialize as pickle first
690
+ return pickle.loads(data)
691
+ except Exception:
692
+ try:
693
+ # Try as JSON
694
+ return json.loads(data.decode("utf-8"))
695
+ except Exception:
696
+ # Return as string
697
+ return data.decode("utf-8")
698
+
699
+ async def _cleanup_cache(self):
700
+ """Clean up expired cache entries."""
701
+ if not self.config.enable_cache:
702
+ return
703
+
704
+ current_time = datetime.utcnow()
705
+ expired_keys = []
706
+
707
+ for key, timestamp in self._cache_timestamps.items():
708
+ if (current_time - timestamp).total_seconds() > self.config.cache_ttl_seconds:
709
+ expired_keys.append(key)
710
+
711
+ for key in expired_keys:
712
+ self._cache.pop(key, None)
713
+ self._cache_timestamps.pop(key, None)
714
+
715
def get_stats(self) -> Dict[str, Any]:
    """
    Build a snapshot of the storage state.

    Returns:
        Dict with the initialization flag, whether a GCS bucket is set,
        the local-fallback and cache config flags, the number of cached
        entries, and the metrics summary (an empty dict when no metrics
        object with ``get_metrics_summary`` is attached)
    """
    stats = {
        "initialized": self._initialized,
        "gcs_available": self._gcs_bucket is not None,
        "local_fallback_enabled": self.config.enable_local_fallback,
        "cache_enabled": self.config.enable_cache,
        "cache_size": len(self._cache),
    }

    if self.metrics and hasattr(self.metrics, "get_metrics_summary"):
        stats["metrics"] = self.metrics.get_metrics_summary()
    else:
        stats["metrics"] = {}

    return stats
729
+
730
+
731
# Global instance
_file_storage_instance = None


def get_file_storage(config: Optional[Dict[str, Any]] = None) -> FileStorage:
    """
    Return the module-wide FileStorage instance, creating it on first use.

    Args:
        config: Configuration used only when the instance is created. When
            omitted, ``file_storage_config`` from the aiecs settings is
            used. It is ignored once an instance already exists.

    Returns:
        The shared FileStorage instance
    """
    global _file_storage_instance

    # Fast path: the instance was already built by an earlier call
    if _file_storage_instance is not None:
        return _file_storage_instance

    if config is None:
        # Imported here rather than at module level, as in the original
        from aiecs.config.config import get_settings

        config = get_settings().file_storage_config

    _file_storage_instance = FileStorage(config)
    return _file_storage_instance
746
+
747
+
748
async def initialize_file_storage(
    config: Optional[Dict[str, Any]] = None,
) -> FileStorage:
    """
    Fetch the shared file storage instance and run its ``initialize()``.

    Args:
        config: Passed through to ``get_file_storage``

    Returns:
        The shared FileStorage instance, after ``initialize()`` was awaited
    """
    shared = get_file_storage(config)
    await shared.initialize()
    return shared