atlas-chat 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (250) hide show
  1. atlas/__init__.py +40 -0
  2. atlas/application/__init__.py +7 -0
  3. atlas/application/chat/__init__.py +7 -0
  4. atlas/application/chat/agent/__init__.py +10 -0
  5. atlas/application/chat/agent/act_loop.py +179 -0
  6. atlas/application/chat/agent/factory.py +142 -0
  7. atlas/application/chat/agent/protocols.py +46 -0
  8. atlas/application/chat/agent/react_loop.py +338 -0
  9. atlas/application/chat/agent/think_act_loop.py +171 -0
  10. atlas/application/chat/approval_manager.py +151 -0
  11. atlas/application/chat/elicitation_manager.py +191 -0
  12. atlas/application/chat/events/__init__.py +1 -0
  13. atlas/application/chat/events/agent_event_relay.py +112 -0
  14. atlas/application/chat/modes/__init__.py +1 -0
  15. atlas/application/chat/modes/agent.py +125 -0
  16. atlas/application/chat/modes/plain.py +74 -0
  17. atlas/application/chat/modes/rag.py +81 -0
  18. atlas/application/chat/modes/tools.py +179 -0
  19. atlas/application/chat/orchestrator.py +213 -0
  20. atlas/application/chat/policies/__init__.py +1 -0
  21. atlas/application/chat/policies/tool_authorization.py +99 -0
  22. atlas/application/chat/preprocessors/__init__.py +1 -0
  23. atlas/application/chat/preprocessors/message_builder.py +92 -0
  24. atlas/application/chat/preprocessors/prompt_override_service.py +104 -0
  25. atlas/application/chat/service.py +454 -0
  26. atlas/application/chat/utilities/__init__.py +6 -0
  27. atlas/application/chat/utilities/error_handler.py +367 -0
  28. atlas/application/chat/utilities/event_notifier.py +546 -0
  29. atlas/application/chat/utilities/file_processor.py +613 -0
  30. atlas/application/chat/utilities/tool_executor.py +789 -0
  31. atlas/atlas_chat_cli.py +347 -0
  32. atlas/atlas_client.py +238 -0
  33. atlas/core/__init__.py +0 -0
  34. atlas/core/auth.py +205 -0
  35. atlas/core/authorization_manager.py +27 -0
  36. atlas/core/capabilities.py +123 -0
  37. atlas/core/compliance.py +215 -0
  38. atlas/core/domain_whitelist.py +147 -0
  39. atlas/core/domain_whitelist_middleware.py +82 -0
  40. atlas/core/http_client.py +28 -0
  41. atlas/core/log_sanitizer.py +102 -0
  42. atlas/core/metrics_logger.py +59 -0
  43. atlas/core/middleware.py +131 -0
  44. atlas/core/otel_config.py +242 -0
  45. atlas/core/prompt_risk.py +200 -0
  46. atlas/core/rate_limit.py +0 -0
  47. atlas/core/rate_limit_middleware.py +64 -0
  48. atlas/core/security_headers_middleware.py +51 -0
  49. atlas/domain/__init__.py +37 -0
  50. atlas/domain/chat/__init__.py +1 -0
  51. atlas/domain/chat/dtos.py +85 -0
  52. atlas/domain/errors.py +96 -0
  53. atlas/domain/messages/__init__.py +12 -0
  54. atlas/domain/messages/models.py +160 -0
  55. atlas/domain/rag_mcp_service.py +664 -0
  56. atlas/domain/sessions/__init__.py +7 -0
  57. atlas/domain/sessions/models.py +36 -0
  58. atlas/domain/unified_rag_service.py +371 -0
  59. atlas/infrastructure/__init__.py +10 -0
  60. atlas/infrastructure/app_factory.py +135 -0
  61. atlas/infrastructure/events/__init__.py +1 -0
  62. atlas/infrastructure/events/cli_event_publisher.py +140 -0
  63. atlas/infrastructure/events/websocket_publisher.py +140 -0
  64. atlas/infrastructure/sessions/in_memory_repository.py +56 -0
  65. atlas/infrastructure/transport/__init__.py +7 -0
  66. atlas/infrastructure/transport/websocket_connection_adapter.py +33 -0
  67. atlas/init_cli.py +226 -0
  68. atlas/interfaces/__init__.py +15 -0
  69. atlas/interfaces/events.py +134 -0
  70. atlas/interfaces/llm.py +54 -0
  71. atlas/interfaces/rag.py +40 -0
  72. atlas/interfaces/sessions.py +75 -0
  73. atlas/interfaces/tools.py +57 -0
  74. atlas/interfaces/transport.py +24 -0
  75. atlas/main.py +564 -0
  76. atlas/mcp/api_key_demo/README.md +76 -0
  77. atlas/mcp/api_key_demo/main.py +172 -0
  78. atlas/mcp/api_key_demo/run.sh +56 -0
  79. atlas/mcp/basictable/main.py +147 -0
  80. atlas/mcp/calculator/main.py +149 -0
  81. atlas/mcp/code-executor/execution_engine.py +98 -0
  82. atlas/mcp/code-executor/execution_environment.py +95 -0
  83. atlas/mcp/code-executor/main.py +528 -0
  84. atlas/mcp/code-executor/result_processing.py +276 -0
  85. atlas/mcp/code-executor/script_generation.py +195 -0
  86. atlas/mcp/code-executor/security_checker.py +140 -0
  87. atlas/mcp/corporate_cars/main.py +437 -0
  88. atlas/mcp/csv_reporter/main.py +545 -0
  89. atlas/mcp/duckduckgo/main.py +182 -0
  90. atlas/mcp/elicitation_demo/README.md +171 -0
  91. atlas/mcp/elicitation_demo/main.py +262 -0
  92. atlas/mcp/env-demo/README.md +158 -0
  93. atlas/mcp/env-demo/main.py +199 -0
  94. atlas/mcp/file_size_test/main.py +284 -0
  95. atlas/mcp/filesystem/main.py +348 -0
  96. atlas/mcp/image_demo/main.py +113 -0
  97. atlas/mcp/image_demo/requirements.txt +4 -0
  98. atlas/mcp/logging_demo/README.md +72 -0
  99. atlas/mcp/logging_demo/main.py +103 -0
  100. atlas/mcp/many_tools_demo/main.py +50 -0
  101. atlas/mcp/order_database/__init__.py +0 -0
  102. atlas/mcp/order_database/main.py +369 -0
  103. atlas/mcp/order_database/signal_data.csv +1001 -0
  104. atlas/mcp/pdfbasic/main.py +394 -0
  105. atlas/mcp/pptx_generator/main.py +760 -0
  106. atlas/mcp/pptx_generator/requirements.txt +13 -0
  107. atlas/mcp/pptx_generator/run_test.sh +1 -0
  108. atlas/mcp/pptx_generator/test_pptx_generator_security.py +169 -0
  109. atlas/mcp/progress_demo/main.py +167 -0
  110. atlas/mcp/progress_updates_demo/QUICKSTART.md +273 -0
  111. atlas/mcp/progress_updates_demo/README.md +120 -0
  112. atlas/mcp/progress_updates_demo/main.py +497 -0
  113. atlas/mcp/prompts/main.py +222 -0
  114. atlas/mcp/public_demo/main.py +189 -0
  115. atlas/mcp/sampling_demo/README.md +169 -0
  116. atlas/mcp/sampling_demo/main.py +234 -0
  117. atlas/mcp/thinking/main.py +77 -0
  118. atlas/mcp/tool_planner/main.py +240 -0
  119. atlas/mcp/ui-demo/badmesh.png +0 -0
  120. atlas/mcp/ui-demo/main.py +383 -0
  121. atlas/mcp/ui-demo/templates/button_demo.html +32 -0
  122. atlas/mcp/ui-demo/templates/data_visualization.html +32 -0
  123. atlas/mcp/ui-demo/templates/form_demo.html +28 -0
  124. atlas/mcp/username-override-demo/README.md +320 -0
  125. atlas/mcp/username-override-demo/main.py +308 -0
  126. atlas/modules/__init__.py +0 -0
  127. atlas/modules/config/__init__.py +34 -0
  128. atlas/modules/config/cli.py +231 -0
  129. atlas/modules/config/config_manager.py +1096 -0
  130. atlas/modules/file_storage/__init__.py +22 -0
  131. atlas/modules/file_storage/cli.py +330 -0
  132. atlas/modules/file_storage/content_extractor.py +290 -0
  133. atlas/modules/file_storage/manager.py +295 -0
  134. atlas/modules/file_storage/mock_s3_client.py +402 -0
  135. atlas/modules/file_storage/s3_client.py +417 -0
  136. atlas/modules/llm/__init__.py +19 -0
  137. atlas/modules/llm/caller.py +287 -0
  138. atlas/modules/llm/litellm_caller.py +675 -0
  139. atlas/modules/llm/models.py +19 -0
  140. atlas/modules/mcp_tools/__init__.py +17 -0
  141. atlas/modules/mcp_tools/client.py +2123 -0
  142. atlas/modules/mcp_tools/token_storage.py +556 -0
  143. atlas/modules/prompts/prompt_provider.py +130 -0
  144. atlas/modules/rag/__init__.py +24 -0
  145. atlas/modules/rag/atlas_rag_client.py +336 -0
  146. atlas/modules/rag/client.py +129 -0
  147. atlas/routes/admin_routes.py +865 -0
  148. atlas/routes/config_routes.py +484 -0
  149. atlas/routes/feedback_routes.py +361 -0
  150. atlas/routes/files_routes.py +274 -0
  151. atlas/routes/health_routes.py +40 -0
  152. atlas/routes/mcp_auth_routes.py +223 -0
  153. atlas/server_cli.py +164 -0
  154. atlas/tests/conftest.py +20 -0
  155. atlas/tests/integration/test_mcp_auth_integration.py +152 -0
  156. atlas/tests/manual_test_sampling.py +87 -0
  157. atlas/tests/modules/mcp_tools/test_client_auth.py +226 -0
  158. atlas/tests/modules/mcp_tools/test_client_env.py +191 -0
  159. atlas/tests/test_admin_mcp_server_management_routes.py +141 -0
  160. atlas/tests/test_agent_roa.py +135 -0
  161. atlas/tests/test_app_factory_smoke.py +47 -0
  162. atlas/tests/test_approval_manager.py +439 -0
  163. atlas/tests/test_atlas_client.py +188 -0
  164. atlas/tests/test_atlas_rag_client.py +447 -0
  165. atlas/tests/test_atlas_rag_integration.py +224 -0
  166. atlas/tests/test_attach_file_flow.py +287 -0
  167. atlas/tests/test_auth_utils.py +165 -0
  168. atlas/tests/test_backend_public_url.py +185 -0
  169. atlas/tests/test_banner_logging.py +287 -0
  170. atlas/tests/test_capability_tokens_and_injection.py +203 -0
  171. atlas/tests/test_compliance_level.py +54 -0
  172. atlas/tests/test_compliance_manager.py +253 -0
  173. atlas/tests/test_config_manager.py +617 -0
  174. atlas/tests/test_config_manager_paths.py +12 -0
  175. atlas/tests/test_core_auth.py +18 -0
  176. atlas/tests/test_core_utils.py +190 -0
  177. atlas/tests/test_docker_env_sync.py +202 -0
  178. atlas/tests/test_domain_errors.py +329 -0
  179. atlas/tests/test_domain_whitelist.py +359 -0
  180. atlas/tests/test_elicitation_manager.py +408 -0
  181. atlas/tests/test_elicitation_routing.py +296 -0
  182. atlas/tests/test_env_demo_server.py +88 -0
  183. atlas/tests/test_error_classification.py +113 -0
  184. atlas/tests/test_error_flow_integration.py +116 -0
  185. atlas/tests/test_feedback_routes.py +333 -0
  186. atlas/tests/test_file_content_extraction.py +1134 -0
  187. atlas/tests/test_file_extraction_routes.py +158 -0
  188. atlas/tests/test_file_library.py +107 -0
  189. atlas/tests/test_file_manager_unit.py +18 -0
  190. atlas/tests/test_health_route.py +49 -0
  191. atlas/tests/test_http_client_stub.py +8 -0
  192. atlas/tests/test_imports_smoke.py +30 -0
  193. atlas/tests/test_interfaces_llm_response.py +9 -0
  194. atlas/tests/test_issue_access_denied_fix.py +136 -0
  195. atlas/tests/test_llm_env_expansion.py +836 -0
  196. atlas/tests/test_log_level_sensitive_data.py +285 -0
  197. atlas/tests/test_mcp_auth_routes.py +341 -0
  198. atlas/tests/test_mcp_client_auth.py +331 -0
  199. atlas/tests/test_mcp_data_injection.py +270 -0
  200. atlas/tests/test_mcp_get_authorized_servers.py +95 -0
  201. atlas/tests/test_mcp_hot_reload.py +512 -0
  202. atlas/tests/test_mcp_image_content.py +424 -0
  203. atlas/tests/test_mcp_logging.py +172 -0
  204. atlas/tests/test_mcp_progress_updates.py +313 -0
  205. atlas/tests/test_mcp_prompt_override_system_prompt.py +102 -0
  206. atlas/tests/test_mcp_prompts_server.py +39 -0
  207. atlas/tests/test_mcp_tool_result_parsing.py +296 -0
  208. atlas/tests/test_metrics_logger.py +56 -0
  209. atlas/tests/test_middleware_auth.py +379 -0
  210. atlas/tests/test_prompt_risk_and_acl.py +141 -0
  211. atlas/tests/test_rag_mcp_aggregator.py +204 -0
  212. atlas/tests/test_rag_mcp_service.py +224 -0
  213. atlas/tests/test_rate_limit_middleware.py +45 -0
  214. atlas/tests/test_routes_config_smoke.py +60 -0
  215. atlas/tests/test_routes_files_download_token.py +41 -0
  216. atlas/tests/test_routes_files_health.py +18 -0
  217. atlas/tests/test_runtime_imports.py +53 -0
  218. atlas/tests/test_sampling_integration.py +482 -0
  219. atlas/tests/test_security_admin_routes.py +61 -0
  220. atlas/tests/test_security_capability_tokens.py +65 -0
  221. atlas/tests/test_security_file_stats_scope.py +21 -0
  222. atlas/tests/test_security_header_injection.py +191 -0
  223. atlas/tests/test_security_headers_and_filename.py +63 -0
  224. atlas/tests/test_shared_session_repository.py +101 -0
  225. atlas/tests/test_system_prompt_loading.py +181 -0
  226. atlas/tests/test_token_storage.py +505 -0
  227. atlas/tests/test_tool_approval_config.py +93 -0
  228. atlas/tests/test_tool_approval_utils.py +356 -0
  229. atlas/tests/test_tool_authorization_group_filtering.py +223 -0
  230. atlas/tests/test_tool_details_in_config.py +108 -0
  231. atlas/tests/test_tool_planner.py +300 -0
  232. atlas/tests/test_unified_rag_service.py +398 -0
  233. atlas/tests/test_username_override_in_approval.py +258 -0
  234. atlas/tests/test_websocket_auth_header.py +168 -0
  235. atlas/version.py +6 -0
  236. atlas_chat-0.1.0.data/data/.env.example +253 -0
  237. atlas_chat-0.1.0.data/data/config/defaults/compliance-levels.json +44 -0
  238. atlas_chat-0.1.0.data/data/config/defaults/domain-whitelist.json +123 -0
  239. atlas_chat-0.1.0.data/data/config/defaults/file-extractors.json +74 -0
  240. atlas_chat-0.1.0.data/data/config/defaults/help-config.json +198 -0
  241. atlas_chat-0.1.0.data/data/config/defaults/llmconfig-buggy.yml +11 -0
  242. atlas_chat-0.1.0.data/data/config/defaults/llmconfig.yml +19 -0
  243. atlas_chat-0.1.0.data/data/config/defaults/mcp.json +138 -0
  244. atlas_chat-0.1.0.data/data/config/defaults/rag-sources.json +17 -0
  245. atlas_chat-0.1.0.data/data/config/defaults/splash-config.json +16 -0
  246. atlas_chat-0.1.0.dist-info/METADATA +236 -0
  247. atlas_chat-0.1.0.dist-info/RECORD +250 -0
  248. atlas_chat-0.1.0.dist-info/WHEEL +5 -0
  249. atlas_chat-0.1.0.dist-info/entry_points.txt +4 -0
  250. atlas_chat-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1134 @@
1
+ """Tests for file content extraction feature.
2
+
3
+ Tests the FileExtractorConfig, FileExtractorsConfig models, FileContentExtractor class,
4
+ and related configuration functionality.
5
+ """
6
+
7
+ from unittest.mock import AsyncMock, Mock, patch
8
+
9
+ import httpx
10
+ import pytest
11
+
12
+ from atlas.modules.config.config_manager import (
13
+ ConfigManager,
14
+ FileExtractorConfig,
15
+ FileExtractorsConfig,
16
+ )
17
+ from atlas.modules.file_storage.content_extractor import (
18
+ ExtractionResult,
19
+ FileContentExtractor,
20
+ )
21
+
22
+
23
class TestFileExtractorConfig:
    """Checks on the FileExtractorConfig model: defaults, overrides, optional fields."""

    def test_file_extractor_config_with_defaults(self):
        """Only ``url`` is supplied; every other field must take its default."""
        cfg = FileExtractorConfig(url="http://localhost:8010/extract")

        expected_values = {
            "url": "http://localhost:8010/extract",
            "method": "POST",
            "timeout_seconds": 30,
            "max_file_size_mb": 50,
            "preview_chars": 2000,
            "request_format": "base64",
            "response_field": "text",
        }
        for field_name, expected in expected_values.items():
            assert getattr(cfg, field_name) == expected
        # Identity check on purpose: the flag must be the bool True itself.
        assert cfg.enabled is True

    def test_file_extractor_config_with_custom_values(self):
        """Every field passed explicitly must be stored as given."""
        overrides = {
            "url": "http://custom-service:9000/ocr",
            "method": "PUT",
            "timeout_seconds": 120,
            "max_file_size_mb": 100,
            "preview_chars": 5000,
            "request_format": "url",
            "response_field": "content",
        }
        cfg = FileExtractorConfig(enabled=False, **overrides)

        for field_name, expected in overrides.items():
            assert getattr(cfg, field_name) == expected
        assert cfg.enabled is False

    def test_file_extractor_config_preview_chars_optional(self):
        """Passing ``preview_chars=None`` is accepted and kept as None."""
        cfg = FileExtractorConfig(
            url="http://localhost:8010/extract",
            preview_chars=None,
        )

        assert cfg.preview_chars is None
69
+
70
+
71
class TestFileExtractorsConfig:
    """Checks on the FileExtractorsConfig model: defaults and nested extractor parsing."""

    def test_file_extractors_config_with_defaults(self):
        """A config built with no arguments is enabled, uses "full", and has empty mappings."""
        cfg = FileExtractorsConfig()

        assert cfg.enabled is True
        assert cfg.default_behavior == "full"
        for mapping_name in ("extractors", "extension_mapping", "mime_mapping"):
            assert getattr(cfg, mapping_name) == {}

    def test_file_extractors_config_with_extractors(self):
        """Nested extractor dicts and both mapping tables are parsed from keyword arguments."""
        extractor_specs = {
            "pdf-text": {
                "url": "http://localhost:8010/extract",
                "timeout_seconds": 60,
            },
            "image-vision": {
                "url": "http://localhost:8011/analyze",
                "enabled": False,
            },
        }
        by_extension = {".pdf": "pdf-text", ".png": "image-vision"}
        by_mime = {"application/pdf": "pdf-text", "image/png": "image-vision"}

        cfg = FileExtractorsConfig(
            enabled=True,
            default_behavior="attach_only",
            extractors=extractor_specs,
            extension_mapping=by_extension,
            mime_mapping=by_mime,
        )

        assert cfg.enabled is True
        # The legacy "attach_only" value is expected to come back as "none".
        assert cfg.default_behavior == "none"
        assert len(cfg.extractors) == 2

        pdf_cfg = cfg.extractors["pdf-text"]
        assert isinstance(pdf_cfg, FileExtractorConfig)
        assert pdf_cfg.url == "http://localhost:8010/extract"
        assert pdf_cfg.timeout_seconds == 60

        assert cfg.extractors["image-vision"].enabled is False
        assert cfg.extension_mapping[".pdf"] == "pdf-text"
        assert cfg.mime_mapping["application/pdf"] == "pdf-text"

    def test_file_extractors_config_validator_converts_dicts(self):
        """A plain dict under ``extractors`` ends up as a FileExtractorConfig instance."""
        cfg = FileExtractorsConfig(
            enabled=True,
            extractors={"test": {"url": "http://test.local/extract"}},
        )

        converted = cfg.extractors["test"]
        assert isinstance(converted, FileExtractorConfig)
        assert converted.url == "http://test.local/extract"
132
+
133
+
134
class TestFileContentExtractor:
    """Synchronous FileContentExtractor tests: config access, enablement, extractor lookup."""

    @staticmethod
    def _settings_patch():
        """Return a patcher for ``get_app_settings`` as referenced by the content_extractor module."""
        return patch('atlas.modules.file_storage.content_extractor.get_app_settings')

    @staticmethod
    def _pdf_only_extractor(extractor_enabled):
        """Build an extractor whose config maps ``.pdf`` to a single "pdf-text" entry."""
        pdf_cfg = FileExtractorConfig(url="http://localhost/pdf", enabled=extractor_enabled)
        cfg = FileExtractorsConfig(
            enabled=True,
            extractors={"pdf-text": pdf_cfg},
            extension_mapping={".pdf": "pdf-text"},
        )
        return FileContentExtractor(config=cfg)

    def test_extractor_initialization_with_config(self):
        """A config passed to the constructor is exposed unchanged (same object)."""
        supplied = FileExtractorsConfig(enabled=True)

        assert FileContentExtractor(config=supplied).config is supplied

    def test_extractor_lazy_loads_config(self):
        """Without an explicit config, reading ``.config`` yields a FileExtractorsConfig."""
        loaded = FileContentExtractor().config

        assert loaded is not None
        assert isinstance(loaded, FileExtractorsConfig)

    def test_is_enabled_checks_both_flags(self):
        """With config enabled, is_enabled() follows the application feature flag."""
        subject = FileContentExtractor(config=FileExtractorsConfig(enabled=True))

        with self._settings_patch() as settings_mock:
            # First with the feature flag on, then with it off.
            for flag_state in (True, False):
                settings_mock.return_value.feature_file_content_extraction_enabled = flag_state
                assert subject.is_enabled() is flag_state

    def test_is_enabled_config_disabled(self):
        """A disabled config makes is_enabled() False even when the feature flag is on."""
        subject = FileContentExtractor(config=FileExtractorsConfig(enabled=False))

        with self._settings_patch() as settings_mock:
            settings_mock.return_value.feature_file_content_extraction_enabled = True
            assert subject.is_enabled() is False

    def test_get_default_behavior(self):
        """get_default_behavior() reports the config value; "attach_only" comes back as "none"."""
        subject = FileContentExtractor(
            config=FileExtractorsConfig(default_behavior="attach_only")
        )

        assert subject.get_default_behavior() == "none"

    def test_get_default_behavior_preview(self):
        """The "preview" mode is returned as-is."""
        subject = FileContentExtractor(
            config=FileExtractorsConfig(default_behavior="preview")
        )

        assert subject.get_default_behavior() == "preview"

    def test_legacy_extract_normalizes_to_full(self):
        """The legacy value "extract" is stored as "full"."""
        assert FileExtractorsConfig(default_behavior="extract").default_behavior == "full"

    def test_legacy_attach_only_normalizes_to_none(self):
        """The legacy value "attach_only" is stored as "none"."""
        assert FileExtractorsConfig(default_behavior="attach_only").default_behavior == "none"

    def test_get_extractor_for_file_by_extension(self):
        """A filename ending in a mapped extension resolves to that extractor's config."""
        subject = self._pdf_only_extractor(extractor_enabled=True)

        with self._settings_patch() as settings_mock:
            settings_mock.return_value.feature_file_content_extraction_enabled = True

            found = subject.get_extractor_for_file("document.pdf")

            assert found is not None
            assert found.url == "http://localhost/pdf"

    def test_get_extractor_for_file_by_mime_fallback(self):
        """With no extension mapping, the supplied MIME type selects the extractor."""
        cfg = FileExtractorsConfig(
            enabled=True,
            extractors={
                "pdf-text": FileExtractorConfig(url="http://localhost/pdf", enabled=True),
            },
            extension_mapping={},
            mime_mapping={"application/pdf": "pdf-text"},
        )
        subject = FileContentExtractor(config=cfg)

        with self._settings_patch() as settings_mock:
            settings_mock.return_value.feature_file_content_extraction_enabled = True

            found = subject.get_extractor_for_file(
                "document.xyz", mime_type="application/pdf"
            )

            assert found is not None
            assert found.url == "http://localhost/pdf"

    def test_get_extractor_for_file_returns_none_if_disabled(self):
        """A mapped extractor whose own ``enabled`` is False is not returned."""
        subject = self._pdf_only_extractor(extractor_enabled=False)

        with self._settings_patch() as settings_mock:
            settings_mock.return_value.feature_file_content_extraction_enabled = True

            assert subject.get_extractor_for_file("document.pdf") is None

    def test_get_extractor_for_file_returns_none_if_no_mapping(self):
        """With no extractors and no mappings, the lookup returns None."""
        subject = FileContentExtractor(
            config=FileExtractorsConfig(enabled=True, extractors={}, extension_mapping={})
        )

        with self._settings_patch() as settings_mock:
            settings_mock.return_value.feature_file_content_extraction_enabled = True

            assert subject.get_extractor_for_file("document.xyz") is None

    def test_can_extract_returns_boolean(self):
        """can_extract() is True for a mapped extension and False for an unmapped one."""
        subject = self._pdf_only_extractor(extractor_enabled=True)

        with self._settings_patch() as settings_mock:
            settings_mock.return_value.feature_file_content_extraction_enabled = True

            assert subject.can_extract("document.pdf") is True
            assert subject.can_extract("document.xyz") is False

    def test_get_supported_extensions(self):
        """Only extensions mapped to an enabled extractor are listed."""
        cfg = FileExtractorsConfig(
            enabled=True,
            extractors={
                "pdf-text": FileExtractorConfig(url="http://localhost/pdf", enabled=True),
                "image-vision": FileExtractorConfig(url="http://localhost/img", enabled=False),
            },
            extension_mapping={
                ".pdf": "pdf-text",
                ".png": "image-vision",  # points at the disabled extractor
            },
        )
        subject = FileContentExtractor(config=cfg)

        with self._settings_patch() as settings_mock:
            settings_mock.return_value.feature_file_content_extraction_enabled = True

            supported = subject.get_supported_extensions()

            assert ".pdf" in supported
            assert ".png" not in supported

    def test_get_supported_extensions_empty_when_disabled(self):
        """A disabled config yields an empty extension list."""
        subject = FileContentExtractor(config=FileExtractorsConfig(enabled=False))

        with self._settings_patch() as settings_mock:
            settings_mock.return_value.feature_file_content_extraction_enabled = True

            assert subject.get_supported_extensions() == []
320
+
321
+
322
+ class TestFileContentExtractorAsync:
323
+ """Test FileContentExtractor async extraction methods."""
324
+
325
+ @pytest.mark.asyncio
326
+ async def test_extract_content_success(self):
327
+ """extract_content should return successful result on 200 response."""
328
+ config = FileExtractorsConfig(
329
+ enabled=True,
330
+ extractors={
331
+ "pdf-text": FileExtractorConfig(
332
+ url="http://localhost:8010/extract",
333
+ enabled=True,
334
+ response_field="text",
335
+ preview_chars=100
336
+ )
337
+ },
338
+ extension_mapping={".pdf": "pdf-text"}
339
+ )
340
+ extractor = FileContentExtractor(config=config)
341
+
342
+ mock_response = Mock()
343
+ mock_response.status_code = 200
344
+ mock_response.json.return_value = {
345
+ "success": True,
346
+ "text": "This is the extracted content from the PDF document.",
347
+ "metadata": {"pages": 5}
348
+ }
349
+
350
+ with patch('atlas.modules.file_storage.content_extractor.get_app_settings') as mock_settings:
351
+ mock_settings.return_value.feature_file_content_extraction_enabled = True
352
+
353
+ with patch('httpx.AsyncClient') as mock_client_class:
354
+ mock_client = AsyncMock()
355
+ mock_client.request = AsyncMock(return_value=mock_response)
356
+ mock_client.__aenter__ = AsyncMock(return_value=mock_client)
357
+ mock_client.__aexit__ = AsyncMock(return_value=None)
358
+ mock_client_class.return_value = mock_client
359
+
360
+ result = await extractor.extract_content(
361
+ filename="document.pdf",
362
+ content_base64="dGVzdCBjb250ZW50" # "test content" in base64
363
+ )
364
+
365
+ assert result.success is True
366
+ assert result.content == "This is the extracted content from the PDF document."
367
+ assert result.metadata == {"pages": 5}
368
+
369
+ @pytest.mark.asyncio
370
+ async def test_extract_content_no_extractor(self):
371
+ """extract_content should return error when no extractor available."""
372
+ config = FileExtractorsConfig(enabled=True, extractors={})
373
+ extractor = FileContentExtractor(config=config)
374
+
375
+ with patch('atlas.modules.file_storage.content_extractor.get_app_settings') as mock_settings:
376
+ mock_settings.return_value.feature_file_content_extraction_enabled = True
377
+
378
+ result = await extractor.extract_content(
379
+ filename="document.xyz",
380
+ content_base64="dGVzdA=="
381
+ )
382
+
383
+ assert result.success is False
384
+ assert "No extractor available" in result.error
385
+
386
+ @pytest.mark.asyncio
387
+ async def test_extract_content_file_too_large(self):
388
+ """extract_content should reject files exceeding size limit."""
389
+ config = FileExtractorsConfig(
390
+ enabled=True,
391
+ extractors={
392
+ "pdf-text": FileExtractorConfig(
393
+ url="http://localhost:8010/extract",
394
+ enabled=True,
395
+ max_file_size_mb=1 # 1MB limit
396
+ )
397
+ },
398
+ extension_mapping={".pdf": "pdf-text"}
399
+ )
400
+ extractor = FileContentExtractor(config=config)
401
+
402
+ # Create a base64 string that would decode to more than 1MB
403
+ large_content = "A" * (2 * 1024 * 1024) # ~1.5MB when decoded
404
+
405
+ with patch('atlas.modules.file_storage.content_extractor.get_app_settings') as mock_settings:
406
+ mock_settings.return_value.feature_file_content_extraction_enabled = True
407
+
408
+ result = await extractor.extract_content(
409
+ filename="large.pdf",
410
+ content_base64=large_content
411
+ )
412
+
413
+ assert result.success is False
414
+ assert "File too large" in result.error
415
+
416
+ @pytest.mark.asyncio
417
+ async def test_extract_content_http_error(self):
418
+ """extract_content should handle HTTP errors gracefully."""
419
+ config = FileExtractorsConfig(
420
+ enabled=True,
421
+ extractors={
422
+ "pdf-text": FileExtractorConfig(
423
+ url="http://localhost:8010/extract",
424
+ enabled=True
425
+ )
426
+ },
427
+ extension_mapping={".pdf": "pdf-text"}
428
+ )
429
+ extractor = FileContentExtractor(config=config)
430
+
431
+ mock_response = Mock()
432
+ mock_response.status_code = 500
433
+
434
+ with patch('atlas.modules.file_storage.content_extractor.get_app_settings') as mock_settings:
435
+ mock_settings.return_value.feature_file_content_extraction_enabled = True
436
+
437
+ with patch('httpx.AsyncClient') as mock_client_class:
438
+ mock_client = AsyncMock()
439
+ mock_client.request = AsyncMock(return_value=mock_response)
440
+ mock_client.__aenter__ = AsyncMock(return_value=mock_client)
441
+ mock_client.__aexit__ = AsyncMock(return_value=None)
442
+ mock_client_class.return_value = mock_client
443
+
444
+ result = await extractor.extract_content(
445
+ filename="document.pdf",
446
+ content_base64="dGVzdA=="
447
+ )
448
+
449
+ assert result.success is False
450
+ assert "status 500" in result.error
451
+
452
+ @pytest.mark.asyncio
453
+ async def test_extract_content_timeout(self):
454
+ """extract_content should handle timeout gracefully."""
455
+ config = FileExtractorsConfig(
456
+ enabled=True,
457
+ extractors={
458
+ "pdf-text": FileExtractorConfig(
459
+ url="http://localhost:8010/extract",
460
+ enabled=True,
461
+ timeout_seconds=5
462
+ )
463
+ },
464
+ extension_mapping={".pdf": "pdf-text"}
465
+ )
466
+ extractor = FileContentExtractor(config=config)
467
+
468
+ with patch('atlas.modules.file_storage.content_extractor.get_app_settings') as mock_settings:
469
+ mock_settings.return_value.feature_file_content_extraction_enabled = True
470
+
471
+ with patch('httpx.AsyncClient') as mock_client_class:
472
+ mock_client = AsyncMock()
473
+ mock_client.request = AsyncMock(side_effect=httpx.TimeoutException("Timeout"))
474
+ mock_client.__aenter__ = AsyncMock(return_value=mock_client)
475
+ mock_client.__aexit__ = AsyncMock(return_value=None)
476
+ mock_client_class.return_value = mock_client
477
+
478
+ result = await extractor.extract_content(
479
+ filename="document.pdf",
480
+ content_base64="dGVzdA=="
481
+ )
482
+
483
+ assert result.success is False
484
+ assert "timed out" in result.error
485
+
486
@pytest.mark.asyncio
async def test_extract_content_connection_error(self):
    """A connection failure is reported as a failed result.

    The mocked HTTP client raises ``httpx.RequestError`` from ``request``;
    the result must have ``success`` False and an error message containing
    "Failed to connect".
    """
    pdf_extractor = FileExtractorConfig(
        url="http://localhost:8010/extract",
        enabled=True,
    )
    extractor = FileContentExtractor(
        config=FileExtractorsConfig(
            enabled=True,
            extractors={"pdf-text": pdf_extractor},
            extension_mapping={".pdf": "pdf-text"},
        )
    )

    # Async-context-manager client whose request() always fails to connect.
    connect_failure = httpx.RequestError("Connection refused")
    http_client = AsyncMock()
    http_client.request = AsyncMock(side_effect=connect_failure)
    http_client.__aenter__ = AsyncMock(return_value=http_client)
    http_client.__aexit__ = AsyncMock(return_value=None)

    settings_patch = patch('atlas.modules.file_storage.content_extractor.get_app_settings')
    with settings_patch as settings_factory, patch('httpx.AsyncClient') as client_cls:
        # Force the extraction feature setting on for this call.
        settings_factory.return_value.feature_file_content_extraction_enabled = True
        client_cls.return_value = http_client

        result = await extractor.extract_content(
            filename="document.pdf",
            content_base64="dGVzdA==",
        )

    assert result.success is False
    assert "Failed to connect" in result.error
520
+
521
@pytest.mark.asyncio
async def test_extract_content_preview_truncation(self):
    """Long extracted text is kept whole in ``content`` but cut in ``preview``.

    With ``preview_chars=20`` and a 100-character response, the preview
    must be the first 20 characters followed by "...".
    """
    preview_limit = 20
    extractor = FileContentExtractor(
        config=FileExtractorsConfig(
            enabled=True,
            extractors={
                "pdf-text": FileExtractorConfig(
                    url="http://localhost:8010/extract",
                    enabled=True,
                    response_field="text",
                    preview_chars=preview_limit,
                )
            },
            extension_mapping={".pdf": "pdf-text"},
        )
    )

    long_text = "A" * 100  # 100 characters, well past the preview limit

    # Successful HTTP 200 reply carrying the long text.
    reply = Mock()
    reply.status_code = 200
    reply.json.return_value = {"success": True, "text": long_text}

    http_client = AsyncMock()
    http_client.request = AsyncMock(return_value=reply)
    http_client.__aenter__ = AsyncMock(return_value=http_client)
    http_client.__aexit__ = AsyncMock(return_value=None)

    settings_patch = patch('atlas.modules.file_storage.content_extractor.get_app_settings')
    with settings_patch as settings_factory, patch('httpx.AsyncClient') as client_cls:
        # Force the extraction feature setting on for this call.
        settings_factory.return_value.feature_file_content_extraction_enabled = True
        client_cls.return_value = http_client

        result = await extractor.extract_content(
            filename="document.pdf",
            content_base64="dGVzdA==",
        )

    assert result.success is True
    assert result.content == long_text
    assert result.preview == "A" * preview_limit + "..."
    assert len(result.preview) == preview_limit + 3  # 20 + "..."
565
+
566
+
567
class TestExtractionResult:
    """Check what ExtractionResult stores for success and failure cases."""

    def test_extraction_result_success(self):
        """A successful result keeps its payload and leaves ``error`` as None."""
        outcome = ExtractionResult(
            success=True,
            content="Extracted text",
            preview="Extracted...",
            metadata={"pages": 3},
        )

        assert outcome.success is True
        assert (outcome.content, outcome.preview) == ("Extracted text", "Extracted...")
        assert outcome.metadata == {"pages": 3}
        assert outcome.error is None

    def test_extraction_result_failure(self):
        """A failed result keeps the error and leaves payload fields as None."""
        outcome = ExtractionResult(success=False, error="Connection refused")

        assert outcome.success is False
        assert outcome.error == "Connection refused"
        # Fields not passed to the constructor must come back as None.
        for payload_field in ("content", "preview", "metadata"):
            assert getattr(outcome, payload_field) is None
597
+
598
+
599
class TestFileExtractorApiKeyAndHeaders:
    """Test FileExtractorConfig api_key and headers functionality."""

    @staticmethod
    async def _capture_request_kwargs(extractor_config):
        """Run one extraction through a mocked httpx client.

        Registers ``extractor_config`` as the "pdf-text" extractor for
        ``.pdf`` files, forces the extraction feature setting on, performs a
        single ``extract_content`` call and returns the keyword arguments of
        the last awaited ``client.request`` call.

        Raises:
            AssertionError: if ``client.request`` was never awaited.
        """
        config = FileExtractorsConfig(
            enabled=True,
            extractors={"pdf-text": extractor_config},
            extension_mapping={".pdf": "pdf-text"},
        )
        extractor = FileContentExtractor(config=config)

        mock_response = Mock()
        mock_response.status_code = 200
        mock_response.json.return_value = {"success": True, "text": "Extracted content"}

        mock_client = AsyncMock()
        mock_client.request = AsyncMock(return_value=mock_response)
        mock_client.__aenter__ = AsyncMock(return_value=mock_client)
        mock_client.__aexit__ = AsyncMock(return_value=None)

        with patch('atlas.modules.file_storage.content_extractor.get_app_settings') as mock_settings:
            mock_settings.return_value.feature_file_content_extraction_enabled = True

            with patch('httpx.AsyncClient') as mock_client_class:
                mock_client_class.return_value = mock_client

                await extractor.extract_content(
                    filename="document.pdf",
                    content_base64="dGVzdA=="
                )

        # Fail loudly if no request was made rather than returning stale data.
        mock_client.request.assert_awaited()
        return mock_client.request.await_args.kwargs

    def test_file_extractor_config_with_api_key(self):
        """FileExtractorConfig should accept api_key field."""
        config = FileExtractorConfig(
            url="http://localhost:8010/extract",
            api_key="sk-test-key-123"
        )

        assert config.api_key == "sk-test-key-123"

    def test_file_extractor_config_with_headers(self):
        """FileExtractorConfig should accept headers field."""
        config = FileExtractorConfig(
            url="http://localhost:8010/extract",
            headers={"X-Client-ID": "client-123", "X-Custom-Header": "value"}
        )

        assert config.headers == {"X-Client-ID": "client-123", "X-Custom-Header": "value"}

    def test_file_extractor_config_api_key_and_headers_optional(self):
        """api_key and headers should be optional (None by default)."""
        config = FileExtractorConfig(url="http://localhost:8010/extract")

        assert config.api_key is None
        assert config.headers is None

    def test_file_extractor_env_var_resolution_api_key(self, monkeypatch):
        """resolve_env_var should expand ${ENV_VAR} for an api_key-style value."""
        from atlas.modules.config.config_manager import resolve_env_var

        monkeypatch.setenv("TEST_EXTRACTOR_API_KEY", "sk-resolved-key-456")

        # Test resolve_env_var directly
        resolved = resolve_env_var("${TEST_EXTRACTOR_API_KEY}")
        assert resolved == "sk-resolved-key-456"

    def test_file_extractor_env_var_resolution_headers(self, monkeypatch):
        """resolve_env_var should expand ${ENV_VAR} for a header-style value."""
        from atlas.modules.config.config_manager import resolve_env_var

        monkeypatch.setenv("TEST_CLIENT_ID", "client-resolved-789")

        resolved = resolve_env_var("${TEST_CLIENT_ID}")
        assert resolved == "client-resolved-789"

    def test_file_extractor_env_var_optional_returns_none(self, monkeypatch):
        """resolve_env_var with required=False should return None for missing vars."""
        from atlas.modules.config.config_manager import resolve_env_var

        # Guarantee the variable is absent even if the host environment sets it.
        monkeypatch.delenv("MISSING_OPTIONAL_KEY", raising=False)

        # Missing env var with required=False should return None
        result = resolve_env_var("${MISSING_OPTIONAL_KEY}", required=False)
        assert result is None

    def test_file_extractor_env_var_required_raises(self, monkeypatch):
        """resolve_env_var with required=True should raise for missing vars."""
        from atlas.modules.config.config_manager import resolve_env_var

        # Guarantee the variable is absent even if the host environment sets it.
        monkeypatch.delenv("MISSING_REQUIRED_KEY", raising=False)

        with pytest.raises(ValueError) as exc_info:
            resolve_env_var("${MISSING_REQUIRED_KEY}", required=True)

        assert "MISSING_REQUIRED_KEY" in str(exc_info.value)

    def test_file_extractor_literal_value_unchanged(self):
        """resolve_env_var should return literal values unchanged."""
        from atlas.modules.config.config_manager import resolve_env_var

        result = resolve_env_var("sk-literal-key")
        assert result == "sk-literal-key"

    @pytest.mark.asyncio
    async def test_extract_content_includes_api_key_header(self):
        """extract_content should include api_key as Authorization header."""
        request_kwargs = await self._capture_request_kwargs(
            FileExtractorConfig(
                url="http://localhost:8010/extract",
                enabled=True,
                api_key="sk-test-api-key",
                response_field="text"
            )
        )

        captured_headers = request_kwargs.get("headers", {})
        assert "Authorization" in captured_headers
        assert captured_headers["Authorization"] == "Bearer sk-test-api-key"

    @pytest.mark.asyncio
    async def test_extract_content_includes_custom_headers(self):
        """extract_content should include custom headers from config."""
        request_kwargs = await self._capture_request_kwargs(
            FileExtractorConfig(
                url="http://localhost:8010/extract",
                enabled=True,
                headers={"X-Client-ID": "my-client", "X-Request-Source": "atlas-ui"},
                response_field="text"
            )
        )

        captured_headers = request_kwargs.get("headers", {})
        assert captured_headers.get("X-Client-ID") == "my-client"
        assert captured_headers.get("X-Request-Source") == "atlas-ui"

    @pytest.mark.asyncio
    async def test_extract_content_no_headers_when_not_configured(self):
        """extract_content should pass None headers when not configured."""
        request_kwargs = await self._capture_request_kwargs(
            FileExtractorConfig(
                url="http://localhost:8010/extract",
                enabled=True,
                response_field="text"
                # No api_key or headers
            )
        )

        # The helper already asserted the request happened, so a None here
        # means "no headers were passed", not "no call was made".
        assert request_kwargs.get("headers") is None
806
+
807
+
808
class TestMultipartUpload:
    """Test multipart form-data upload path in FileContentExtractor."""

    @staticmethod
    def _multipart_extractor_config(**overrides):
        """Build a multipart FileExtractorConfig for the local test endpoint.

        ``overrides`` are passed straight to ``FileExtractorConfig`` (for
        example ``form_field_name`` or ``api_key``).
        """
        return FileExtractorConfig(
            url="http://localhost:8010/extract-multipart",
            enabled=True,
            request_format="multipart",
            response_field="text",
            **overrides
        )

    @staticmethod
    async def _post_and_capture(extractor_config, *, response_text="Content", **extract_kwargs):
        """Run one extraction through a mocked httpx client.

        Registers ``extractor_config`` as the "pdf-text" extractor for
        ``.pdf`` files, forces the extraction feature setting on and calls
        ``extract_content``. ``filename`` and ``content_base64`` default to
        "document.pdf" and "dGVzdA==" unless given in ``extract_kwargs``.

        Returns:
            A ``(result, post_kwargs)`` tuple: the extraction result and the
            keyword arguments of the last awaited ``client.post`` call.

        Raises:
            AssertionError: if ``client.post`` was never awaited.
        """
        config = FileExtractorsConfig(
            enabled=True,
            extractors={"pdf-text": extractor_config},
            extension_mapping={".pdf": "pdf-text"},
        )
        extractor = FileContentExtractor(config=config)

        mock_response = Mock()
        mock_response.status_code = 200
        mock_response.json.return_value = {"success": True, "text": response_text}

        mock_client = AsyncMock()
        mock_client.post = AsyncMock(return_value=mock_response)
        mock_client.__aenter__ = AsyncMock(return_value=mock_client)
        mock_client.__aexit__ = AsyncMock(return_value=None)

        extract_kwargs.setdefault("filename", "document.pdf")
        extract_kwargs.setdefault("content_base64", "dGVzdA==")

        with patch('atlas.modules.file_storage.content_extractor.get_app_settings') as mock_settings:
            mock_settings.return_value.feature_file_content_extraction_enabled = True

            with patch('httpx.AsyncClient') as mock_client_class:
                mock_client_class.return_value = mock_client

                result = await extractor.extract_content(**extract_kwargs)

        # Fail loudly if the multipart POST never happened.
        mock_client.post.assert_awaited()
        return result, mock_client.post.await_args.kwargs

    def test_file_extractor_config_form_field_name_default(self):
        """form_field_name should default to 'file'."""
        config = FileExtractorConfig(url="http://localhost:8010/extract")
        assert config.form_field_name == "file"

    def test_file_extractor_config_custom_form_field_name(self):
        """form_field_name should accept custom values."""
        config = FileExtractorConfig(
            url="http://localhost:8010/extract",
            request_format="multipart",
            form_field_name="document"
        )
        assert config.form_field_name == "document"
        assert config.request_format == "multipart"

    @pytest.mark.asyncio
    async def test_multipart_upload_sends_file(self):
        """Multipart request_format should send file via multipart form-data."""
        result, captured_kwargs = await self._post_and_capture(
            self._multipart_extractor_config(form_field_name="file"),
            response_text="Extracted multipart content",
            filename="document.pdf",
            content_base64="dGVzdCBjb250ZW50",  # "test content"
            mime_type="application/pdf"
        )

        assert result.success is True
        assert result.content == "Extracted multipart content"

        # Verify multipart files dict was passed
        assert "files" in captured_kwargs
        files = captured_kwargs["files"]
        assert "file" in files
        file_tuple = files["file"]
        assert file_tuple[0] == "document.pdf"
        assert file_tuple[1] == b"test content"
        assert file_tuple[2] == "application/pdf"

    @pytest.mark.asyncio
    async def test_multipart_upload_custom_field_name(self):
        """Multipart upload should use the configured form_field_name."""
        _, captured_kwargs = await self._post_and_capture(
            self._multipart_extractor_config(form_field_name="document")
        )

        files = captured_kwargs["files"]
        assert "document" in files

    @pytest.mark.asyncio
    async def test_multipart_upload_invalid_base64(self):
        """Multipart upload should handle invalid base64 gracefully."""
        config = FileExtractorsConfig(
            enabled=True,
            extractors={"pdf-text": self._multipart_extractor_config()},
            extension_mapping={".pdf": "pdf-text"}
        )
        extractor = FileContentExtractor(config=config)

        # No HTTP client mock here: the test expects a decode error result.
        with patch('atlas.modules.file_storage.content_extractor.get_app_settings') as mock_settings:
            mock_settings.return_value.feature_file_content_extraction_enabled = True

            result = await extractor.extract_content(
                filename="document.pdf",
                content_base64="!!!not-valid-base64!!!"
            )

        assert result.success is False
        assert "decode" in result.error.lower() or "base64" in result.error.lower()

    @pytest.mark.asyncio
    async def test_multipart_upload_includes_accept_header(self):
        """Multipart upload should include Accept: application/json header."""
        _, captured_kwargs = await self._post_and_capture(
            self._multipart_extractor_config()
        )

        headers = captured_kwargs.get("headers", {})
        assert headers.get("Accept") == "application/json"

    @pytest.mark.asyncio
    async def test_multipart_upload_with_api_key(self):
        """Multipart upload should include Authorization header when api_key is set."""
        _, captured_kwargs = await self._post_and_capture(
            self._multipart_extractor_config(api_key="sk-test-key")
        )

        headers = captured_kwargs.get("headers", {})
        assert headers.get("Authorization") == "Bearer sk-test-key"

    @pytest.mark.asyncio
    async def test_multipart_default_mime_type(self):
        """Multipart upload should default to application/octet-stream when no mime_type."""
        _, captured_kwargs = await self._post_and_capture(
            self._multipart_extractor_config(),
            mime_type=None
        )

        files = captured_kwargs["files"]
        file_tuple = files["file"]
        assert file_tuple[2] == "application/octet-stream"
1101
+
1102
+
1103
+ class TestConfigManagerFileExtractors:
1104
+ """Test ConfigManager loading of file extractors config."""
1105
+
1106
def test_config_manager_loads_file_extractors(self):
    """A fresh ConfigManager exposes a FileExtractorsConfig instance."""
    loaded = ConfigManager().file_extractors_config

    assert loaded is not None
    assert isinstance(loaded, FileExtractorsConfig)
1113
+
1114
def test_config_manager_caches_file_extractors(self):
    """Two reads of file_extractors_config return the very same object."""
    manager = ConfigManager()

    first_read = manager.file_extractors_config
    second_read = manager.file_extractors_config

    # Identity, not equality: the second read must not build a new object.
    assert first_read is second_read
1122
+
1123
def test_config_manager_returns_disabled_on_missing_file(self):
    """With the file loader returning None, the config comes back disabled."""
    manager = ConfigManager()

    # Clear the cached config (presumably so the next access loads again).
    manager._file_extractors_config = None

    # Make the loader hand back None, as if no config file was found.
    loader_patch = patch.object(manager, '_load_file_with_error_handling', return_value=None)
    with loader_patch:
        fallback = manager.file_extractors_config

    assert fallback.enabled is False