julee 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161)
  1. julee/__init__.py +3 -0
  2. julee/api/__init__.py +20 -0
  3. julee/api/app.py +180 -0
  4. julee/api/dependencies.py +257 -0
  5. julee/api/requests.py +175 -0
  6. julee/api/responses.py +43 -0
  7. julee/api/routers/__init__.py +43 -0
  8. julee/api/routers/assembly_specifications.py +212 -0
  9. julee/api/routers/documents.py +182 -0
  10. julee/api/routers/knowledge_service_configs.py +79 -0
  11. julee/api/routers/knowledge_service_queries.py +293 -0
  12. julee/api/routers/system.py +137 -0
  13. julee/api/routers/workflows.py +234 -0
  14. julee/api/services/__init__.py +20 -0
  15. julee/api/services/system_initialization.py +214 -0
  16. julee/api/tests/__init__.py +14 -0
  17. julee/api/tests/routers/__init__.py +17 -0
  18. julee/api/tests/routers/test_assembly_specifications.py +749 -0
  19. julee/api/tests/routers/test_documents.py +301 -0
  20. julee/api/tests/routers/test_knowledge_service_configs.py +234 -0
  21. julee/api/tests/routers/test_knowledge_service_queries.py +738 -0
  22. julee/api/tests/routers/test_system.py +179 -0
  23. julee/api/tests/routers/test_workflows.py +393 -0
  24. julee/api/tests/test_app.py +285 -0
  25. julee/api/tests/test_dependencies.py +245 -0
  26. julee/api/tests/test_requests.py +250 -0
  27. julee/domain/__init__.py +22 -0
  28. julee/domain/models/__init__.py +49 -0
  29. julee/domain/models/assembly/__init__.py +17 -0
  30. julee/domain/models/assembly/assembly.py +103 -0
  31. julee/domain/models/assembly/tests/__init__.py +0 -0
  32. julee/domain/models/assembly/tests/factories.py +37 -0
  33. julee/domain/models/assembly/tests/test_assembly.py +430 -0
  34. julee/domain/models/assembly_specification/__init__.py +24 -0
  35. julee/domain/models/assembly_specification/assembly_specification.py +172 -0
  36. julee/domain/models/assembly_specification/knowledge_service_query.py +123 -0
  37. julee/domain/models/assembly_specification/tests/__init__.py +0 -0
  38. julee/domain/models/assembly_specification/tests/factories.py +78 -0
  39. julee/domain/models/assembly_specification/tests/test_assembly_specification.py +490 -0
  40. julee/domain/models/assembly_specification/tests/test_knowledge_service_query.py +310 -0
  41. julee/domain/models/custom_fields/__init__.py +0 -0
  42. julee/domain/models/custom_fields/content_stream.py +68 -0
  43. julee/domain/models/custom_fields/tests/__init__.py +0 -0
  44. julee/domain/models/custom_fields/tests/test_custom_fields.py +53 -0
  45. julee/domain/models/document/__init__.py +17 -0
  46. julee/domain/models/document/document.py +150 -0
  47. julee/domain/models/document/tests/__init__.py +0 -0
  48. julee/domain/models/document/tests/factories.py +76 -0
  49. julee/domain/models/document/tests/test_document.py +297 -0
  50. julee/domain/models/knowledge_service_config/__init__.py +17 -0
  51. julee/domain/models/knowledge_service_config/knowledge_service_config.py +86 -0
  52. julee/domain/models/policy/__init__.py +15 -0
  53. julee/domain/models/policy/document_policy_validation.py +220 -0
  54. julee/domain/models/policy/policy.py +203 -0
  55. julee/domain/models/policy/tests/__init__.py +0 -0
  56. julee/domain/models/policy/tests/factories.py +47 -0
  57. julee/domain/models/policy/tests/test_document_policy_validation.py +420 -0
  58. julee/domain/models/policy/tests/test_policy.py +546 -0
  59. julee/domain/repositories/__init__.py +27 -0
  60. julee/domain/repositories/assembly.py +45 -0
  61. julee/domain/repositories/assembly_specification.py +52 -0
  62. julee/domain/repositories/base.py +146 -0
  63. julee/domain/repositories/document.py +49 -0
  64. julee/domain/repositories/document_policy_validation.py +52 -0
  65. julee/domain/repositories/knowledge_service_config.py +54 -0
  66. julee/domain/repositories/knowledge_service_query.py +44 -0
  67. julee/domain/repositories/policy.py +49 -0
  68. julee/domain/use_cases/__init__.py +17 -0
  69. julee/domain/use_cases/decorators.py +107 -0
  70. julee/domain/use_cases/extract_assemble_data.py +649 -0
  71. julee/domain/use_cases/initialize_system_data.py +842 -0
  72. julee/domain/use_cases/tests/__init__.py +7 -0
  73. julee/domain/use_cases/tests/test_extract_assemble_data.py +548 -0
  74. julee/domain/use_cases/tests/test_initialize_system_data.py +455 -0
  75. julee/domain/use_cases/tests/test_validate_document.py +1228 -0
  76. julee/domain/use_cases/validate_document.py +736 -0
  77. julee/fixtures/assembly_specifications.yaml +70 -0
  78. julee/fixtures/documents.yaml +178 -0
  79. julee/fixtures/knowledge_service_configs.yaml +37 -0
  80. julee/fixtures/knowledge_service_queries.yaml +27 -0
  81. julee/repositories/__init__.py +17 -0
  82. julee/repositories/memory/__init__.py +31 -0
  83. julee/repositories/memory/assembly.py +84 -0
  84. julee/repositories/memory/assembly_specification.py +125 -0
  85. julee/repositories/memory/base.py +227 -0
  86. julee/repositories/memory/document.py +149 -0
  87. julee/repositories/memory/document_policy_validation.py +104 -0
  88. julee/repositories/memory/knowledge_service_config.py +123 -0
  89. julee/repositories/memory/knowledge_service_query.py +120 -0
  90. julee/repositories/memory/policy.py +87 -0
  91. julee/repositories/memory/tests/__init__.py +0 -0
  92. julee/repositories/memory/tests/test_document.py +212 -0
  93. julee/repositories/memory/tests/test_document_policy_validation.py +161 -0
  94. julee/repositories/memory/tests/test_policy.py +443 -0
  95. julee/repositories/minio/__init__.py +31 -0
  96. julee/repositories/minio/assembly.py +103 -0
  97. julee/repositories/minio/assembly_specification.py +170 -0
  98. julee/repositories/minio/client.py +570 -0
  99. julee/repositories/minio/document.py +530 -0
  100. julee/repositories/minio/document_policy_validation.py +120 -0
  101. julee/repositories/minio/knowledge_service_config.py +187 -0
  102. julee/repositories/minio/knowledge_service_query.py +211 -0
  103. julee/repositories/minio/policy.py +106 -0
  104. julee/repositories/minio/tests/__init__.py +0 -0
  105. julee/repositories/minio/tests/fake_client.py +213 -0
  106. julee/repositories/minio/tests/test_assembly.py +374 -0
  107. julee/repositories/minio/tests/test_assembly_specification.py +391 -0
  108. julee/repositories/minio/tests/test_client_protocol.py +57 -0
  109. julee/repositories/minio/tests/test_document.py +591 -0
  110. julee/repositories/minio/tests/test_document_policy_validation.py +192 -0
  111. julee/repositories/minio/tests/test_knowledge_service_config.py +374 -0
  112. julee/repositories/minio/tests/test_knowledge_service_query.py +438 -0
  113. julee/repositories/minio/tests/test_policy.py +559 -0
  114. julee/repositories/temporal/__init__.py +38 -0
  115. julee/repositories/temporal/activities.py +114 -0
  116. julee/repositories/temporal/activity_names.py +34 -0
  117. julee/repositories/temporal/proxies.py +159 -0
  118. julee/services/__init__.py +18 -0
  119. julee/services/knowledge_service/__init__.py +48 -0
  120. julee/services/knowledge_service/anthropic/__init__.py +12 -0
  121. julee/services/knowledge_service/anthropic/knowledge_service.py +331 -0
  122. julee/services/knowledge_service/anthropic/tests/test_knowledge_service.py +318 -0
  123. julee/services/knowledge_service/factory.py +138 -0
  124. julee/services/knowledge_service/knowledge_service.py +160 -0
  125. julee/services/knowledge_service/memory/__init__.py +13 -0
  126. julee/services/knowledge_service/memory/knowledge_service.py +278 -0
  127. julee/services/knowledge_service/memory/test_knowledge_service.py +345 -0
  128. julee/services/knowledge_service/test_factory.py +112 -0
  129. julee/services/temporal/__init__.py +38 -0
  130. julee/services/temporal/activities.py +86 -0
  131. julee/services/temporal/activity_names.py +22 -0
  132. julee/services/temporal/proxies.py +41 -0
  133. julee/util/__init__.py +0 -0
  134. julee/util/domain.py +119 -0
  135. julee/util/repos/__init__.py +0 -0
  136. julee/util/repos/minio/__init__.py +0 -0
  137. julee/util/repos/minio/file_storage.py +213 -0
  138. julee/util/repos/temporal/__init__.py +11 -0
  139. julee/util/repos/temporal/client_proxies/file_storage.py +68 -0
  140. julee/util/repos/temporal/data_converter.py +123 -0
  141. julee/util/repos/temporal/minio_file_storage.py +12 -0
  142. julee/util/repos/temporal/proxies/__init__.py +0 -0
  143. julee/util/repos/temporal/proxies/file_storage.py +58 -0
  144. julee/util/repositories.py +55 -0
  145. julee/util/temporal/__init__.py +22 -0
  146. julee/util/temporal/activities.py +123 -0
  147. julee/util/temporal/decorators.py +473 -0
  148. julee/util/tests/__init__.py +1 -0
  149. julee/util/tests/test_decorators.py +770 -0
  150. julee/util/validation/__init__.py +29 -0
  151. julee/util/validation/repository.py +100 -0
  152. julee/util/validation/type_guards.py +369 -0
  153. julee/worker.py +211 -0
  154. julee/workflows/__init__.py +26 -0
  155. julee/workflows/extract_assemble.py +215 -0
  156. julee/workflows/validate_document.py +228 -0
  157. julee-0.1.0.dist-info/METADATA +195 -0
  158. julee-0.1.0.dist-info/RECORD +161 -0
  159. julee-0.1.0.dist-info/WHEEL +5 -0
  160. julee-0.1.0.dist-info/licenses/LICENSE +674 -0
  161. julee-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,736 @@
1
+ """
2
+ Use case logic for document validation within the Capture, Extract, Assemble,
3
+ Publish workflow.
4
+
5
+ This module contains use case classes that orchestrate business logic while
6
+ remaining framework-agnostic. Dependencies are injected via repository
7
+ instances following the Clean Architecture principles.
8
+ """
9
+
10
+ import hashlib
11
+ import io
12
+ import json
13
+ import logging
14
+ from datetime import datetime
15
+ from typing import Callable, Dict, List, Tuple
16
+
17
+ import multihash
18
+
19
+ from julee.domain.models import (
20
+ ContentStream,
21
+ Document,
22
+ DocumentPolicyValidation,
23
+ DocumentStatus,
24
+ KnowledgeServiceQuery,
25
+ Policy,
26
+ )
27
+ from julee.domain.models.policy import (
28
+ DocumentPolicyValidationStatus,
29
+ )
30
+ from julee.domain.repositories import (
31
+ DocumentPolicyValidationRepository,
32
+ DocumentRepository,
33
+ KnowledgeServiceConfigRepository,
34
+ KnowledgeServiceQueryRepository,
35
+ PolicyRepository,
36
+ )
37
+ from julee.services import KnowledgeService
38
+ from julee.util.validation import ensure_repository_protocol
39
+
40
+ from .decorators import try_use_case_step
41
+
42
+ logger = logging.getLogger(__name__)
43
+
44
+
45
+ class ValidateDocumentUseCase:
46
+ """
47
+ Use case for validating documents against policies.
48
+
49
+ This class orchestrates the business logic for document validation within
50
+ the Capture, Extract, Assemble, Publish workflow while remaining
51
+ framework-agnostic. It depends only on repository protocols, not
52
+ concrete implementations.
53
+
54
+ In workflow contexts, this use case is called from workflow code with
55
+ repository stubs that delegate to Temporal activities for durability.
56
+ The use case remains completely unaware of whether it's running in a
57
+ workflow context or a simple async context - it just calls repository
58
+ methods and expects them to work correctly.
59
+
60
+ Architectural Notes:
61
+ - This class contains pure business logic with no framework dependencies
62
+ - Repository dependencies are injected via constructor
63
+ (dependency inversion)
64
+ - All error handling and compensation logic is contained here
65
+ - The use case works with domain objects exclusively
66
+ - Deterministic execution is guaranteed by avoiding
67
+ non-deterministic operations
68
+ """
69
+
70
def __init__(
    self,
    document_repo: DocumentRepository,
    knowledge_service_query_repo: KnowledgeServiceQueryRepository,
    knowledge_service_config_repo: KnowledgeServiceConfigRepository,
    policy_repo: PolicyRepository,
    document_policy_validation_repo: DocumentPolicyValidationRepository,
    knowledge_service: KnowledgeService,
    now_fn: Callable[[], datetime],
) -> None:
    """Store the collaborators used by the validation use case.

    Args:
        document_repo: Repository for document operations
        knowledge_service_query_repo: Repository for knowledge service
            query operations
        knowledge_service_config_repo: Repository for knowledge service
            configuration operations
        policy_repo: Repository for policy operations
        document_policy_validation_repo: Repository for document policy
            validation operations
        knowledge_service: Knowledge service instance for external
            operations
        now_fn: Zero-argument callable returning the current time
            (e.g., workflow.now for Temporal workflows)

    Note:
        Every repository argument is passed through
        ``ensure_repository_protocol`` together with its protocol class
        before being stored, so a mis-wired repository surfaces when the
        use case is constructed rather than on first use. The knowledge
        service and ``now_fn`` are stored as given.
    """
    check = ensure_repository_protocol

    # The checks run in the same order as the original implementation so
    # that the first mis-configured repository reported stays the same.
    self.document_repo = check(
        document_repo,
        DocumentRepository,  # type: ignore[type-abstract]
    )
    self.knowledge_service = knowledge_service
    self.knowledge_service_query_repo = check(
        knowledge_service_query_repo,
        KnowledgeServiceQueryRepository,  # type: ignore[type-abstract]
    )
    self.knowledge_service_config_repo = check(
        knowledge_service_config_repo,
        KnowledgeServiceConfigRepository,  # type: ignore[type-abstract]
    )
    self.policy_repo = check(
        policy_repo,
        PolicyRepository,  # type: ignore[type-abstract]
    )
    self.document_policy_validation_repo = check(
        document_policy_validation_repo,
        DocumentPolicyValidationRepository,  # type: ignore[type-abstract]
    )
    self.now_fn = now_fn
128
+
129
async def validate_document(
    self, document_id: str, policy_id: str
) -> DocumentPolicyValidation:
    """
    Validate a document against a policy and return the validation result.

    The method performs these steps, persisting the validation record
    after each status change:
    1. Generates a validation ID
    2. Retrieves the document and the policy
    3. Creates and stores the initial (PENDING) validation record
    4. Marks the record IN_PROGRESS
    5. Retrieves all validation and transformation queries of the policy
    6. Registers the document with the required knowledge services
    7. Executes the validation queries and stores the scores
    8. Finishes as PASSED/FAILED when the scores pass or the policy has
       no transformations
    9. Otherwise applies the transformations, registers the transformed
       document, re-runs the validation queries and finishes as
       PASSED/FAILED based on the post-transformation scores

    Args:
        document_id: ID of the document to validate
        policy_id: ID of the policy to validate against

    Returns:
        DocumentPolicyValidation with validation results

    Raises:
        Exception: Any error raised in steps 1-3 propagates before the
            try block is entered. Any error raised from step 4 onwards
            is recorded on the validation record (status ERROR) and then
            re-raised unchanged.
    """
    logger.debug(
        "Starting document validation use case",
        extra={
            "document_id": document_id,
            "policy_id": policy_id,
        },
    )

    # Step 1: Generate unique validation ID
    validation_id = await self.document_policy_validation_repo.generate_id()

    # Step 2: Retrieve document and policy (validate they exist)
    document = await self._retrieve_document(document_id)
    policy = await self._retrieve_policy(policy_id)

    # Step 3: Create and store initial validation record
    validation = DocumentPolicyValidation(
        validation_id=validation_id,
        input_document_id=document_id,
        policy_id=policy_id,
        status=DocumentPolicyValidationStatus.PENDING,
        validation_scores=[],
        started_at=self.now_fn(),
    )

    await self.document_policy_validation_repo.save(validation)

    logger.debug(
        "Initial validation record created",
        extra={
            "validation_id": validation_id,
            "document_id": document_id,
            "policy_id": policy_id,
            "status": validation.status.value,
        },
    )

    try:
        # Step 4: Update status to in progress
        validation.status = DocumentPolicyValidationStatus.IN_PROGRESS
        await self.document_policy_validation_repo.save(validation)

        # Step 5: Retrieve all queries needed for this policy
        all_queries = await self._retrieve_all_queries(policy)

        # Step 6: Register the document with knowledge services
        document_registrations = await self._register_document_with_services(
            document, all_queries
        )

        # Step 7: Execute validation queries and calculate scores
        validation_scores = await self._execute_validation_queries(
            document,
            policy,
            document_registrations,
            all_queries,
        )

        # Step 8: Update validation with scores
        validation.validation_scores = validation_scores
        validation.status = DocumentPolicyValidationStatus.VALIDATION_COMPLETE
        await self.document_policy_validation_repo.save(validation)

        # Step 9: Check if transformations are needed
        initial_passed = self._determine_validation_result(
            validation_scores, policy.validation_scores
        )

        if initial_passed or not policy.has_transformations:
            # No transformations needed - either passed or no
            # transformations available
            final_status = (
                DocumentPolicyValidationStatus.PASSED
                if initial_passed
                else DocumentPolicyValidationStatus.FAILED
            )

            # A fresh record is built for the final state instead of
            # mutating the in-progress one.
            validation = DocumentPolicyValidation(
                validation_id=validation.validation_id,
                input_document_id=validation.input_document_id,
                policy_id=validation.policy_id,
                validation_scores=validation_scores,
                transformed_document_id=validation.transformed_document_id,
                post_transform_validation_scores=validation.post_transform_validation_scores,
                started_at=validation.started_at,
                completed_at=self.now_fn(),
                error_message=validation.error_message,
                status=final_status,
                passed=initial_passed,
            )

            await self.document_policy_validation_repo.save(validation)

            logger.info(
                "Document validation completed without transformations",
                extra={
                    "validation_id": validation_id,
                    "document_id": document_id,
                    "policy_id": policy_id,
                    "passed": initial_passed,
                    "validation_scores": validation_scores,
                },
            )

            return validation

        # Step 10: Initial validation failed and transformations are
        # available
        validation.status = DocumentPolicyValidationStatus.TRANSFORMATION_REQUIRED
        await self.document_policy_validation_repo.save(validation)

        logger.info(
            "Initial validation failed, applying transformations",
            extra={
                "validation_id": validation_id,
                "document_id": document_id,
                "policy_id": policy_id,
                "initial_scores": validation_scores,
            },
        )

        # Step 11: Apply transformations
        validation.status = (
            DocumentPolicyValidationStatus.TRANSFORMATION_IN_PROGRESS
        )
        await self.document_policy_validation_repo.save(validation)

        transformed_document = await self._apply_transformations(
            document,
            policy,
            all_queries,
            document_registrations,
        )

        validation.transformed_document_id = transformed_document.document_id
        validation.status = DocumentPolicyValidationStatus.TRANSFORMATION_COMPLETE
        await self.document_policy_validation_repo.save(validation)

        # Step 12: Register transformed document with knowledge services
        transformed_document_registrations = (
            await self._register_document_with_services(
                transformed_document, all_queries
            )
        )

        # Step 13: Re-run validation queries on transformed document
        validation.status = DocumentPolicyValidationStatus.IN_PROGRESS
        await self.document_policy_validation_repo.save(validation)

        post_transform_validation_scores = await self._execute_validation_queries(
            transformed_document,
            policy,
            transformed_document_registrations,
            all_queries,
        )

        # Step 14: Determine final result based on post-transformation
        # scores
        final_passed = self._determine_validation_result(
            post_transform_validation_scores, policy.validation_scores
        )

        final_status = (
            DocumentPolicyValidationStatus.PASSED
            if final_passed
            else DocumentPolicyValidationStatus.FAILED
        )

        validation = DocumentPolicyValidation(
            validation_id=validation.validation_id,
            input_document_id=validation.input_document_id,
            policy_id=validation.policy_id,
            validation_scores=validation_scores,
            transformed_document_id=transformed_document.document_id,
            post_transform_validation_scores=post_transform_validation_scores,
            started_at=validation.started_at,
            completed_at=self.now_fn(),
            error_message=validation.error_message,
            status=final_status,
            passed=final_passed,
        )

        await self.document_policy_validation_repo.save(validation)

        logger.info(
            "Document validation completed with transformations",
            extra={
                "validation_id": validation_id,
                "document_id": document_id,
                "policy_id": policy_id,
                "passed": final_passed,
                "initial_scores": validation_scores,
                "final_scores": post_transform_validation_scores,
                "transformed_document_id": (transformed_document.document_id),
            },
        )

        return validation

    except Exception as e:
        # Mark validation as failed due to error
        validation.status = DocumentPolicyValidationStatus.ERROR
        validation.error_message = str(e)
        validation.passed = False
        validation.completed_at = self.now_fn()

        # Persisting the error state is best-effort: if this save fails
        # too, the caller must still see the ORIGINAL exception, and the
        # original failure must still be logged below.
        try:
            await self.document_policy_validation_repo.save(validation)
        except Exception:
            logger.error(
                "Failed to persist validation error state",
                extra={
                    "validation_id": validation_id,
                    "document_id": document_id,
                    "policy_id": policy_id,
                },
                exc_info=True,
            )

        logger.error(
            "Document validation failed",
            extra={
                "validation_id": validation_id,
                "document_id": document_id,
                "policy_id": policy_id,
                "error": str(e),
            },
            exc_info=True,
        )
        raise
374
+
375
@try_use_case_step("document_retrieval")
async def _retrieve_document(self, document_id: str) -> Document:
    """Fetch a document by ID from the document repository.

    Raises:
        ValueError: If the repository returns a falsy result for the ID.
    """
    found = await self.document_repo.get(document_id)
    if found:
        return found
    raise ValueError(f"Document not found: {document_id}")
382
+
383
@try_use_case_step("policy_retrieval")
async def _retrieve_policy(self, policy_id: str) -> Policy:
    """Fetch a policy by ID from the policy repository.

    Raises:
        ValueError: If the repository returns a falsy result for the ID.
    """
    found = await self.policy_repo.get(policy_id)
    if found:
        return found
    raise ValueError(f"Policy not found: {policy_id}")
390
+
391
@try_use_case_step("all_queries_retrieval")
async def _retrieve_all_queries(
    self, policy: Policy
) -> Dict[str, KnowledgeServiceQuery]:
    """Load every knowledge service query the policy refers to.

    Validation queries (taken from ``policy.validation_scores``) are
    fetched first, followed by the policy's transformation queries, if
    any. Each query is fetched individually from the query repository.

    Returns:
        Dict mapping query_id to the fetched KnowledgeServiceQuery.

    Raises:
        ValueError: If the repository returns a falsy result for any
            referenced query ID.
    """
    repo = self.knowledge_service_query_repo
    collected: Dict[str, KnowledgeServiceQuery] = {}

    # Validation queries: only the ID of each (id, required_score) pair
    # is needed here.
    for validation_query_id, _required_score in policy.validation_scores:
        fetched = await repo.get(validation_query_id)
        if not fetched:
            raise ValueError(f"Validation query not found: {validation_query_id}")
        collected[validation_query_id] = fetched

    # Transformation queries: skipped when the policy has none.
    for transformation_query_id in policy.transformation_queries or ():
        fetched = await repo.get(transformation_query_id)
        if not fetched:
            raise ValueError(
                f"Transformation query not found: {transformation_query_id}"
            )
        collected[transformation_query_id] = fetched

    return collected
415
+
416
@try_use_case_step("document_registration")
async def _register_document_with_services(
    self,
    document: Document,
    queries: Dict[str, KnowledgeServiceQuery],
) -> Dict[str, str]:
    """
    Register the document with every knowledge service referenced by the
    given queries.

    Each distinct ``knowledge_service_id`` is registered exactly once, in
    the order in which it first appears in ``queries``.

    Args:
        document: The document to register
        queries: Dict of query_id to KnowledgeServiceQuery objects

    Returns:
        Dict mapping knowledge_service_id to service_file_id

    Raises:
        ValueError: If no config is found for a referenced knowledge
            service ID.
    """
    registrations: Dict[str, str] = {}

    # De-duplicate while keeping first-seen order. Iterating a set of
    # strings here would make the order of the repository/service calls
    # depend on per-process hash randomization, i.e. differ between runs.
    required_service_ids = list(
        dict.fromkeys(query.knowledge_service_id for query in queries.values())
    )

    for knowledge_service_id in required_service_ids:
        # Get the config for this service
        config = await self.knowledge_service_config_repo.get(knowledge_service_id)
        if not config:
            raise ValueError(
                f"Knowledge service config not found: {knowledge_service_id}"
            )

        registration_result = await self.knowledge_service.register_file(
            config, document
        )
        registrations[knowledge_service_id] = (
            registration_result.knowledge_service_file_id
        )

    return registrations
454
+
455
@try_use_case_step("validation_execution")
async def _execute_validation_queries(
    self,
    document: Document,
    policy: Policy,
    document_registrations: Dict[str, str],
    queries: Dict[str, KnowledgeServiceQuery],
) -> List[Tuple[str, int]]:
    """
    Run every validation query of the policy and collect the scores.

    For each ``(query_id, required_score)`` pair in
    ``policy.validation_scores`` the matching query is looked up in
    ``queries``, its knowledge service config is fetched, and the query
    is executed against the file ID registered for that service.

    Args:
        document: The document being validated (not read by this method)
        policy: The policy whose validation_scores drive the queries
        document_registrations: Mapping of service_id to service_file_id
        queries: Dict of query_id to KnowledgeServiceQuery objects

    Returns:
        List of (query_id, actual_score) tuples, in policy order

    Raises:
        ValueError: If a service config is missing or the document has no
            registration for the query's service.
    """
    collected_scores = []

    for query_id, required_score in policy.validation_scores:
        query_spec = queries[query_id]
        service_id = query_spec.knowledge_service_id

        # Resolve the service configuration for this query
        service_config = await self.knowledge_service_config_repo.get(service_id)
        if not service_config:
            raise ValueError(f"Knowledge service config not found: {service_id}")

        # Resolve the file handle the service knows this document by
        registered_file_id = document_registrations.get(service_id)
        if not registered_file_id:
            raise ValueError(f"Document not registered with service {service_id}")

        # Run the query against the registered file
        outcome = await self.knowledge_service.execute_query(
            service_config,
            query_spec.prompt,
            [registered_file_id],
            query_spec.query_metadata,
            query_spec.assistant_prompt,
        )

        # Turn the raw result into an integer score
        achieved = self._extract_score_from_result(outcome.result_data)
        collected_scores.append((query_id, achieved))

        logger.debug(
            "Validation query executed",
            extra={
                "query_id": query_id,
                "required_score": required_score,
                "actual_score": achieved,
                "passed": achieved >= required_score,
            },
        )

    return collected_scores
522
+
523
def _extract_score_from_result(self, result_data: Dict) -> int:
    """
    Read an integer score out of a knowledge service query result.

    The value under the ``"response"`` key is stripped of surrounding
    whitespace and parsed with ``int``. No range check is applied; the
    parsed value is returned as-is.

    Raises:
        ValueError: If the response is missing/empty or cannot be parsed
            as an integer.
    """
    raw_response = result_data.get("response", "")

    if raw_response:
        try:
            return int(raw_response.strip())
        except ValueError as parse_error:
            raise ValueError(
                f"Failed to parse numeric score from response: {raw_response}"
            ) from parse_error

    raise ValueError("Empty response from knowledge service")
543
+
544
def _determine_validation_result(
    self,
    actual_scores: List[Tuple[str, int]],
    required_scores: List[Tuple[str, int]],
) -> bool:
    """
    Decide whether the achieved scores satisfy the policy's thresholds.

    Both lists are converted to dicts keyed by query_id. A query with a
    required score but no actual score is treated as having scored 0.

    Args:
        actual_scores: List of (query_id, actual_score) tuples
        required_scores: List of (query_id, required_score) tuples from
            policy

    Returns:
        True if every required score was met or exceeded, False as soon
        as one is not.
    """
    achieved_by_query = dict(actual_scores)
    threshold_by_query = dict(required_scores)

    for query_id in threshold_by_query:
        threshold = threshold_by_query[query_id]
        achieved = achieved_by_query.get(query_id, 0)

        if achieved < threshold:
            # First shortfall decides the outcome; later queries are not
            # inspected.
            logger.debug(
                "Validation failed for query",
                extra={
                    "query_id": query_id,
                    "required_score": threshold,
                    "actual_score": achieved,
                },
            )
            return False

    return True
579
+
580
@try_use_case_step("document_transformation")
async def _apply_transformations(
    self,
    document: Document,
    policy: Policy,
    all_queries: Dict[str, KnowledgeServiceQuery],
    document_registrations: Dict[str, str],
) -> Document:
    """
    Run the policy's transformation queries and store the result as a new
    document.

    Args:
        document: The original document to transform
        policy: The policy containing transformation query IDs
        all_queries: Dict of all queries (validation and transformation)
        document_registrations: Mapping of service_id to service_file_id

    Returns:
        New Document object with transformed content. It has been saved
        to the document repository before being returned.

    Raises:
        ValueError: If the policy has no transformation queries, the
            document has no content stream, a service config or
            registration is missing, or a transformation result is not
            valid JSON.
    """
    if not policy.transformation_queries:
        raise ValueError("No transformation queries provided")

    logger.debug(
        "Applying transformations to document",
        extra={
            "document_id": document.document_id,
            "transformation_query_ids": policy.transformation_queries,
        },
    )

    # Read the original content once (this also verifies it decodes as
    # UTF-8) and rewind the stream so later readers see it from the start.
    current_content = document.content
    if current_content is None:
        raise ValueError("Document content stream is required for transformation")
    current_content.seek(0)
    transformed_content = current_content.read().decode("utf-8")
    current_content.seek(0)

    # NOTE(review): every query below runs against the file IDs in
    # document_registrations and its output replaces transformed_content,
    # so the output of one query is not fed into the next and only the
    # last query's output is kept. Confirm this is the intended
    # semantics of "sequential" transformations.
    for query_id in policy.transformation_queries:
        query = all_queries[query_id]

        # Get the config for this service
        config = await self.knowledge_service_config_repo.get(
            query.knowledge_service_id
        )
        if not config:
            raise ValueError(
                f"Knowledge service config not found: {query.knowledge_service_id}"
            )

        # Get the service file ID from our registrations
        service_file_id = document_registrations.get(query.knowledge_service_id)
        if not service_file_id:
            raise ValueError(
                f"Document not registered with service {query.knowledge_service_id}"
            )

        # Execute the transformation query
        transformation_result = await self.knowledge_service.execute_query(
            config,
            query.prompt,
            [service_file_id],
            query.query_metadata,
            query.assistant_prompt,
        )

        # Extract transformed content from result
        transformed_content = self._extract_transformed_content(
            transformation_result.result_data
        )

        logger.debug(
            "Transformation query applied",
            extra={
                "query_id": query_id,
                "original_length": document.size_bytes,
                "transformed_length": len(transformed_content),
            },
        )

    # Create new document with transformed content
    transformed_document_id = await self.document_repo.generate_id()

    # Create content stream from transformed text
    transformed_bytes = transformed_content.encode("utf-8")
    transformed_stream = io.BytesIO(transformed_bytes)

    # Calculate multihash (SHA2-256) for transformed content
    sha256_hash = hashlib.sha256(transformed_bytes).digest()
    mhash = multihash.encode(sha256_hash, multihash.SHA2_256)
    proper_multihash = str(mhash.hex())

    # Take one timestamp so a freshly created document has identical
    # created_at and updated_at values.
    created_at = self.now_fn()

    transformed_document = Document(
        document_id=transformed_document_id,
        original_filename=f"transformed_{document.original_filename}",
        content_type=document.content_type,
        size_bytes=len(transformed_bytes),
        content_multihash=proper_multihash,
        status=DocumentStatus.CAPTURED,
        content=ContentStream(transformed_stream),
        created_at=created_at,
        updated_at=created_at,
    )

    # Save the transformed document
    await self.document_repo.save(transformed_document)

    logger.info(
        "Document transformation completed",
        extra={
            "original_document_id": document.document_id,
            "transformed_document_id": transformed_document.document_id,
            "original_size": document.size_bytes,
            "transformed_size": transformed_document.size_bytes,
        },
    )

    return transformed_document
706
+
707
def _extract_transformed_content(self, result_data: Dict) -> str:
    """
    Extract transformed document content from knowledge service result.

    The value under the ``"response"`` key is stripped of surrounding
    whitespace and must parse as JSON. The parsed value is discarded;
    parsing is only used as validation.

    Args:
        result_data: Result data from knowledge service transformation
            query

    Returns:
        The stripped response text, which is a valid JSON string

    Raises:
        ValueError: If the response is missing/empty or is not valid
            JSON. In the latter case the underlying JSONDecodeError is
            attached as the exception's cause.
    """
    response_text = result_data.get("response", "")
    if not response_text:
        raise ValueError("Empty response from transformation query")

    # The response must be valid JSON
    stripped_response: str = response_text.strip()
    try:
        # Parse to validate JSON structure
        json.loads(stripped_response)
    except json.JSONDecodeError as e:
        # Chain explicitly so the parse failure is reported as the direct
        # cause rather than as an error raised while handling another.
        raise ValueError(
            f"Transformation result must be valid JSON, got: "
            f"{response_text[:100]}... Parse error: {e}"
        ) from e

    # Return the response text itself (not a re-serialization), so its
    # internal formatting is preserved.
    return stripped_response