julee 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- julee/__init__.py +3 -0
- julee/api/__init__.py +20 -0
- julee/api/app.py +180 -0
- julee/api/dependencies.py +257 -0
- julee/api/requests.py +175 -0
- julee/api/responses.py +43 -0
- julee/api/routers/__init__.py +43 -0
- julee/api/routers/assembly_specifications.py +212 -0
- julee/api/routers/documents.py +182 -0
- julee/api/routers/knowledge_service_configs.py +79 -0
- julee/api/routers/knowledge_service_queries.py +293 -0
- julee/api/routers/system.py +137 -0
- julee/api/routers/workflows.py +234 -0
- julee/api/services/__init__.py +20 -0
- julee/api/services/system_initialization.py +214 -0
- julee/api/tests/__init__.py +14 -0
- julee/api/tests/routers/__init__.py +17 -0
- julee/api/tests/routers/test_assembly_specifications.py +749 -0
- julee/api/tests/routers/test_documents.py +301 -0
- julee/api/tests/routers/test_knowledge_service_configs.py +234 -0
- julee/api/tests/routers/test_knowledge_service_queries.py +738 -0
- julee/api/tests/routers/test_system.py +179 -0
- julee/api/tests/routers/test_workflows.py +393 -0
- julee/api/tests/test_app.py +285 -0
- julee/api/tests/test_dependencies.py +245 -0
- julee/api/tests/test_requests.py +250 -0
- julee/domain/__init__.py +22 -0
- julee/domain/models/__init__.py +49 -0
- julee/domain/models/assembly/__init__.py +17 -0
- julee/domain/models/assembly/assembly.py +103 -0
- julee/domain/models/assembly/tests/__init__.py +0 -0
- julee/domain/models/assembly/tests/factories.py +37 -0
- julee/domain/models/assembly/tests/test_assembly.py +430 -0
- julee/domain/models/assembly_specification/__init__.py +24 -0
- julee/domain/models/assembly_specification/assembly_specification.py +172 -0
- julee/domain/models/assembly_specification/knowledge_service_query.py +123 -0
- julee/domain/models/assembly_specification/tests/__init__.py +0 -0
- julee/domain/models/assembly_specification/tests/factories.py +78 -0
- julee/domain/models/assembly_specification/tests/test_assembly_specification.py +490 -0
- julee/domain/models/assembly_specification/tests/test_knowledge_service_query.py +310 -0
- julee/domain/models/custom_fields/__init__.py +0 -0
- julee/domain/models/custom_fields/content_stream.py +68 -0
- julee/domain/models/custom_fields/tests/__init__.py +0 -0
- julee/domain/models/custom_fields/tests/test_custom_fields.py +53 -0
- julee/domain/models/document/__init__.py +17 -0
- julee/domain/models/document/document.py +150 -0
- julee/domain/models/document/tests/__init__.py +0 -0
- julee/domain/models/document/tests/factories.py +76 -0
- julee/domain/models/document/tests/test_document.py +297 -0
- julee/domain/models/knowledge_service_config/__init__.py +17 -0
- julee/domain/models/knowledge_service_config/knowledge_service_config.py +86 -0
- julee/domain/models/policy/__init__.py +15 -0
- julee/domain/models/policy/document_policy_validation.py +220 -0
- julee/domain/models/policy/policy.py +203 -0
- julee/domain/models/policy/tests/__init__.py +0 -0
- julee/domain/models/policy/tests/factories.py +47 -0
- julee/domain/models/policy/tests/test_document_policy_validation.py +420 -0
- julee/domain/models/policy/tests/test_policy.py +546 -0
- julee/domain/repositories/__init__.py +27 -0
- julee/domain/repositories/assembly.py +45 -0
- julee/domain/repositories/assembly_specification.py +52 -0
- julee/domain/repositories/base.py +146 -0
- julee/domain/repositories/document.py +49 -0
- julee/domain/repositories/document_policy_validation.py +52 -0
- julee/domain/repositories/knowledge_service_config.py +54 -0
- julee/domain/repositories/knowledge_service_query.py +44 -0
- julee/domain/repositories/policy.py +49 -0
- julee/domain/use_cases/__init__.py +17 -0
- julee/domain/use_cases/decorators.py +107 -0
- julee/domain/use_cases/extract_assemble_data.py +649 -0
- julee/domain/use_cases/initialize_system_data.py +842 -0
- julee/domain/use_cases/tests/__init__.py +7 -0
- julee/domain/use_cases/tests/test_extract_assemble_data.py +548 -0
- julee/domain/use_cases/tests/test_initialize_system_data.py +455 -0
- julee/domain/use_cases/tests/test_validate_document.py +1228 -0
- julee/domain/use_cases/validate_document.py +736 -0
- julee/fixtures/assembly_specifications.yaml +70 -0
- julee/fixtures/documents.yaml +178 -0
- julee/fixtures/knowledge_service_configs.yaml +37 -0
- julee/fixtures/knowledge_service_queries.yaml +27 -0
- julee/repositories/__init__.py +17 -0
- julee/repositories/memory/__init__.py +31 -0
- julee/repositories/memory/assembly.py +84 -0
- julee/repositories/memory/assembly_specification.py +125 -0
- julee/repositories/memory/base.py +227 -0
- julee/repositories/memory/document.py +149 -0
- julee/repositories/memory/document_policy_validation.py +104 -0
- julee/repositories/memory/knowledge_service_config.py +123 -0
- julee/repositories/memory/knowledge_service_query.py +120 -0
- julee/repositories/memory/policy.py +87 -0
- julee/repositories/memory/tests/__init__.py +0 -0
- julee/repositories/memory/tests/test_document.py +212 -0
- julee/repositories/memory/tests/test_document_policy_validation.py +161 -0
- julee/repositories/memory/tests/test_policy.py +443 -0
- julee/repositories/minio/__init__.py +31 -0
- julee/repositories/minio/assembly.py +103 -0
- julee/repositories/minio/assembly_specification.py +170 -0
- julee/repositories/minio/client.py +570 -0
- julee/repositories/minio/document.py +530 -0
- julee/repositories/minio/document_policy_validation.py +120 -0
- julee/repositories/minio/knowledge_service_config.py +187 -0
- julee/repositories/minio/knowledge_service_query.py +211 -0
- julee/repositories/minio/policy.py +106 -0
- julee/repositories/minio/tests/__init__.py +0 -0
- julee/repositories/minio/tests/fake_client.py +213 -0
- julee/repositories/minio/tests/test_assembly.py +374 -0
- julee/repositories/minio/tests/test_assembly_specification.py +391 -0
- julee/repositories/minio/tests/test_client_protocol.py +57 -0
- julee/repositories/minio/tests/test_document.py +591 -0
- julee/repositories/minio/tests/test_document_policy_validation.py +192 -0
- julee/repositories/minio/tests/test_knowledge_service_config.py +374 -0
- julee/repositories/minio/tests/test_knowledge_service_query.py +438 -0
- julee/repositories/minio/tests/test_policy.py +559 -0
- julee/repositories/temporal/__init__.py +38 -0
- julee/repositories/temporal/activities.py +114 -0
- julee/repositories/temporal/activity_names.py +34 -0
- julee/repositories/temporal/proxies.py +159 -0
- julee/services/__init__.py +18 -0
- julee/services/knowledge_service/__init__.py +48 -0
- julee/services/knowledge_service/anthropic/__init__.py +12 -0
- julee/services/knowledge_service/anthropic/knowledge_service.py +331 -0
- julee/services/knowledge_service/anthropic/tests/test_knowledge_service.py +318 -0
- julee/services/knowledge_service/factory.py +138 -0
- julee/services/knowledge_service/knowledge_service.py +160 -0
- julee/services/knowledge_service/memory/__init__.py +13 -0
- julee/services/knowledge_service/memory/knowledge_service.py +278 -0
- julee/services/knowledge_service/memory/test_knowledge_service.py +345 -0
- julee/services/knowledge_service/test_factory.py +112 -0
- julee/services/temporal/__init__.py +38 -0
- julee/services/temporal/activities.py +86 -0
- julee/services/temporal/activity_names.py +22 -0
- julee/services/temporal/proxies.py +41 -0
- julee/util/__init__.py +0 -0
- julee/util/domain.py +119 -0
- julee/util/repos/__init__.py +0 -0
- julee/util/repos/minio/__init__.py +0 -0
- julee/util/repos/minio/file_storage.py +213 -0
- julee/util/repos/temporal/__init__.py +11 -0
- julee/util/repos/temporal/client_proxies/file_storage.py +68 -0
- julee/util/repos/temporal/data_converter.py +123 -0
- julee/util/repos/temporal/minio_file_storage.py +12 -0
- julee/util/repos/temporal/proxies/__init__.py +0 -0
- julee/util/repos/temporal/proxies/file_storage.py +58 -0
- julee/util/repositories.py +55 -0
- julee/util/temporal/__init__.py +22 -0
- julee/util/temporal/activities.py +123 -0
- julee/util/temporal/decorators.py +473 -0
- julee/util/tests/__init__.py +1 -0
- julee/util/tests/test_decorators.py +770 -0
- julee/util/validation/__init__.py +29 -0
- julee/util/validation/repository.py +100 -0
- julee/util/validation/type_guards.py +369 -0
- julee/worker.py +211 -0
- julee/workflows/__init__.py +26 -0
- julee/workflows/extract_assemble.py +215 -0
- julee/workflows/validate_document.py +228 -0
- julee-0.1.0.dist-info/METADATA +195 -0
- julee-0.1.0.dist-info/RECORD +161 -0
- julee-0.1.0.dist-info/WHEEL +5 -0
- julee-0.1.0.dist-info/licenses/LICENSE +674 -0
- julee-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,736 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Use case logic for document validation within the Capture, Extract, Assemble,
|
|
3
|
+
Publish workflow.
|
|
4
|
+
|
|
5
|
+
This module contains use case classes that orchestrate business logic while
|
|
6
|
+
remaining framework-agnostic. Dependencies are injected via repository
|
|
7
|
+
instances following the Clean Architecture principles.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import hashlib
|
|
11
|
+
import io
|
|
12
|
+
import json
|
|
13
|
+
import logging
|
|
14
|
+
from datetime import datetime
|
|
15
|
+
from typing import Callable, Dict, List, Tuple
|
|
16
|
+
|
|
17
|
+
import multihash
|
|
18
|
+
|
|
19
|
+
from julee.domain.models import (
|
|
20
|
+
ContentStream,
|
|
21
|
+
Document,
|
|
22
|
+
DocumentPolicyValidation,
|
|
23
|
+
DocumentStatus,
|
|
24
|
+
KnowledgeServiceQuery,
|
|
25
|
+
Policy,
|
|
26
|
+
)
|
|
27
|
+
from julee.domain.models.policy import (
|
|
28
|
+
DocumentPolicyValidationStatus,
|
|
29
|
+
)
|
|
30
|
+
from julee.domain.repositories import (
|
|
31
|
+
DocumentPolicyValidationRepository,
|
|
32
|
+
DocumentRepository,
|
|
33
|
+
KnowledgeServiceConfigRepository,
|
|
34
|
+
KnowledgeServiceQueryRepository,
|
|
35
|
+
PolicyRepository,
|
|
36
|
+
)
|
|
37
|
+
from julee.services import KnowledgeService
|
|
38
|
+
from julee.util.validation import ensure_repository_protocol
|
|
39
|
+
|
|
40
|
+
from .decorators import try_use_case_step
|
|
41
|
+
|
|
42
|
+
logger = logging.getLogger(__name__)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class ValidateDocumentUseCase:
|
|
46
|
+
"""
|
|
47
|
+
Use case for validating documents against policies.
|
|
48
|
+
|
|
49
|
+
This class orchestrates the business logic for document validation within
|
|
50
|
+
the Capture, Extract, Assemble, Publish workflow while remaining
|
|
51
|
+
framework-agnostic. It depends only on repository protocols, not
|
|
52
|
+
concrete implementations.
|
|
53
|
+
|
|
54
|
+
In workflow contexts, this use case is called from workflow code with
|
|
55
|
+
repository stubs that delegate to Temporal activities for durability.
|
|
56
|
+
The use case remains completely unaware of whether it's running in a
|
|
57
|
+
workflow context or a simple async context - it just calls repository
|
|
58
|
+
methods and expects them to work correctly.
|
|
59
|
+
|
|
60
|
+
Architectural Notes:
|
|
61
|
+
- This class contains pure business logic with no framework dependencies
|
|
62
|
+
- Repository dependencies are injected via constructor
|
|
63
|
+
(dependency inversion)
|
|
64
|
+
- All error handling and compensation logic is contained here
|
|
65
|
+
- The use case works with domain objects exclusively
|
|
66
|
+
- Deterministic execution is guaranteed by avoiding
|
|
67
|
+
non-deterministic operations
|
|
68
|
+
"""
|
|
69
|
+
|
|
70
|
+
def __init__(
    self,
    document_repo: DocumentRepository,
    knowledge_service_query_repo: KnowledgeServiceQueryRepository,
    knowledge_service_config_repo: KnowledgeServiceConfigRepository,
    policy_repo: PolicyRepository,
    document_policy_validation_repo: DocumentPolicyValidationRepository,
    knowledge_service: KnowledgeService,
    now_fn: Callable[[], datetime],
) -> None:
    """Wire up the collaborators used during document validation.

    Every repository argument is passed through
    ``ensure_repository_protocol`` together with its protocol class, and
    the value returned by that call is what gets stored on the instance.
    ``knowledge_service`` and ``now_fn`` are stored exactly as given,
    without that check.

    Args:
        document_repo: Repository for document operations
        knowledge_service_query_repo: Repository for knowledge service
            query operations
        knowledge_service_config_repo: Repository for knowledge service
            configuration operations
        policy_repo: Repository for policy operations
        document_policy_validation_repo: Repository for document policy
            validation operations
        knowledge_service: Knowledge service instance for external
            operations
        now_fn: Zero-argument callable returning the current time
            (e.g., workflow.now for Temporal workflows)

    Note:
        The repositories may be concrete implementations or workflow
        stubs; this class only calls the methods defined by the
        protocols and does not distinguish between the two.
    """
    # Collaborators that are stored without a protocol check.
    self.knowledge_service = knowledge_service
    self.now_fn = now_fn

    # Repositories are checked at construction time so that a
    # misconfigured dependency surfaces here rather than mid-validation.
    self.document_repo = ensure_repository_protocol(
        document_repo,
        DocumentRepository,  # type: ignore[type-abstract]
    )
    self.knowledge_service_query_repo = ensure_repository_protocol(
        knowledge_service_query_repo,
        KnowledgeServiceQueryRepository,  # type: ignore[type-abstract]
    )
    self.knowledge_service_config_repo = ensure_repository_protocol(
        knowledge_service_config_repo,
        KnowledgeServiceConfigRepository,  # type: ignore[type-abstract]
    )
    self.policy_repo = ensure_repository_protocol(
        policy_repo,
        PolicyRepository,  # type: ignore[type-abstract]
    )
    self.document_policy_validation_repo = ensure_repository_protocol(
        document_policy_validation_repo,
        DocumentPolicyValidationRepository,  # type: ignore[type-abstract]
    )
|
|
128
|
+
|
|
129
|
+
async def validate_document(
    self, document_id: str, policy_id: str
) -> DocumentPolicyValidation:
    """
    Validate a document against a policy and return the validation result.

    The validation record is saved through
    ``document_policy_validation_repo`` after every status change. The
    steps run in this order:

    1. Generate a validation ID
    2. Retrieve the document and the policy
    3. Create and save the initial record (PENDING)
    4. Mark the record IN_PROGRESS
    5. Retrieve every query referenced by the policy
    6. Register the document with the knowledge services
    7. Execute the validation queries and collect the scores
    8. Save the scores (VALIDATION_COMPLETE)
    9. Finish as PASSED/FAILED when the scores pass or the policy has no
       transformations
    10. Otherwise mark the record TRANSFORMATION_REQUIRED
    11. Apply the transformations (TRANSFORMATION_IN_PROGRESS, then
        TRANSFORMATION_COMPLETE)
    12. Register the transformed document with the knowledge services
    13. Re-run the validation queries on the transformed document
    14. Finish as PASSED/FAILED based on the post-transformation scores

    Failures in steps 1-3 propagate without touching the record. A
    failure in any later step marks the record as ERROR (with the error
    message and completion time), attempts to save it, logs the failure
    and re-raises the original exception.

    Args:
        document_id: ID of the document to validate
        policy_id: ID of the policy to validate against

    Returns:
        DocumentPolicyValidation with validation results

    Raises:
        ValueError: If required entities are not found or invalid
        Exception: Any other exception raised by a step is re-raised
            unchanged after the record has been marked as ERROR
    """
    logger.debug(
        "Starting document validation use case",
        extra={
            "document_id": document_id,
            "policy_id": policy_id,
        },
    )

    # Step 1: Generate unique validation ID
    validation_id = await self.document_policy_validation_repo.generate_id()

    # Step 2: Retrieve document and policy (validate they exist)
    document = await self._retrieve_document(document_id)
    policy = await self._retrieve_policy(policy_id)

    # Step 3: Create and store initial validation record
    validation = DocumentPolicyValidation(
        validation_id=validation_id,
        input_document_id=document_id,
        policy_id=policy_id,
        status=DocumentPolicyValidationStatus.PENDING,
        validation_scores=[],
        started_at=self.now_fn(),
    )

    await self.document_policy_validation_repo.save(validation)

    logger.debug(
        "Initial validation record created",
        extra={
            "validation_id": validation_id,
            "document_id": document_id,
            "policy_id": policy_id,
            "status": validation.status.value,
        },
    )

    try:
        # Step 4: Update status to in progress
        validation.status = DocumentPolicyValidationStatus.IN_PROGRESS
        await self.document_policy_validation_repo.save(validation)

        # Step 5: Retrieve all queries needed for this policy
        all_queries = await self._retrieve_all_queries(policy)

        # Step 6: Register the document with knowledge services
        document_registrations = await self._register_document_with_services(
            document, all_queries
        )

        # Step 7: Execute validation queries and calculate scores
        validation_scores = await self._execute_validation_queries(
            document,
            policy,
            document_registrations,
            all_queries,
        )

        # Step 8: Update validation with scores
        validation.validation_scores = validation_scores
        validation.status = DocumentPolicyValidationStatus.VALIDATION_COMPLETE
        await self.document_policy_validation_repo.save(validation)

        # Step 9: Check if transformations are needed
        initial_passed = self._determine_validation_result(
            validation_scores, policy.validation_scores
        )

        if initial_passed or not policy.has_transformations:
            # No transformations needed - either passed or no
            # transformations available
            final_status = (
                DocumentPolicyValidationStatus.PASSED
                if initial_passed
                else DocumentPolicyValidationStatus.FAILED
            )

            # A fresh record is built for the final state instead of
            # mutating the in-progress one.
            validation = DocumentPolicyValidation(
                validation_id=validation.validation_id,
                input_document_id=validation.input_document_id,
                policy_id=validation.policy_id,
                validation_scores=validation_scores,
                transformed_document_id=validation.transformed_document_id,
                post_transform_validation_scores=validation.post_transform_validation_scores,
                started_at=validation.started_at,
                completed_at=self.now_fn(),
                error_message=validation.error_message,
                status=final_status,
                passed=initial_passed,
            )

            await self.document_policy_validation_repo.save(validation)

            logger.info(
                "Document validation completed without transformations",
                extra={
                    "validation_id": validation_id,
                    "document_id": document_id,
                    "policy_id": policy_id,
                    "passed": initial_passed,
                    "validation_scores": validation_scores,
                },
            )

            return validation

        # Step 10: Initial validation failed and transformations are
        # available
        validation.status = DocumentPolicyValidationStatus.TRANSFORMATION_REQUIRED
        await self.document_policy_validation_repo.save(validation)

        logger.info(
            "Initial validation failed, applying transformations",
            extra={
                "validation_id": validation_id,
                "document_id": document_id,
                "policy_id": policy_id,
                "initial_scores": validation_scores,
            },
        )

        # Step 11: Apply transformations
        validation.status = (
            DocumentPolicyValidationStatus.TRANSFORMATION_IN_PROGRESS
        )
        await self.document_policy_validation_repo.save(validation)

        transformed_document = await self._apply_transformations(
            document,
            policy,
            all_queries,
            document_registrations,
        )

        validation.transformed_document_id = transformed_document.document_id
        validation.status = DocumentPolicyValidationStatus.TRANSFORMATION_COMPLETE
        await self.document_policy_validation_repo.save(validation)

        # Step 12: Register transformed document with knowledge services
        transformed_document_registrations = (
            await self._register_document_with_services(
                transformed_document, all_queries
            )
        )

        # Step 13: Re-run validation queries on transformed document
        validation.status = DocumentPolicyValidationStatus.IN_PROGRESS
        await self.document_policy_validation_repo.save(validation)

        post_transform_validation_scores = await self._execute_validation_queries(
            transformed_document,
            policy,
            transformed_document_registrations,
            all_queries,
        )

        # Step 14: Determine final result based on post-transformation
        # scores
        final_passed = self._determine_validation_result(
            post_transform_validation_scores, policy.validation_scores
        )

        final_status = (
            DocumentPolicyValidationStatus.PASSED
            if final_passed
            else DocumentPolicyValidationStatus.FAILED
        )

        validation = DocumentPolicyValidation(
            validation_id=validation.validation_id,
            input_document_id=validation.input_document_id,
            policy_id=validation.policy_id,
            validation_scores=validation_scores,
            transformed_document_id=transformed_document.document_id,
            post_transform_validation_scores=post_transform_validation_scores,
            started_at=validation.started_at,
            completed_at=self.now_fn(),
            error_message=validation.error_message,
            status=final_status,
            passed=final_passed,
        )

        await self.document_policy_validation_repo.save(validation)

        logger.info(
            "Document validation completed with transformations",
            extra={
                "validation_id": validation_id,
                "document_id": document_id,
                "policy_id": policy_id,
                "passed": final_passed,
                "initial_scores": validation_scores,
                "final_scores": post_transform_validation_scores,
                "transformed_document_id": (transformed_document.document_id),
            },
        )

        return validation

    except Exception as e:
        # Mark validation as failed due to error
        validation.status = DocumentPolicyValidationStatus.ERROR
        validation.error_message = str(e)
        validation.passed = False
        validation.completed_at = self.now_fn()

        # Persisting the error state is best-effort. If this save raised,
        # it would replace the exception that actually caused the failure
        # and skip the error log below, so a save failure is logged and
        # the original exception is still the one that propagates.
        try:
            await self.document_policy_validation_repo.save(validation)
        except Exception:
            logger.exception(
                "Failed to persist error state for document validation",
                extra={
                    "validation_id": validation_id,
                    "document_id": document_id,
                    "policy_id": policy_id,
                },
            )

        logger.error(
            "Document validation failed",
            extra={
                "validation_id": validation_id,
                "document_id": document_id,
                "policy_id": policy_id,
                "error": str(e),
            },
            exc_info=True,
        )
        # Bare raise re-raises the original exception ``e``: the inner
        # handler above has already finished by this point.
        raise
|
|
374
|
+
|
|
375
|
+
@try_use_case_step("document_retrieval")
async def _retrieve_document(self, document_id: str) -> Document:
    """Fetch a document from the document repository by ID.

    Raises:
        ValueError: If the repository returns a falsy value for
            ``document_id``.
    """
    found = await self.document_repo.get(document_id)
    if found:
        return found
    raise ValueError(f"Document not found: {document_id}")
|
|
382
|
+
|
|
383
|
+
@try_use_case_step("policy_retrieval")
async def _retrieve_policy(self, policy_id: str) -> Policy:
    """Fetch a policy from the policy repository by ID.

    Raises:
        ValueError: If the repository returns a falsy value for
            ``policy_id``.
    """
    found = await self.policy_repo.get(policy_id)
    if found:
        return found
    raise ValueError(f"Policy not found: {policy_id}")
|
|
390
|
+
|
|
391
|
+
@try_use_case_step("all_queries_retrieval")
async def _retrieve_all_queries(
    self, policy: Policy
) -> Dict[str, KnowledgeServiceQuery]:
    """Load every knowledge service query the policy refers to.

    Validation queries (the IDs in ``policy.validation_scores``) are
    fetched first, followed by ``policy.transformation_queries`` when that
    attribute is set and non-empty.

    Args:
        policy: The policy whose queries are needed

    Returns:
        Dict mapping query_id to the query returned by the repository

    Raises:
        ValueError: If the repository returns a falsy value for any ID
    """
    query_repo = self.knowledge_service_query_repo
    queries_by_id = {}

    # Validation queries; the required score is not needed at this stage.
    for query_id, _ in policy.validation_scores:
        validation_query = await query_repo.get(query_id)
        if not validation_query:
            raise ValueError(f"Validation query not found: {query_id}")
        queries_by_id[query_id] = validation_query

    # Transformation queries; a missing or empty list means none to load.
    for query_id in policy.transformation_queries or ():
        transformation_query = await query_repo.get(query_id)
        if not transformation_query:
            raise ValueError(f"Transformation query not found: {query_id}")
        queries_by_id[query_id] = transformation_query

    return queries_by_id
|
|
415
|
+
|
|
416
|
+
@try_use_case_step("document_registration")
async def _register_document_with_services(
    self,
    document: Document,
    queries: Dict[str, KnowledgeServiceQuery],
) -> Dict[str, str]:
    """
    Register the document once with each knowledge service that the given
    queries refer to.

    Args:
        document: The document to register
        queries: Dict of query_id to KnowledgeServiceQuery objects

    Returns:
        Dict mapping knowledge_service_id to the
        ``knowledge_service_file_id`` of the registration result

    Raises:
        ValueError: If the config repository returns a falsy value for a
            service ID
    """
    # Several queries may share a service, so collect the distinct IDs.
    service_ids = set()
    for query in queries.values():
        service_ids.add(query.knowledge_service_id)

    file_ids_by_service = {}
    for knowledge_service_id in service_ids:
        # Get the config for this service
        config = await self.knowledge_service_config_repo.get(knowledge_service_id)
        if not config:
            raise ValueError(
                f"Knowledge service config not found: {knowledge_service_id}"
            )

        registration = await self.knowledge_service.register_file(config, document)
        file_ids_by_service[knowledge_service_id] = (
            registration.knowledge_service_file_id
        )

    return file_ids_by_service
|
|
454
|
+
|
|
455
|
+
@try_use_case_step("validation_execution")
async def _execute_validation_queries(
    self,
    document: Document,
    policy: Policy,
    document_registrations: Dict[str, str],
    queries: Dict[str, KnowledgeServiceQuery],
) -> List[Tuple[str, int]]:
    """
    Run each validation query of the policy and collect the scores.

    Args:
        document: The document being validated (not read by this method)
        policy: The policy whose ``validation_scores`` drive the run
        document_registrations: Mapping of service_id to service_file_id
        queries: Dict of query_id to KnowledgeServiceQuery objects

    Returns:
        List of (query_id, actual_score) tuples, in policy order

    Raises:
        ValueError: If a service config is not found, or the document has
            no registration for the service a query targets
    """
    scored = []

    for query_id, required_score in policy.validation_scores:
        query = queries[query_id]
        service_id = query.knowledge_service_id

        # The service config is looked up again for every query.
        config = await self.knowledge_service_config_repo.get(service_id)
        if not config:
            raise ValueError(
                f"Knowledge service config not found: {query.knowledge_service_id}"
            )

        # The file ID comes from the earlier registration step.
        service_file_id = document_registrations.get(service_id)
        if not service_file_id:
            raise ValueError(
                f"Document not registered with service {query.knowledge_service_id}"
            )

        query_result = await self.knowledge_service.execute_query(
            config,
            query.prompt,
            [service_file_id],
            query.query_metadata,
            query.assistant_prompt,
        )

        actual_score = self._extract_score_from_result(query_result.result_data)
        scored.append((query_id, actual_score))

        logger.debug(
            "Validation query executed",
            extra={
                "query_id": query_id,
                "required_score": required_score,
                "actual_score": actual_score,
                "passed": actual_score >= required_score,
            },
        )

    return scored
|
|
522
|
+
|
|
523
|
+
def _extract_score_from_result(self, result_data: Dict) -> int:
|
|
524
|
+
"""
|
|
525
|
+
Extract a numeric score from the knowledge service query result.
|
|
526
|
+
|
|
527
|
+
Similar to _parse_query_result, but expects a numeric response.
|
|
528
|
+
Returns the actual score without range validation to preserve data
|
|
529
|
+
integrity.
|
|
530
|
+
"""
|
|
531
|
+
response_text = result_data.get("response", "")
|
|
532
|
+
if not response_text:
|
|
533
|
+
raise ValueError("Empty response from knowledge service")
|
|
534
|
+
|
|
535
|
+
# Try to parse response as integer directly
|
|
536
|
+
try:
|
|
537
|
+
score = int(response_text.strip())
|
|
538
|
+
return score
|
|
539
|
+
except ValueError as e:
|
|
540
|
+
raise ValueError(
|
|
541
|
+
f"Failed to parse numeric score from response: {response_text}"
|
|
542
|
+
) from e
|
|
543
|
+
|
|
544
|
+
def _determine_validation_result(
|
|
545
|
+
self,
|
|
546
|
+
actual_scores: List[Tuple[str, int]],
|
|
547
|
+
required_scores: List[Tuple[str, int]],
|
|
548
|
+
) -> bool:
|
|
549
|
+
"""
|
|
550
|
+
Determine if validation passed based on actual vs required scores.
|
|
551
|
+
|
|
552
|
+
Args:
|
|
553
|
+
actual_scores: List of (query_id, actual_score) tuples
|
|
554
|
+
required_scores: List of (query_id, required_score) tuples from
|
|
555
|
+
policy
|
|
556
|
+
|
|
557
|
+
Returns:
|
|
558
|
+
True if all required scores were met or exceeded, False otherwise
|
|
559
|
+
"""
|
|
560
|
+
# Convert to dictionaries for easier lookup
|
|
561
|
+
actual_scores_dict = dict(actual_scores)
|
|
562
|
+
required_scores_dict = dict(required_scores)
|
|
563
|
+
|
|
564
|
+
# Check if all required scores were met
|
|
565
|
+
for query_id, required_score in required_scores_dict.items():
|
|
566
|
+
actual_score = actual_scores_dict.get(query_id, 0)
|
|
567
|
+
if actual_score < required_score:
|
|
568
|
+
logger.debug(
|
|
569
|
+
"Validation failed for query",
|
|
570
|
+
extra={
|
|
571
|
+
"query_id": query_id,
|
|
572
|
+
"required_score": required_score,
|
|
573
|
+
"actual_score": actual_score,
|
|
574
|
+
},
|
|
575
|
+
)
|
|
576
|
+
return False
|
|
577
|
+
|
|
578
|
+
return True
|
|
579
|
+
|
|
580
|
+
@try_use_case_step("document_transformation")
async def _apply_transformations(
    self,
    document: Document,
    policy: Policy,
    all_queries: Dict[str, KnowledgeServiceQuery],
    document_registrations: Dict[str, str],
) -> Document:
    """
    Apply transformation queries to a document and return the
    transformed document.

    Each transformation query is executed, in policy order, against the
    file the document was registered as with that query's knowledge
    service. The content returned by the last query becomes the content of
    a newly created document, which is saved to the document repository
    before being returned.

    Args:
        document: The original document to transform
        policy: The policy containing transformation query IDs
        all_queries: Dict of all queries (validation and transformation)
        document_registrations: Mapping of service_id to service_file_id

    Returns:
        New Document object with transformed content

    Raises:
        ValueError: If the policy has no transformation queries, the
            document has no content stream, a transformation query or its
            knowledge service config cannot be found, the document is not
            registered with the query's service, or a query result is not
            valid JSON
    """
    if not policy.transformation_queries:
        raise ValueError("No transformation queries provided")

    logger.debug(
        "Applying transformations to document",
        extra={
            "document_id": document.document_id,
            "transformation_query_ids": policy.transformation_queries,
        },
    )

    source_stream = document.content
    if source_stream is None:
        raise ValueError("Document content stream is required for transformation")
    # The original bytes are never used below (every query result replaces
    # the content), so they are not read or decoded here; decoding would
    # only make non-UTF-8 documents fail. The stream is still rewound so it
    # is left at position 0.
    source_stream.seek(0)

    # NOTE(review): every query runs against the originally registered
    # file, so the queries are not chained; only the last query's result
    # ends up in the transformed document.
    transformed_content = None
    for query_id in policy.transformation_queries:
        try:
            query = all_queries[query_id]
        except KeyError as err:
            # Surface the documented ValueError instead of a bare KeyError
            raise ValueError(f"Transformation query not found: {query_id}") from err

        # Get the config for this service
        config = await self.knowledge_service_config_repo.get(
            query.knowledge_service_id
        )
        if not config:
            raise ValueError(
                f"Knowledge service config not found: {query.knowledge_service_id}"
            )

        # Get the service file ID from our registrations
        service_file_id = document_registrations.get(query.knowledge_service_id)
        if not service_file_id:
            raise ValueError(
                f"Document not registered with service {query.knowledge_service_id}"
            )

        # Execute the transformation query
        transformation_result = await self.knowledge_service.execute_query(
            config,
            query.prompt,
            [service_file_id],
            query.query_metadata,
            query.assistant_prompt,
        )

        # Extract transformed content from result
        transformed_content = self._extract_transformed_content(
            transformation_result.result_data
        )

        logger.debug(
            "Transformation query applied",
            extra={
                "query_id": query_id,
                "original_length": document.size_bytes,
                "transformed_length": len(transformed_content),
            },
        )

    if transformed_content is None:
        # Only reachable if the query collection was truthy yet yielded
        # nothing; fail the same way as an empty policy.
        raise ValueError("No transformation queries provided")

    # Create new document with transformed content
    transformed_document_id = await self.document_repo.generate_id()

    # Create content stream from transformed text
    transformed_bytes = transformed_content.encode("utf-8")
    transformed_stream = io.BytesIO(transformed_bytes)

    # Calculate multihash for transformed content
    sha256_hash = hashlib.sha256(transformed_bytes).digest()
    mhash = multihash.encode(sha256_hash, multihash.SHA2_256)
    proper_multihash = str(mhash.hex())

    transformed_document = Document(
        document_id=transformed_document_id,
        original_filename=f"transformed_{document.original_filename}",
        content_type=document.content_type,
        size_bytes=len(transformed_bytes),
        content_multihash=proper_multihash,
        status=DocumentStatus.CAPTURED,
        content=ContentStream(transformed_stream),
        created_at=self.now_fn(),
        updated_at=self.now_fn(),
    )

    # Save the transformed document
    await self.document_repo.save(transformed_document)

    logger.info(
        "Document transformation completed",
        extra={
            "original_document_id": document.document_id,
            "transformed_document_id": transformed_document.document_id,
            "original_size": document.size_bytes,
            "transformed_size": transformed_document.size_bytes,
        },
    )

    return transformed_document
|
|
706
|
+
|
|
707
|
+
def _extract_transformed_content(self, result_data: Dict) -> str:
|
|
708
|
+
"""
|
|
709
|
+
Extract transformed document content from knowledge service result.
|
|
710
|
+
|
|
711
|
+
Args:
|
|
712
|
+
result_data: Result data from knowledge service transformation
|
|
713
|
+
query
|
|
714
|
+
|
|
715
|
+
Returns:
|
|
716
|
+
Transformed document content as valid JSON string
|
|
717
|
+
|
|
718
|
+
Raises:
|
|
719
|
+
ValueError: If no valid JSON content can be extracted from result
|
|
720
|
+
"""
|
|
721
|
+
response_text = result_data.get("response", "")
|
|
722
|
+
if not response_text:
|
|
723
|
+
raise ValueError("Empty response from transformation query")
|
|
724
|
+
|
|
725
|
+
# The response must be valid JSON
|
|
726
|
+
stripped_response: str = response_text.strip()
|
|
727
|
+
try:
|
|
728
|
+
# Parse to validate JSON structure
|
|
729
|
+
json.loads(stripped_response)
|
|
730
|
+
# Return the original response text (preserving formatting)
|
|
731
|
+
return stripped_response
|
|
732
|
+
except json.JSONDecodeError as e:
|
|
733
|
+
raise ValueError(
|
|
734
|
+
f"Transformation result must be valid JSON, got: "
|
|
735
|
+
f"{response_text[:100]}... Parse error: {e}"
|
|
736
|
+
)
|