julee 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- julee/__init__.py +3 -0
- julee/api/__init__.py +20 -0
- julee/api/app.py +180 -0
- julee/api/dependencies.py +257 -0
- julee/api/requests.py +175 -0
- julee/api/responses.py +43 -0
- julee/api/routers/__init__.py +43 -0
- julee/api/routers/assembly_specifications.py +212 -0
- julee/api/routers/documents.py +182 -0
- julee/api/routers/knowledge_service_configs.py +79 -0
- julee/api/routers/knowledge_service_queries.py +293 -0
- julee/api/routers/system.py +137 -0
- julee/api/routers/workflows.py +234 -0
- julee/api/services/__init__.py +20 -0
- julee/api/services/system_initialization.py +214 -0
- julee/api/tests/__init__.py +14 -0
- julee/api/tests/routers/__init__.py +17 -0
- julee/api/tests/routers/test_assembly_specifications.py +749 -0
- julee/api/tests/routers/test_documents.py +301 -0
- julee/api/tests/routers/test_knowledge_service_configs.py +234 -0
- julee/api/tests/routers/test_knowledge_service_queries.py +738 -0
- julee/api/tests/routers/test_system.py +179 -0
- julee/api/tests/routers/test_workflows.py +393 -0
- julee/api/tests/test_app.py +285 -0
- julee/api/tests/test_dependencies.py +245 -0
- julee/api/tests/test_requests.py +250 -0
- julee/domain/__init__.py +22 -0
- julee/domain/models/__init__.py +49 -0
- julee/domain/models/assembly/__init__.py +17 -0
- julee/domain/models/assembly/assembly.py +103 -0
- julee/domain/models/assembly/tests/__init__.py +0 -0
- julee/domain/models/assembly/tests/factories.py +37 -0
- julee/domain/models/assembly/tests/test_assembly.py +430 -0
- julee/domain/models/assembly_specification/__init__.py +24 -0
- julee/domain/models/assembly_specification/assembly_specification.py +172 -0
- julee/domain/models/assembly_specification/knowledge_service_query.py +123 -0
- julee/domain/models/assembly_specification/tests/__init__.py +0 -0
- julee/domain/models/assembly_specification/tests/factories.py +78 -0
- julee/domain/models/assembly_specification/tests/test_assembly_specification.py +490 -0
- julee/domain/models/assembly_specification/tests/test_knowledge_service_query.py +310 -0
- julee/domain/models/custom_fields/__init__.py +0 -0
- julee/domain/models/custom_fields/content_stream.py +68 -0
- julee/domain/models/custom_fields/tests/__init__.py +0 -0
- julee/domain/models/custom_fields/tests/test_custom_fields.py +53 -0
- julee/domain/models/document/__init__.py +17 -0
- julee/domain/models/document/document.py +150 -0
- julee/domain/models/document/tests/__init__.py +0 -0
- julee/domain/models/document/tests/factories.py +76 -0
- julee/domain/models/document/tests/test_document.py +297 -0
- julee/domain/models/knowledge_service_config/__init__.py +17 -0
- julee/domain/models/knowledge_service_config/knowledge_service_config.py +86 -0
- julee/domain/models/policy/__init__.py +15 -0
- julee/domain/models/policy/document_policy_validation.py +220 -0
- julee/domain/models/policy/policy.py +203 -0
- julee/domain/models/policy/tests/__init__.py +0 -0
- julee/domain/models/policy/tests/factories.py +47 -0
- julee/domain/models/policy/tests/test_document_policy_validation.py +420 -0
- julee/domain/models/policy/tests/test_policy.py +546 -0
- julee/domain/repositories/__init__.py +27 -0
- julee/domain/repositories/assembly.py +45 -0
- julee/domain/repositories/assembly_specification.py +52 -0
- julee/domain/repositories/base.py +146 -0
- julee/domain/repositories/document.py +49 -0
- julee/domain/repositories/document_policy_validation.py +52 -0
- julee/domain/repositories/knowledge_service_config.py +54 -0
- julee/domain/repositories/knowledge_service_query.py +44 -0
- julee/domain/repositories/policy.py +49 -0
- julee/domain/use_cases/__init__.py +17 -0
- julee/domain/use_cases/decorators.py +107 -0
- julee/domain/use_cases/extract_assemble_data.py +649 -0
- julee/domain/use_cases/initialize_system_data.py +842 -0
- julee/domain/use_cases/tests/__init__.py +7 -0
- julee/domain/use_cases/tests/test_extract_assemble_data.py +548 -0
- julee/domain/use_cases/tests/test_initialize_system_data.py +455 -0
- julee/domain/use_cases/tests/test_validate_document.py +1228 -0
- julee/domain/use_cases/validate_document.py +736 -0
- julee/fixtures/assembly_specifications.yaml +70 -0
- julee/fixtures/documents.yaml +178 -0
- julee/fixtures/knowledge_service_configs.yaml +37 -0
- julee/fixtures/knowledge_service_queries.yaml +27 -0
- julee/repositories/__init__.py +17 -0
- julee/repositories/memory/__init__.py +31 -0
- julee/repositories/memory/assembly.py +84 -0
- julee/repositories/memory/assembly_specification.py +125 -0
- julee/repositories/memory/base.py +227 -0
- julee/repositories/memory/document.py +149 -0
- julee/repositories/memory/document_policy_validation.py +104 -0
- julee/repositories/memory/knowledge_service_config.py +123 -0
- julee/repositories/memory/knowledge_service_query.py +120 -0
- julee/repositories/memory/policy.py +87 -0
- julee/repositories/memory/tests/__init__.py +0 -0
- julee/repositories/memory/tests/test_document.py +212 -0
- julee/repositories/memory/tests/test_document_policy_validation.py +161 -0
- julee/repositories/memory/tests/test_policy.py +443 -0
- julee/repositories/minio/__init__.py +31 -0
- julee/repositories/minio/assembly.py +103 -0
- julee/repositories/minio/assembly_specification.py +170 -0
- julee/repositories/minio/client.py +570 -0
- julee/repositories/minio/document.py +530 -0
- julee/repositories/minio/document_policy_validation.py +120 -0
- julee/repositories/minio/knowledge_service_config.py +187 -0
- julee/repositories/minio/knowledge_service_query.py +211 -0
- julee/repositories/minio/policy.py +106 -0
- julee/repositories/minio/tests/__init__.py +0 -0
- julee/repositories/minio/tests/fake_client.py +213 -0
- julee/repositories/minio/tests/test_assembly.py +374 -0
- julee/repositories/minio/tests/test_assembly_specification.py +391 -0
- julee/repositories/minio/tests/test_client_protocol.py +57 -0
- julee/repositories/minio/tests/test_document.py +591 -0
- julee/repositories/minio/tests/test_document_policy_validation.py +192 -0
- julee/repositories/minio/tests/test_knowledge_service_config.py +374 -0
- julee/repositories/minio/tests/test_knowledge_service_query.py +438 -0
- julee/repositories/minio/tests/test_policy.py +559 -0
- julee/repositories/temporal/__init__.py +38 -0
- julee/repositories/temporal/activities.py +114 -0
- julee/repositories/temporal/activity_names.py +34 -0
- julee/repositories/temporal/proxies.py +159 -0
- julee/services/__init__.py +18 -0
- julee/services/knowledge_service/__init__.py +48 -0
- julee/services/knowledge_service/anthropic/__init__.py +12 -0
- julee/services/knowledge_service/anthropic/knowledge_service.py +331 -0
- julee/services/knowledge_service/anthropic/tests/test_knowledge_service.py +318 -0
- julee/services/knowledge_service/factory.py +138 -0
- julee/services/knowledge_service/knowledge_service.py +160 -0
- julee/services/knowledge_service/memory/__init__.py +13 -0
- julee/services/knowledge_service/memory/knowledge_service.py +278 -0
- julee/services/knowledge_service/memory/test_knowledge_service.py +345 -0
- julee/services/knowledge_service/test_factory.py +112 -0
- julee/services/temporal/__init__.py +38 -0
- julee/services/temporal/activities.py +86 -0
- julee/services/temporal/activity_names.py +22 -0
- julee/services/temporal/proxies.py +41 -0
- julee/util/__init__.py +0 -0
- julee/util/domain.py +119 -0
- julee/util/repos/__init__.py +0 -0
- julee/util/repos/minio/__init__.py +0 -0
- julee/util/repos/minio/file_storage.py +213 -0
- julee/util/repos/temporal/__init__.py +11 -0
- julee/util/repos/temporal/client_proxies/file_storage.py +68 -0
- julee/util/repos/temporal/data_converter.py +123 -0
- julee/util/repos/temporal/minio_file_storage.py +12 -0
- julee/util/repos/temporal/proxies/__init__.py +0 -0
- julee/util/repos/temporal/proxies/file_storage.py +58 -0
- julee/util/repositories.py +55 -0
- julee/util/temporal/__init__.py +22 -0
- julee/util/temporal/activities.py +123 -0
- julee/util/temporal/decorators.py +473 -0
- julee/util/tests/__init__.py +1 -0
- julee/util/tests/test_decorators.py +770 -0
- julee/util/validation/__init__.py +29 -0
- julee/util/validation/repository.py +100 -0
- julee/util/validation/type_guards.py +369 -0
- julee/worker.py +211 -0
- julee/workflows/__init__.py +26 -0
- julee/workflows/extract_assemble.py +215 -0
- julee/workflows/validate_document.py +228 -0
- julee-0.1.0.dist-info/METADATA +195 -0
- julee-0.1.0.dist-info/RECORD +161 -0
- julee-0.1.0.dist-info/WHEEL +5 -0
- julee-0.1.0.dist-info/licenses/LICENSE +674 -0
- julee-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,649 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Use case logic for data assembly within the Capture, Extract, Assemble,
|
|
3
|
+
Publish workflow.
|
|
4
|
+
|
|
5
|
+
This module contains use case classes that orchestrate business logic while
|
|
6
|
+
remaining framework-agnostic. Dependencies are injected via repository
|
|
7
|
+
instances following the Clean Architecture principles.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import hashlib
|
|
11
|
+
import json
|
|
12
|
+
import logging
|
|
13
|
+
from datetime import datetime, timezone
|
|
14
|
+
from typing import Any, Callable, Dict
|
|
15
|
+
|
|
16
|
+
import jsonpointer # type: ignore
|
|
17
|
+
import jsonschema
|
|
18
|
+
import multihash
|
|
19
|
+
|
|
20
|
+
from julee.domain.models import (
|
|
21
|
+
Assembly,
|
|
22
|
+
AssemblySpecification,
|
|
23
|
+
AssemblyStatus,
|
|
24
|
+
Document,
|
|
25
|
+
DocumentStatus,
|
|
26
|
+
KnowledgeServiceQuery,
|
|
27
|
+
)
|
|
28
|
+
from julee.domain.repositories import (
|
|
29
|
+
AssemblyRepository,
|
|
30
|
+
AssemblySpecificationRepository,
|
|
31
|
+
DocumentRepository,
|
|
32
|
+
KnowledgeServiceConfigRepository,
|
|
33
|
+
KnowledgeServiceQueryRepository,
|
|
34
|
+
)
|
|
35
|
+
from julee.services import KnowledgeService
|
|
36
|
+
from julee.util.validation import ensure_repository_protocol, validate_parameter_types
|
|
37
|
+
|
|
38
|
+
from .decorators import try_use_case_step
|
|
39
|
+
|
|
40
|
+
logger = logging.getLogger(__name__)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class ExtractAssembleDataUseCase:
|
|
44
|
+
"""
|
|
45
|
+
Use case for extracting and assembling documents according to
|
|
46
|
+
specifications.
|
|
47
|
+
|
|
48
|
+
This class orchestrates the business logic for the "Extract, Assemble"
|
|
49
|
+
phases of the Capture, Extract, Assemble, Publish workflow while remaining
|
|
50
|
+
framework-agnostic. It depends only on repository protocols, not
|
|
51
|
+
concrete implementations.
|
|
52
|
+
|
|
53
|
+
In workflow contexts, this use case is called from workflow code with
|
|
54
|
+
repository stubs that delegate to Temporal activities for durability.
|
|
55
|
+
The use case remains completely unaware of whether it's running in a
|
|
56
|
+
workflow context or a simple async context - it just calls repository
|
|
57
|
+
methods and expects them to work correctly.
|
|
58
|
+
|
|
59
|
+
Architectural Notes:
|
|
60
|
+
- This class contains pure business logic with no framework dependencies
|
|
61
|
+
- Repository dependencies are injected via constructor
|
|
62
|
+
(dependency inversion)
|
|
63
|
+
- All error handling and compensation logic is contained here
|
|
64
|
+
- The use case works with domain objects exclusively
|
|
65
|
+
- Deterministic execution is guaranteed by avoiding
|
|
66
|
+
non-deterministic operations
|
|
67
|
+
"""
|
|
68
|
+
|
|
69
|
+
def __init__(
    self,
    document_repo: DocumentRepository,
    assembly_repo: AssemblyRepository,
    assembly_specification_repo: AssemblySpecificationRepository,
    knowledge_service_query_repo: KnowledgeServiceQueryRepository,
    knowledge_service_config_repo: KnowledgeServiceConfigRepository,
    knowledge_service: KnowledgeService,
    now_fn: Callable[[], datetime] = lambda: datetime.now(timezone.utc),
) -> None:
    """Wire up the collaborators used by the extract/assemble use case.

    Args:
        document_repo: Repository used to load the input document and to
            store the assembled document.
        assembly_repo: Repository used to generate assembly IDs and to
            persist assembly state.
        assembly_specification_repo: Repository used to load the assembly
            specification.
        knowledge_service_query_repo: Repository used to load knowledge
            service queries.
        knowledge_service_config_repo: Repository used to load knowledge
            service configurations.
        knowledge_service: Service used to register files and execute
            queries.
        now_fn: Zero-argument callable returning the current time. The
            default returns the current UTC time; callers can inject
            their own clock (for workflow compatibility).

    Note:
        Every repository is passed through ``ensure_repository_protocol``
        together with its protocol class, so a misconfigured repository is
        reported at construction time rather than on first use.
        ``knowledge_service`` and ``now_fn`` are stored as given.
    """
    # Collaborators that are stored without a protocol check.
    self.knowledge_service = knowledge_service
    self.now_fn = now_fn

    # Repositories: checked against their protocols, in declaration order.
    self.document_repo = ensure_repository_protocol(
        document_repo,
        DocumentRepository,  # type: ignore[type-abstract]
    )
    self.assembly_repo = ensure_repository_protocol(
        assembly_repo,
        AssemblyRepository,  # type: ignore[type-abstract]
    )
    self.assembly_specification_repo = ensure_repository_protocol(
        assembly_specification_repo,
        AssemblySpecificationRepository,  # type: ignore[type-abstract]
    )
    self.knowledge_service_query_repo = ensure_repository_protocol(
        knowledge_service_query_repo,
        KnowledgeServiceQueryRepository,  # type: ignore[type-abstract]
    )
    self.knowledge_service_config_repo = ensure_repository_protocol(
        knowledge_service_config_repo,
        KnowledgeServiceConfigRepository,  # type: ignore[type-abstract]
    )
|
|
126
|
+
|
|
127
|
+
async def assemble_data(
    self,
    document_id: str,
    assembly_specification_id: str,
    workflow_id: str,
) -> Assembly:
    """
    Assemble a document according to its specification and create a new
    assembly.

    Steps performed:
    1. Generate an assembly ID.
    2. Retrieve the assembly specification.
    3. Store the initial assembly with status IN_PROGRESS.
    4. Retrieve all knowledge service queries named by the specification.
    5. Retrieve the input document.
    6. Register the document with the knowledge services the queries use.
    7. Run the assembly iteration to create the assembled document.
    8. Record the assembled document ID, mark the assembly COMPLETED,
       save it and return it.

    Every step after the initial save (4-8) runs inside one ``try`` block,
    so a failure in any of them marks the stored assembly FAILED before
    the exception is re-raised. Failures in steps 1-3 happen before an
    assembly record exists and propagate directly.

    Args:
        document_id: ID of the document to assemble
        assembly_specification_id: ID of the specification to use
        workflow_id: Temporal workflow ID that creates this assembly

    Returns:
        The Assembly, with ``assembled_document_id`` set and status
        COMPLETED.

    Raises:
        Exception: Whatever a step raises is re-raised unchanged after the
            assembly has been marked FAILED. The steps themselves raise
            ValueError for missing entities; how ``try_use_case_step``
            treats those is defined in the decorator, not here.
    """
    logger.debug(
        "Starting data assembly use case",
        extra={
            "document_id": document_id,
            "assembly_specification_id": assembly_specification_id,
            "workflow_id": workflow_id,
        },
    )

    # Step 1: Generate unique assembly ID
    assembly_id = await self._generate_assembly_id(
        document_id, assembly_specification_id
    )

    # Step 2: Retrieve the assembly specification
    assembly_specification = await self._retrieve_assembly_specification(
        assembly_specification_id
    )

    # Step 3: Store the initial assembly
    assembly = Assembly(
        assembly_id=assembly_id,
        assembly_specification_id=assembly_specification_id,
        input_document_id=document_id,
        workflow_id=workflow_id,
        status=AssemblyStatus.IN_PROGRESS,
        assembled_document_id=None,
        created_at=self.now_fn(),
        updated_at=self.now_fn(),
    )
    await self.assembly_repo.save(assembly)

    logger.debug(
        "Initial assembly stored",
        extra={
            "assembly_id": assembly_id,
            "status": assembly.status.value,
        },
    )

    # From here on an IN_PROGRESS record exists, so every remaining step
    # must be covered by the failure handler below; otherwise a failed
    # lookup or registration would leave the record IN_PROGRESS forever.
    try:
        # Step 4: Retrieve all knowledge service queries once
        queries = await self._retrieve_all_queries(assembly_specification)

        # Step 5: Retrieve the input document
        document = await self._retrieve_document(document_id)

        # Step 6: Register the document with knowledge services
        document_registrations = await self._register_document_with_services(
            document, queries
        )

        # Step 7: Perform the assembly iteration
        assembled_document_id = await self._assemble_iteration(
            document,
            assembly_specification,
            document_registrations,
            queries,
        )

        # Step 8: Set the assembled document and return
        assembly.assembled_document_id = assembled_document_id
        assembly.status = AssemblyStatus.COMPLETED
        await self.assembly_repo.save(assembly)

        logger.info(
            "Assembly completed successfully",
            extra={
                "assembly_id": assembly_id,
                "assembled_document_id": assembled_document_id,
            },
        )

        return assembly

    except Exception as e:
        # Mark assembly as failed, then let the original error propagate.
        assembly.status = AssemblyStatus.FAILED
        await self.assembly_repo.save(assembly)

        logger.error(
            "Assembly failed",
            extra={
                "assembly_id": assembly_id,
                "error": str(e),
            },
            exc_info=True,
        )
        raise
|
|
247
|
+
|
|
248
|
+
@try_use_case_step("document_registration")
@validate_parameter_types()
async def _register_document_with_services(
    self,
    document: Document,
    queries: Dict[str, KnowledgeServiceQuery],
) -> Dict[str, str]:
    """
    Register the document with all knowledge services needed for assembly.

    This is a temporary solution - document registration will be handled
    properly in a separate process later.

    Each distinct ``knowledge_service_id`` referenced by ``queries`` is
    registered exactly once. Services are visited in the order in which
    they first appear in ``queries``, so the sequence of repository and
    service calls is the same on every run. Iterating a ``set`` of string
    IDs would not give that guarantee, because string hashing is
    randomised per process.

    Args:
        document: The document to register
        queries: Dict of query_id to KnowledgeServiceQuery objects

    Returns:
        Dict mapping knowledge_service_id to the
        ``knowledge_service_file_id`` returned by the registration

    Raises:
        ValueError: If no configuration exists for a required service
    """
    registrations: Dict[str, str] = {}

    # dict.fromkeys de-duplicates while preserving first-seen order.
    required_service_ids = dict.fromkeys(
        query.knowledge_service_id for query in queries.values()
    )

    for knowledge_service_id in required_service_ids:
        # Get the config for this service
        config = await self.knowledge_service_config_repo.get(knowledge_service_id)
        if not config:
            raise ValueError(
                f"Knowledge service config not found: {knowledge_service_id}"
            )

        registration_result = await self.knowledge_service.register_file(
            config, document
        )
        registrations[knowledge_service_id] = (
            registration_result.knowledge_service_file_id
        )

    return registrations
|
|
293
|
+
|
|
294
|
+
@try_use_case_step("queries_retrieval")
async def _retrieve_all_queries(
    self, assembly_specification: AssemblySpecification
) -> Dict[str, KnowledgeServiceQuery]:
    """Load every knowledge service query referenced by the specification.

    The query IDs are the values of
    ``assembly_specification.knowledge_service_queries``. Each one is
    fetched with its own ``get`` call and the results are returned keyed
    by query ID.

    Raises:
        ValueError: If the repository returns nothing for a query ID.
    """
    # TODO: TEMPORAL SERIALIZATION ISSUE - switch to get_many once fixed.
    #
    # Problem: get_many returns Dict[str, Optional[KnowledgeServiceQuery]].
    # Temporal's type resolution cannot handle that nested generic
    # (Dict[str, Optional[T]]), falls back to typing.Any and hands that to
    # the data converter, which then deserializes the Pydantic models as
    # plain dictionaries instead of model instances.
    #
    # Observed error: "SERIALIZATION ISSUE DETECTED: parameter
    # 'queries'['query-id'] is dict instead of KnowledgeServiceQuery!"
    #
    # What the investigation showed:
    # - data converter debug output confirms the typing.Any fallback
    # - repository type resolution works correctly
    # - the guard check system detects exactly this issue
    # - simpler types (Optional[T]) work fine
    #
    # Workaround in place: individual get() calls, which return Optional[T]
    # and are handled correctly by Temporal.
    #
    # Possible long-term fixes:
    # 1. Fix Temporal's type resolution for complex nested generics
    # 2. Create a custom data converter for this specific type pattern
    # 3. Simplify the repository interface to avoid Optional in batch
    #    operations
    wanted_ids = tuple(assembly_specification.knowledge_service_queries.values())

    resolved = {}
    for wanted_id in wanted_ids:
        fetched = await self.knowledge_service_query_repo.get(wanted_id)
        if not fetched:
            raise ValueError(f"Knowledge service query not found: {wanted_id}")
        resolved[wanted_id] = fetched
    return resolved
|
|
341
|
+
|
|
342
|
+
@try_use_case_step("assembly_iteration")
async def _assemble_iteration(
    self,
    document: Document,
    assembly_specification: AssemblySpecification,
    document_registrations: Dict[str, str],
    queries: Dict[str, KnowledgeServiceQuery],
) -> str:
    """
    Run one assembly pass and return the ID of the assembled document.

    For every (schema pointer, query ID) pair in the specification the
    method extracts the matching schema section, looks up the query and
    its service configuration, executes the query against the registered
    file and stores the parsed result at the pointer's location. The
    combined data is then validated against the specification's JSON
    schema and saved as a new document.

    Args:
        document: The input document (not read directly here; the
            registered file IDs stand in for it)
        assembly_specification: The specification defining how to assemble
        document_registrations: Mapping of service_id to service_file_id
        queries: Dict of query_id to KnowledgeServiceQuery objects

    Returns:
        ID of the newly created assembled document

    Raises:
        ValueError: If a service configuration is missing or the document
            is not registered with a required service; the helpers called
            here also raise ValueError for bad pointers, unparsable
            responses and schema violations
        KeyError: If a query ID from the specification is not in
            ``queries``
    """
    # Accumulates the results of all queries.
    collected: Dict[str, Any] = {}

    # TODO: This is where we may want to fan-out/fan-in to do these
    # in parallel.
    pointer_to_query_id = assembly_specification.knowledge_service_queries
    for pointer, query_key in pointer_to_query_id.items():
        # Schema fragment this query has to satisfy
        fragment = self._extract_schema_section(
            assembly_specification.jsonschema, pointer
        )

        # Query definition and the service it targets
        query = queries[query_key]
        service_config = await self.knowledge_service_config_repo.get(
            query.knowledge_service_id
        )
        if not service_config:
            raise ValueError(
                f"Knowledge service config not found: {query.knowledge_service_id}"
            )

        # File ID under which the document was registered with that service
        registered_file_id = document_registrations.get(query.knowledge_service_id)
        if not registered_file_id:
            raise ValueError(
                f"Document not registered with service {query.knowledge_service_id}"
            )

        # Prompt with the schema fragment embedded, then run the query
        prompt_text = self._build_query_with_schema(query.prompt, fragment)
        outcome = await self.knowledge_service.execute_query(
            service_config,
            prompt_text,
            [registered_file_id],
            query.query_metadata,
            query.assistant_prompt,
        )

        # Decode the response and slot it into the accumulated data
        self._store_result_in_assembled_data(
            collected,
            pointer,
            self._parse_query_result(outcome.result_data),
        )

    # The combined data must satisfy the full JSON schema before it is saved
    self._validate_assembled_data(collected, assembly_specification)

    return await self._create_assembled_document(collected, assembly_specification)
|
|
433
|
+
|
|
434
|
+
@try_use_case_step("assembly_id_generation")
async def _generate_assembly_id(
    self, document_id: str, assembly_specification_id: str
) -> str:
    """Ask the assembly repository for a new assembly ID.

    ``document_id`` and ``assembly_specification_id`` are accepted but not
    used; the ID comes solely from ``assembly_repo.generate_id()``.
    """
    new_assembly_id = await self.assembly_repo.generate_id()
    return new_assembly_id
|
|
440
|
+
|
|
441
|
+
@try_use_case_step("assembly_specification_retrieval")
async def _retrieve_assembly_specification(
    self, assembly_specification_id: str
) -> AssemblySpecification:
    """Fetch the assembly specification with the given ID.

    Raises:
        ValueError: If the repository returns nothing for the ID.
    """
    found = await self.assembly_specification_repo.get(assembly_specification_id)
    if found:
        return found
    raise ValueError(
        f"Assembly specification not found: {assembly_specification_id}"
    )
|
|
454
|
+
|
|
455
|
+
@try_use_case_step("document_retrieval")
async def _retrieve_document(self, document_id: str) -> Document:
    """Fetch the document with the given ID.

    Raises:
        ValueError: If the repository returns nothing for the ID.
    """
    found = await self.document_repo.get(document_id)
    if found:
        return found
    raise ValueError(f"Document not found: {document_id}")
|
|
462
|
+
|
|
463
|
+
def _extract_schema_section(
|
|
464
|
+
self, jsonschema: Dict[str, Any], schema_pointer: str
|
|
465
|
+
) -> Any:
|
|
466
|
+
"""Extract relevant section of JSON schema using JSON Pointer."""
|
|
467
|
+
if not schema_pointer:
|
|
468
|
+
# Empty pointer refers to the entire schema
|
|
469
|
+
return jsonschema
|
|
470
|
+
|
|
471
|
+
try:
|
|
472
|
+
ptr = jsonpointer.JsonPointer(schema_pointer)
|
|
473
|
+
result = ptr.resolve(jsonschema)
|
|
474
|
+
return result
|
|
475
|
+
except (jsonpointer.JsonPointerException, KeyError, TypeError) as e:
|
|
476
|
+
raise ValueError(f"Cannot extract schema section '{schema_pointer}': {e}")
|
|
477
|
+
|
|
478
|
+
def _build_query_with_schema(self, base_prompt: str, schema_section: Any) -> str:
|
|
479
|
+
"""Build the query text with embedded JSON schema section."""
|
|
480
|
+
schema_json = json.dumps(schema_section, indent=2)
|
|
481
|
+
return f"""{base_prompt}
|
|
482
|
+
|
|
483
|
+
Please structure your response according to this JSON schema:
|
|
484
|
+
{schema_json}
|
|
485
|
+
|
|
486
|
+
Return only valid JSON that conforms to this schema, without any surrounding
|
|
487
|
+
text or markdown formatting."""
|
|
488
|
+
|
|
489
|
+
def _parse_query_result(self, result_data: Dict[str, Any]) -> Any:
|
|
490
|
+
"""Parse the query result to extract the JSON response."""
|
|
491
|
+
response_text = result_data.get("response", "")
|
|
492
|
+
if not response_text:
|
|
493
|
+
raise ValueError("Empty response from knowledge service")
|
|
494
|
+
|
|
495
|
+
# Response must be valid JSON
|
|
496
|
+
try:
|
|
497
|
+
parsed_result = json.loads(response_text.strip())
|
|
498
|
+
return parsed_result
|
|
499
|
+
except json.JSONDecodeError as e:
|
|
500
|
+
raise ValueError(
|
|
501
|
+
f"Knowledge service response must be valid JSON. "
|
|
502
|
+
f"Complete response: {response_text} "
|
|
503
|
+
f"Parse error: {e}"
|
|
504
|
+
)
|
|
505
|
+
|
|
506
|
+
def _store_result_in_assembled_data(
|
|
507
|
+
self,
|
|
508
|
+
assembled_data: Dict[str, Any],
|
|
509
|
+
schema_pointer: str,
|
|
510
|
+
result_data: Any,
|
|
511
|
+
) -> None:
|
|
512
|
+
"""Store query result in appropriate location in assembled data."""
|
|
513
|
+
if not schema_pointer:
|
|
514
|
+
# Root level - merge the entire result if it's a dict,
|
|
515
|
+
# otherwise store as-is
|
|
516
|
+
if isinstance(result_data, dict):
|
|
517
|
+
assembled_data.update(result_data)
|
|
518
|
+
else:
|
|
519
|
+
# Can't merge non-dict at root level, this would be an error
|
|
520
|
+
raise ValueError("Cannot merge non-dict result data at root level")
|
|
521
|
+
else:
|
|
522
|
+
# Use JSON Pointer to set the data at the correct location
|
|
523
|
+
try:
|
|
524
|
+
# Convert pointer to path components, skipping "properties"
|
|
525
|
+
# wrapper
|
|
526
|
+
path_parts = (
|
|
527
|
+
schema_pointer.strip("/").split("/")
|
|
528
|
+
if schema_pointer.strip("/")
|
|
529
|
+
else []
|
|
530
|
+
)
|
|
531
|
+
|
|
532
|
+
# Remove "properties" from path if it exists (schema artifact)
|
|
533
|
+
if path_parts and path_parts[0] == "properties":
|
|
534
|
+
path_parts = path_parts[1:]
|
|
535
|
+
|
|
536
|
+
# If no path parts left, store at root level
|
|
537
|
+
if not path_parts:
|
|
538
|
+
if isinstance(result_data, dict):
|
|
539
|
+
assembled_data.update(result_data)
|
|
540
|
+
else:
|
|
541
|
+
# Can't merge non-dict at root level, this would be
|
|
542
|
+
# an error
|
|
543
|
+
raise ValueError(
|
|
544
|
+
"Cannot merge non-dict result data at root level"
|
|
545
|
+
)
|
|
546
|
+
return
|
|
547
|
+
|
|
548
|
+
# Navigate/create the nested structure
|
|
549
|
+
current = assembled_data
|
|
550
|
+
for part in path_parts[:-1]:
|
|
551
|
+
if part not in current:
|
|
552
|
+
current[part] = {}
|
|
553
|
+
current = current[part]
|
|
554
|
+
|
|
555
|
+
# Set the final value
|
|
556
|
+
current[path_parts[-1]] = result_data
|
|
557
|
+
|
|
558
|
+
except (KeyError, TypeError) as e:
|
|
559
|
+
raise ValueError(
|
|
560
|
+
f"Cannot store result at schema pointer '{schema_pointer}': {e}"
|
|
561
|
+
)
|
|
562
|
+
|
|
563
|
+
@try_use_case_step("assembled_document_creation")
async def _create_assembled_document(
    self,
    assembled_data: Dict[str, Any],
    assembly_specification: AssemblySpecification,
) -> str:
    """Serialize the assembled data, save it as a document, return its ID.

    The data is dumped as indented JSON and stored inline in the
    document's ``content_string``. Size and multihash are computed from
    the UTF-8 encoding of that JSON text. The filename is derived from the
    specification name with spaces replaced by underscores.
    """
    new_document_id = await self.document_repo.generate_id()

    # JSON text and its byte form (the latter feeds size and hash)
    json_text = json.dumps(assembled_data, indent=2)
    json_bytes = json_text.encode("utf-8")

    filename = f"assembled_{assembly_specification.name.replace(' ', '_')}.json"
    content_hash = self._calculate_multihash_from_content(json_bytes)

    await self.document_repo.save(
        Document(
            document_id=new_document_id,
            original_filename=filename,
            content_type="application/json",
            size_bytes=len(json_bytes),
            content_multihash=content_hash,
            status=DocumentStatus.ASSEMBLED,
            # Content is carried inline as a string
            content_string=json_text,
            created_at=self.now_fn(),
            updated_at=self.now_fn(),
        )
    )

    return new_document_id
|
|
596
|
+
|
|
597
|
+
def _validate_assembled_data(
    self,
    assembled_data: Dict[str, Any],
    assembly_specification: AssemblySpecification,
) -> None:
    """Validate that the assembled data conforms to the JSON schema.

    Runs ``jsonschema.validate`` against the specification's schema, logs
    the outcome and converts both failure modes into ``ValueError``.

    Args:
        assembled_data: The data produced by the assembly iteration
        assembly_specification: Specification whose ``jsonschema`` is the
            schema to validate against

    Raises:
        ValueError: If the data violates the schema, or if the schema
            itself is invalid. The original ``jsonschema`` exception is
            attached as ``__cause__``.
    """
    # Used in every log record below.
    specification_id = assembly_specification.assembly_specification_id

    try:
        jsonschema.validate(assembled_data, assembly_specification.jsonschema)
        logger.debug(
            "Assembled data validation passed",
            extra={"assembly_specification_id": specification_id},
        )
    except jsonschema.ValidationError as e:
        logger.error(
            "Assembled data validation failed",
            extra={
                "assembly_specification_id": specification_id,
                "validation_error": str(e),
                "error_path": (list(e.absolute_path) if e.absolute_path else []),
                "schema_path": (list(e.schema_path) if e.schema_path else []),
            },
        )
        raise ValueError(
            f"Assembled data does not conform to JSON schema: {e.message}"
        ) from e
    except jsonschema.SchemaError as e:
        logger.error(
            "JSON schema is invalid",
            extra={
                "assembly_specification_id": specification_id,
                "schema_error": str(e),
            },
        )
        raise ValueError(
            f"Invalid JSON schema in assembly specification: {e.message}"
        ) from e
|
|
641
|
+
|
|
642
|
+
def _calculate_multihash_from_content(self, content_bytes: bytes) -> str:
    """Return the hex-encoded SHA-256 multihash of the given bytes."""
    # Raw SHA-256 digest of the content
    digest = hashlib.sha256(content_bytes).digest()

    # Wrap the digest as a multihash tagged SHA2_256 (code 0x12)
    wrapped = multihash.encode(digest, multihash.SHA2_256)

    hex_form = wrapped.hex()
    return str(hex_form)
|