julee 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. julee/__init__.py +3 -0
  2. julee/api/__init__.py +20 -0
  3. julee/api/app.py +180 -0
  4. julee/api/dependencies.py +257 -0
  5. julee/api/requests.py +175 -0
  6. julee/api/responses.py +43 -0
  7. julee/api/routers/__init__.py +43 -0
  8. julee/api/routers/assembly_specifications.py +212 -0
  9. julee/api/routers/documents.py +182 -0
  10. julee/api/routers/knowledge_service_configs.py +79 -0
  11. julee/api/routers/knowledge_service_queries.py +293 -0
  12. julee/api/routers/system.py +137 -0
  13. julee/api/routers/workflows.py +234 -0
  14. julee/api/services/__init__.py +20 -0
  15. julee/api/services/system_initialization.py +214 -0
  16. julee/api/tests/__init__.py +14 -0
  17. julee/api/tests/routers/__init__.py +17 -0
  18. julee/api/tests/routers/test_assembly_specifications.py +749 -0
  19. julee/api/tests/routers/test_documents.py +301 -0
  20. julee/api/tests/routers/test_knowledge_service_configs.py +234 -0
  21. julee/api/tests/routers/test_knowledge_service_queries.py +738 -0
  22. julee/api/tests/routers/test_system.py +179 -0
  23. julee/api/tests/routers/test_workflows.py +393 -0
  24. julee/api/tests/test_app.py +285 -0
  25. julee/api/tests/test_dependencies.py +245 -0
  26. julee/api/tests/test_requests.py +250 -0
  27. julee/domain/__init__.py +22 -0
  28. julee/domain/models/__init__.py +49 -0
  29. julee/domain/models/assembly/__init__.py +17 -0
  30. julee/domain/models/assembly/assembly.py +103 -0
  31. julee/domain/models/assembly/tests/__init__.py +0 -0
  32. julee/domain/models/assembly/tests/factories.py +37 -0
  33. julee/domain/models/assembly/tests/test_assembly.py +430 -0
  34. julee/domain/models/assembly_specification/__init__.py +24 -0
  35. julee/domain/models/assembly_specification/assembly_specification.py +172 -0
  36. julee/domain/models/assembly_specification/knowledge_service_query.py +123 -0
  37. julee/domain/models/assembly_specification/tests/__init__.py +0 -0
  38. julee/domain/models/assembly_specification/tests/factories.py +78 -0
  39. julee/domain/models/assembly_specification/tests/test_assembly_specification.py +490 -0
  40. julee/domain/models/assembly_specification/tests/test_knowledge_service_query.py +310 -0
  41. julee/domain/models/custom_fields/__init__.py +0 -0
  42. julee/domain/models/custom_fields/content_stream.py +68 -0
  43. julee/domain/models/custom_fields/tests/__init__.py +0 -0
  44. julee/domain/models/custom_fields/tests/test_custom_fields.py +53 -0
  45. julee/domain/models/document/__init__.py +17 -0
  46. julee/domain/models/document/document.py +150 -0
  47. julee/domain/models/document/tests/__init__.py +0 -0
  48. julee/domain/models/document/tests/factories.py +76 -0
  49. julee/domain/models/document/tests/test_document.py +297 -0
  50. julee/domain/models/knowledge_service_config/__init__.py +17 -0
  51. julee/domain/models/knowledge_service_config/knowledge_service_config.py +86 -0
  52. julee/domain/models/policy/__init__.py +15 -0
  53. julee/domain/models/policy/document_policy_validation.py +220 -0
  54. julee/domain/models/policy/policy.py +203 -0
  55. julee/domain/models/policy/tests/__init__.py +0 -0
  56. julee/domain/models/policy/tests/factories.py +47 -0
  57. julee/domain/models/policy/tests/test_document_policy_validation.py +420 -0
  58. julee/domain/models/policy/tests/test_policy.py +546 -0
  59. julee/domain/repositories/__init__.py +27 -0
  60. julee/domain/repositories/assembly.py +45 -0
  61. julee/domain/repositories/assembly_specification.py +52 -0
  62. julee/domain/repositories/base.py +146 -0
  63. julee/domain/repositories/document.py +49 -0
  64. julee/domain/repositories/document_policy_validation.py +52 -0
  65. julee/domain/repositories/knowledge_service_config.py +54 -0
  66. julee/domain/repositories/knowledge_service_query.py +44 -0
  67. julee/domain/repositories/policy.py +49 -0
  68. julee/domain/use_cases/__init__.py +17 -0
  69. julee/domain/use_cases/decorators.py +107 -0
  70. julee/domain/use_cases/extract_assemble_data.py +649 -0
  71. julee/domain/use_cases/initialize_system_data.py +842 -0
  72. julee/domain/use_cases/tests/__init__.py +7 -0
  73. julee/domain/use_cases/tests/test_extract_assemble_data.py +548 -0
  74. julee/domain/use_cases/tests/test_initialize_system_data.py +455 -0
  75. julee/domain/use_cases/tests/test_validate_document.py +1228 -0
  76. julee/domain/use_cases/validate_document.py +736 -0
  77. julee/fixtures/assembly_specifications.yaml +70 -0
  78. julee/fixtures/documents.yaml +178 -0
  79. julee/fixtures/knowledge_service_configs.yaml +37 -0
  80. julee/fixtures/knowledge_service_queries.yaml +27 -0
  81. julee/repositories/__init__.py +17 -0
  82. julee/repositories/memory/__init__.py +31 -0
  83. julee/repositories/memory/assembly.py +84 -0
  84. julee/repositories/memory/assembly_specification.py +125 -0
  85. julee/repositories/memory/base.py +227 -0
  86. julee/repositories/memory/document.py +149 -0
  87. julee/repositories/memory/document_policy_validation.py +104 -0
  88. julee/repositories/memory/knowledge_service_config.py +123 -0
  89. julee/repositories/memory/knowledge_service_query.py +120 -0
  90. julee/repositories/memory/policy.py +87 -0
  91. julee/repositories/memory/tests/__init__.py +0 -0
  92. julee/repositories/memory/tests/test_document.py +212 -0
  93. julee/repositories/memory/tests/test_document_policy_validation.py +161 -0
  94. julee/repositories/memory/tests/test_policy.py +443 -0
  95. julee/repositories/minio/__init__.py +31 -0
  96. julee/repositories/minio/assembly.py +103 -0
  97. julee/repositories/minio/assembly_specification.py +170 -0
  98. julee/repositories/minio/client.py +570 -0
  99. julee/repositories/minio/document.py +530 -0
  100. julee/repositories/minio/document_policy_validation.py +120 -0
  101. julee/repositories/minio/knowledge_service_config.py +187 -0
  102. julee/repositories/minio/knowledge_service_query.py +211 -0
  103. julee/repositories/minio/policy.py +106 -0
  104. julee/repositories/minio/tests/__init__.py +0 -0
  105. julee/repositories/minio/tests/fake_client.py +213 -0
  106. julee/repositories/minio/tests/test_assembly.py +374 -0
  107. julee/repositories/minio/tests/test_assembly_specification.py +391 -0
  108. julee/repositories/minio/tests/test_client_protocol.py +57 -0
  109. julee/repositories/minio/tests/test_document.py +591 -0
  110. julee/repositories/minio/tests/test_document_policy_validation.py +192 -0
  111. julee/repositories/minio/tests/test_knowledge_service_config.py +374 -0
  112. julee/repositories/minio/tests/test_knowledge_service_query.py +438 -0
  113. julee/repositories/minio/tests/test_policy.py +559 -0
  114. julee/repositories/temporal/__init__.py +38 -0
  115. julee/repositories/temporal/activities.py +114 -0
  116. julee/repositories/temporal/activity_names.py +34 -0
  117. julee/repositories/temporal/proxies.py +159 -0
  118. julee/services/__init__.py +18 -0
  119. julee/services/knowledge_service/__init__.py +48 -0
  120. julee/services/knowledge_service/anthropic/__init__.py +12 -0
  121. julee/services/knowledge_service/anthropic/knowledge_service.py +331 -0
  122. julee/services/knowledge_service/anthropic/tests/test_knowledge_service.py +318 -0
  123. julee/services/knowledge_service/factory.py +138 -0
  124. julee/services/knowledge_service/knowledge_service.py +160 -0
  125. julee/services/knowledge_service/memory/__init__.py +13 -0
  126. julee/services/knowledge_service/memory/knowledge_service.py +278 -0
  127. julee/services/knowledge_service/memory/test_knowledge_service.py +345 -0
  128. julee/services/knowledge_service/test_factory.py +112 -0
  129. julee/services/temporal/__init__.py +38 -0
  130. julee/services/temporal/activities.py +86 -0
  131. julee/services/temporal/activity_names.py +22 -0
  132. julee/services/temporal/proxies.py +41 -0
  133. julee/util/__init__.py +0 -0
  134. julee/util/domain.py +119 -0
  135. julee/util/repos/__init__.py +0 -0
  136. julee/util/repos/minio/__init__.py +0 -0
  137. julee/util/repos/minio/file_storage.py +213 -0
  138. julee/util/repos/temporal/__init__.py +11 -0
  139. julee/util/repos/temporal/client_proxies/file_storage.py +68 -0
  140. julee/util/repos/temporal/data_converter.py +123 -0
  141. julee/util/repos/temporal/minio_file_storage.py +12 -0
  142. julee/util/repos/temporal/proxies/__init__.py +0 -0
  143. julee/util/repos/temporal/proxies/file_storage.py +58 -0
  144. julee/util/repositories.py +55 -0
  145. julee/util/temporal/__init__.py +22 -0
  146. julee/util/temporal/activities.py +123 -0
  147. julee/util/temporal/decorators.py +473 -0
  148. julee/util/tests/__init__.py +1 -0
  149. julee/util/tests/test_decorators.py +770 -0
  150. julee/util/validation/__init__.py +29 -0
  151. julee/util/validation/repository.py +100 -0
  152. julee/util/validation/type_guards.py +369 -0
  153. julee/worker.py +211 -0
  154. julee/workflows/__init__.py +26 -0
  155. julee/workflows/extract_assemble.py +215 -0
  156. julee/workflows/validate_document.py +228 -0
  157. julee-0.1.0.dist-info/METADATA +195 -0
  158. julee-0.1.0.dist-info/RECORD +161 -0
  159. julee-0.1.0.dist-info/WHEEL +5 -0
  160. julee-0.1.0.dist-info/licenses/LICENSE +674 -0
  161. julee-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,649 @@
1
+ """
2
+ Use case logic for data assembly within the Capture, Extract, Assemble,
3
+ Publish workflow.
4
+
5
+ This module contains use case classes that orchestrate business logic while
6
+ remaining framework-agnostic. Dependencies are injected via repository
7
+ instances following the Clean Architecture principles.
8
+ """
9
+
10
+ import hashlib
11
+ import json
12
+ import logging
13
+ from datetime import datetime, timezone
14
+ from typing import Any, Callable, Dict
15
+
16
+ import jsonpointer # type: ignore
17
+ import jsonschema
18
+ import multihash
19
+
20
+ from julee.domain.models import (
21
+ Assembly,
22
+ AssemblySpecification,
23
+ AssemblyStatus,
24
+ Document,
25
+ DocumentStatus,
26
+ KnowledgeServiceQuery,
27
+ )
28
+ from julee.domain.repositories import (
29
+ AssemblyRepository,
30
+ AssemblySpecificationRepository,
31
+ DocumentRepository,
32
+ KnowledgeServiceConfigRepository,
33
+ KnowledgeServiceQueryRepository,
34
+ )
35
+ from julee.services import KnowledgeService
36
+ from julee.util.validation import ensure_repository_protocol, validate_parameter_types
37
+
38
+ from .decorators import try_use_case_step
39
+
40
+ logger = logging.getLogger(__name__)
41
+
42
+
43
+ class ExtractAssembleDataUseCase:
44
+ """
45
+ Use case for extracting and assembling documents according to
46
+ specifications.
47
+
48
+ This class orchestrates the business logic for the "Extract, Assemble"
49
+ phases of the Capture, Extract, Assemble, Publish workflow while remaining
50
+ framework-agnostic. It depends only on repository protocols, not
51
+ concrete implementations.
52
+
53
+ In workflow contexts, this use case is called from workflow code with
54
+ repository stubs that delegate to Temporal activities for durability.
55
+ The use case remains completely unaware of whether it's running in a
56
+ workflow context or a simple async context - it just calls repository
57
+ methods and expects them to work correctly.
58
+
59
+ Architectural Notes:
60
+ - This class contains pure business logic with no framework dependencies
61
+ - Repository dependencies are injected via constructor
62
+ (dependency inversion)
63
+ - All error handling and compensation logic is contained here
64
+ - The use case works with domain objects exclusively
65
+ - Deterministic execution is guaranteed by avoiding
66
+ non-deterministic operations
67
+ """
68
+
69
def __init__(
    self,
    document_repo: DocumentRepository,
    assembly_repo: AssemblyRepository,
    assembly_specification_repo: AssemblySpecificationRepository,
    knowledge_service_query_repo: KnowledgeServiceQueryRepository,
    knowledge_service_config_repo: KnowledgeServiceConfigRepository,
    knowledge_service: KnowledgeService,
    now_fn: Callable[[], datetime] = lambda: datetime.now(timezone.utc),
) -> None:
    """Wire up the collaborators used by the extract/assemble use case.

    Args:
        document_repo: Repository used to load the input document and to
            store the assembled one.
        assembly_repo: Repository used to generate assembly IDs and to
            persist assemblies.
        assembly_specification_repo: Repository used to load assembly
            specifications.
        knowledge_service_query_repo: Repository used to load knowledge
            service queries.
        knowledge_service_config_repo: Repository used to load knowledge
            service configurations.
        knowledge_service: Service used to register files and execute
            queries.
        now_fn: Zero-argument callable returning the current time;
            defaults to the current UTC time. Injectable so callers
            (e.g. workflows) can supply their own clock.

    Note:
        Each repository is passed through ``ensure_repository_protocol``
        here, so a mis-wired dependency is reported at construction
        time rather than on first use. The knowledge service and
        ``now_fn`` are stored as given.
    """
    # Plain collaborators: stored without validation.
    self.knowledge_service = knowledge_service
    self.now_fn = now_fn

    # Repositories: validated one by one, in the same order as before so
    # the first reported configuration error is unchanged.
    self.document_repo = ensure_repository_protocol(
        document_repo,
        DocumentRepository,  # type: ignore[type-abstract]
    )
    self.assembly_repo = ensure_repository_protocol(
        assembly_repo,
        AssemblyRepository,  # type: ignore[type-abstract]
    )
    self.assembly_specification_repo = ensure_repository_protocol(
        assembly_specification_repo,
        AssemblySpecificationRepository,  # type: ignore[type-abstract]
    )
    self.knowledge_service_query_repo = ensure_repository_protocol(
        knowledge_service_query_repo,
        KnowledgeServiceQueryRepository,  # type: ignore[type-abstract]
    )
    self.knowledge_service_config_repo = ensure_repository_protocol(
        knowledge_service_config_repo,
        KnowledgeServiceConfigRepository,  # type: ignore[type-abstract]
    )
126
+
127
async def assemble_data(
    self,
    document_id: str,
    assembly_specification_id: str,
    workflow_id: str,
) -> Assembly:
    """
    Assemble a document according to its specification and create a new
    assembly.

    This method orchestrates the core assembly workflow:
    1. Generates a unique assembly ID
    2. Retrieves the assembly specification
    3. Stores the initial assembly (status IN_PROGRESS) in the repository
    4. Retrieves all knowledge service queries needed for the assembly
    5. Retrieves the input document
    6. Registers the document with the required knowledge services
    7. Performs the assembly iteration to create the assembled document
    8. Records the assembled document on the assembly, marks it
       COMPLETED, saves it and returns it

    Any failure in steps 4-8 marks the already-stored assembly as FAILED,
    saves it, logs the error and re-raises the original exception, so a
    persisted assembly is never left IN_PROGRESS after this method exits
    with an error.

    Args:
        document_id: ID of the document to assemble
        assembly_specification_id: ID of the specification to use
        workflow_id: Temporal workflow ID that creates this assembly

    Returns:
        The Assembly, with status COMPLETED and assembled_document_id set

    Raises:
        ValueError: If required entities are not found or invalid
        RuntimeError: If assembly processing fails
    """
    logger.debug(
        "Starting data assembly use case",
        extra={
            "document_id": document_id,
            "assembly_specification_id": assembly_specification_id,
            "workflow_id": workflow_id,
        },
    )

    # Step 1: Generate unique assembly ID
    assembly_id = await self._generate_assembly_id(
        document_id, assembly_specification_id
    )

    # Step 2: Retrieve the assembly specification
    assembly_specification = await self._retrieve_assembly_specification(
        assembly_specification_id
    )

    # Step 3: Store the initial assembly. A single timestamp is used so
    # that created_at and updated_at agree on a freshly created record.
    created_at = self.now_fn()
    assembly = Assembly(
        assembly_id=assembly_id,
        assembly_specification_id=assembly_specification_id,
        input_document_id=document_id,
        workflow_id=workflow_id,
        status=AssemblyStatus.IN_PROGRESS,
        assembled_document_id=None,
        created_at=created_at,
        updated_at=created_at,
    )
    await self.assembly_repo.save(assembly)

    logger.debug(
        "Initial assembly stored",
        extra={
            "assembly_id": assembly_id,
            "status": assembly.status.value,
        },
    )

    # Everything after the initial save runs inside the try block: once
    # the assembly is persisted, every failure path must flip it to
    # FAILED rather than leave it dangling as IN_PROGRESS.
    try:
        # Step 4: Retrieve all knowledge service queries once
        queries = await self._retrieve_all_queries(assembly_specification)

        # Step 5: Retrieve the input document
        document = await self._retrieve_document(document_id)

        # Step 6: Register the document with knowledge services
        document_registrations = await self._register_document_with_services(
            document, queries
        )

        # Step 7: Perform the assembly iteration
        assembled_document_id = await self._assemble_iteration(
            document,
            assembly_specification,
            document_registrations,
            queries,
        )

        # Step 8: Set the assembled document and return
        assembly.assembled_document_id = assembled_document_id
        assembly.status = AssemblyStatus.COMPLETED
        await self.assembly_repo.save(assembly)

        logger.info(
            "Assembly completed successfully",
            extra={
                "assembly_id": assembly_id,
                "assembled_document_id": assembled_document_id,
            },
        )

        return assembly

    except Exception as e:
        # Mark assembly as failed
        assembly.status = AssemblyStatus.FAILED
        await self.assembly_repo.save(assembly)

        logger.error(
            "Assembly failed",
            extra={
                "assembly_id": assembly_id,
                "error": str(e),
            },
            exc_info=True,
        )
        raise
247
+
248
@try_use_case_step("document_registration")
@validate_parameter_types()
async def _register_document_with_services(
    self,
    document: Document,
    queries: Dict[str, KnowledgeServiceQuery],
) -> Dict[str, str]:
    """
    Register the document with all knowledge services needed for assembly.

    This is a temporary solution - document registration will be handled
    properly in a separate process later.

    Each distinct knowledge service referenced by ``queries`` is visited
    exactly once, in the order in which it first appears in ``queries``.
    The order is deliberately stable: iterating a ``set`` of strings can
    yield a different order in every process, which would make the
    sequence of repository/service calls non-reproducible.

    Args:
        document: The document to register
        queries: Dict of query_id to KnowledgeServiceQuery objects

    Returns:
        Dict mapping knowledge_service_id to service_file_id

    Raises:
        ValueError: If the config for a required service cannot be found
            (NOTE(review): the ``try_use_case_step`` decorator may
            translate or wrap errors - confirm against its definition)
    """
    registrations: Dict[str, str] = {}

    # dict.fromkeys() de-duplicates like a set but keeps first-seen order.
    required_service_ids = dict.fromkeys(
        query.knowledge_service_id for query in queries.values()
    )

    for knowledge_service_id in required_service_ids:
        # Get the config for this service
        config = await self.knowledge_service_config_repo.get(knowledge_service_id)
        if not config:
            raise ValueError(
                f"Knowledge service config not found: {knowledge_service_id}"
            )

        registration_result = await self.knowledge_service.register_file(
            config, document
        )
        registrations[knowledge_service_id] = (
            registration_result.knowledge_service_file_id
        )

    return registrations
293
+
294
@try_use_case_step("queries_retrieval")
async def _retrieve_all_queries(
    self, assembly_specification: AssemblySpecification
) -> Dict[str, KnowledgeServiceQuery]:
    """Load every knowledge service query referenced by the specification.

    Args:
        assembly_specification: Specification whose
            ``knowledge_service_queries`` values are the query IDs to load.

    Returns:
        Dict mapping each query ID to its KnowledgeServiceQuery.

    Raises:
        ValueError: If any referenced query cannot be found.
    """
    # TODO: TEMPORAL SERIALIZATION ISSUE - switch to get_many once fixed.
    #
    # Why one get() per ID instead of a single get_many():
    # get_many returns Dict[str, Optional[KnowledgeServiceQuery]]. With
    # that nested generic, Temporal's type resolution falls back to
    # typing.Any and the data converter hands back plain dicts instead of
    # Pydantic model instances. The guard check then reports:
    #   "SERIALIZATION ISSUE DETECTED: parameter
    #   'queries'['query-id'] is dict instead of KnowledgeServiceQuery!"
    #
    # What the investigation established:
    # - data converter debug output confirmed the typing.Any fallback
    # - repository type resolution itself works correctly
    # - the guard check system pinpoints exactly this problem
    # - simpler types such as Optional[T] round-trip fine
    #
    # Possible long-term fixes:
    # 1. Fix Temporal's type resolution for complex nested generics
    # 2. Create a custom data converter for this specific type pattern
    # 3. Simplify the repository interface to avoid Optional in batch
    #    operations
    #
    # Until then, individual get() calls (returning Optional[T]) are used.
    wanted_ids = [*assembly_specification.knowledge_service_queries.values()]

    loaded: Dict[str, KnowledgeServiceQuery] = {}
    for query_id in wanted_ids:
        found = await self.knowledge_service_query_repo.get(query_id)
        if not found:
            raise ValueError(f"Knowledge service query not found: {query_id}")
        loaded[query_id] = found

    return loaded
341
+
342
@try_use_case_step("assembly_iteration")
async def _assemble_iteration(
    self,
    document: Document,
    assembly_specification: AssemblySpecification,
    document_registrations: Dict[str, str],
    queries: Dict[str, KnowledgeServiceQuery],
) -> str:
    """
    Run one assembly pass and return the ID of the resulting document.

    For every (schema pointer, query ID) pair in the specification this
    method extracts the matching schema section, runs the query against
    the knowledge service with that section embedded in the prompt, and
    stores the parsed answer at the pointer's location. The combined data
    is then validated against the full schema and saved as a new document.

    Args:
        document: The input document (not read by this method itself)
        assembly_specification: The specification defining how to assemble
        document_registrations: Mapping of service_id to service_file_id
        queries: Dict of query_id to KnowledgeServiceQuery objects

    Returns:
        ID of the newly created assembled document

    Raises:
        ValueError: If a service config is missing, the document is not
            registered with a required service, a response is not valid
            JSON, or the assembled data fails schema validation
        KeyError: If a query ID from the specification is absent from
            ``queries``
    """
    # Accumulates the answers of all queries.
    assembled_data: Dict[str, Any] = {}
    pointer_to_query = assembly_specification.knowledge_service_queries

    # TODO: This is where we may want to fan-out/fan-in to do these
    # in parallel.
    for schema_pointer, query_id in pointer_to_query.items():
        # Schema fragment the answer to this query must satisfy
        schema_section = self._extract_schema_section(
            assembly_specification.jsonschema, schema_pointer
        )

        # Query definition, pre-loaded by the caller
        query = queries[query_id]

        # Configuration of the service that will answer the query
        config = await self.knowledge_service_config_repo.get(
            query.knowledge_service_id
        )
        if not config:
            raise ValueError(
                f"Knowledge service config not found: {query.knowledge_service_id}"
            )

        # File handle the service assigned when the document was registered
        service_file_id = document_registrations.get(query.knowledge_service_id)
        if not service_file_id:
            raise ValueError(
                f"Document not registered with service {query.knowledge_service_id}"
            )

        # Ask the service, with the schema fragment embedded in the prompt
        query_result = await self.knowledge_service.execute_query(
            config,
            self._build_query_with_schema(query.prompt, schema_section),
            [service_file_id],
            query.query_metadata,
            query.assistant_prompt,
        )

        # Decode the answer and slot it into the assembled structure
        self._store_result_in_assembled_data(
            assembled_data,
            schema_pointer,
            self._parse_query_result(query_result.result_data),
        )

    # The complete structure must satisfy the specification's schema
    self._validate_assembled_data(assembled_data, assembly_specification)

    # Persist the result as a new document and hand back its ID
    return await self._create_assembled_document(
        assembled_data, assembly_specification
    )
433
+
434
@try_use_case_step("assembly_id_generation")
async def _generate_assembly_id(
    self, document_id: str, assembly_specification_id: str
) -> str:
    """Ask the assembly repository for a fresh assembly ID.

    Both arguments are accepted for interface compatibility but are not
    used when generating the ID.
    """
    new_assembly_id = await self.assembly_repo.generate_id()
    return new_assembly_id
440
+
441
@try_use_case_step("assembly_specification_retrieval")
async def _retrieve_assembly_specification(
    self, assembly_specification_id: str
) -> AssemblySpecification:
    """Load an assembly specification by ID.

    Raises:
        ValueError: If the repository returns nothing for the given ID.
    """
    found = await self.assembly_specification_repo.get(assembly_specification_id)
    if found:
        return found
    raise ValueError(
        f"Assembly specification not found: {assembly_specification_id}"
    )
454
+
455
@try_use_case_step("document_retrieval")
async def _retrieve_document(self, document_id: str) -> Document:
    """Load a document by ID.

    Raises:
        ValueError: If the repository returns nothing for the given ID.
    """
    found = await self.document_repo.get(document_id)
    if found:
        return found
    raise ValueError(f"Document not found: {document_id}")
462
+
463
+ def _extract_schema_section(
464
+ self, jsonschema: Dict[str, Any], schema_pointer: str
465
+ ) -> Any:
466
+ """Extract relevant section of JSON schema using JSON Pointer."""
467
+ if not schema_pointer:
468
+ # Empty pointer refers to the entire schema
469
+ return jsonschema
470
+
471
+ try:
472
+ ptr = jsonpointer.JsonPointer(schema_pointer)
473
+ result = ptr.resolve(jsonschema)
474
+ return result
475
+ except (jsonpointer.JsonPointerException, KeyError, TypeError) as e:
476
+ raise ValueError(f"Cannot extract schema section '{schema_pointer}': {e}")
477
+
478
+ def _build_query_with_schema(self, base_prompt: str, schema_section: Any) -> str:
479
+ """Build the query text with embedded JSON schema section."""
480
+ schema_json = json.dumps(schema_section, indent=2)
481
+ return f"""{base_prompt}
482
+
483
+ Please structure your response according to this JSON schema:
484
+ {schema_json}
485
+
486
+ Return only valid JSON that conforms to this schema, without any surrounding
487
+ text or markdown formatting."""
488
+
489
+ def _parse_query_result(self, result_data: Dict[str, Any]) -> Any:
490
+ """Parse the query result to extract the JSON response."""
491
+ response_text = result_data.get("response", "")
492
+ if not response_text:
493
+ raise ValueError("Empty response from knowledge service")
494
+
495
+ # Response must be valid JSON
496
+ try:
497
+ parsed_result = json.loads(response_text.strip())
498
+ return parsed_result
499
+ except json.JSONDecodeError as e:
500
+ raise ValueError(
501
+ f"Knowledge service response must be valid JSON. "
502
+ f"Complete response: {response_text} "
503
+ f"Parse error: {e}"
504
+ )
505
+
506
+ def _store_result_in_assembled_data(
507
+ self,
508
+ assembled_data: Dict[str, Any],
509
+ schema_pointer: str,
510
+ result_data: Any,
511
+ ) -> None:
512
+ """Store query result in appropriate location in assembled data."""
513
+ if not schema_pointer:
514
+ # Root level - merge the entire result if it's a dict,
515
+ # otherwise store as-is
516
+ if isinstance(result_data, dict):
517
+ assembled_data.update(result_data)
518
+ else:
519
+ # Can't merge non-dict at root level, this would be an error
520
+ raise ValueError("Cannot merge non-dict result data at root level")
521
+ else:
522
+ # Use JSON Pointer to set the data at the correct location
523
+ try:
524
+ # Convert pointer to path components, skipping "properties"
525
+ # wrapper
526
+ path_parts = (
527
+ schema_pointer.strip("/").split("/")
528
+ if schema_pointer.strip("/")
529
+ else []
530
+ )
531
+
532
+ # Remove "properties" from path if it exists (schema artifact)
533
+ if path_parts and path_parts[0] == "properties":
534
+ path_parts = path_parts[1:]
535
+
536
+ # If no path parts left, store at root level
537
+ if not path_parts:
538
+ if isinstance(result_data, dict):
539
+ assembled_data.update(result_data)
540
+ else:
541
+ # Can't merge non-dict at root level, this would be
542
+ # an error
543
+ raise ValueError(
544
+ "Cannot merge non-dict result data at root level"
545
+ )
546
+ return
547
+
548
+ # Navigate/create the nested structure
549
+ current = assembled_data
550
+ for part in path_parts[:-1]:
551
+ if part not in current:
552
+ current[part] = {}
553
+ current = current[part]
554
+
555
+ # Set the final value
556
+ current[path_parts[-1]] = result_data
557
+
558
+ except (KeyError, TypeError) as e:
559
+ raise ValueError(
560
+ f"Cannot store result at schema pointer '{schema_pointer}': {e}"
561
+ )
562
+
563
@try_use_case_step("assembled_document_creation")
async def _create_assembled_document(
    self,
    assembled_data: Dict[str, Any],
    assembly_specification: AssemblySpecification,
) -> str:
    """Serialise the assembled data, save it as a document, return its ID.

    The data is written as indented JSON (``indent=2``), encoded as UTF-8
    for size and multihash calculation, and stored inline on the document
    via ``content_string``. The filename is derived from the specification
    name with spaces replaced by underscores.

    Args:
        assembled_data: The validated assembled structure.
        assembly_specification: Specification the data was assembled for.

    Returns:
        ID of the saved document.
    """
    # ID first, so the repository call order matches the stored record
    new_document_id = await self.document_repo.generate_id()

    # JSON text and its byte form (bytes drive size and hash)
    json_text = json.dumps(assembled_data, indent=2)
    json_bytes = json_text.encode("utf-8")

    safe_name = assembly_specification.name.replace(" ", "_")

    await self.document_repo.save(
        Document(
            document_id=new_document_id,
            original_filename=f"assembled_{safe_name}.json",
            content_type="application/json",
            size_bytes=len(json_bytes),
            content_multihash=self._calculate_multihash_from_content(json_bytes),
            status=DocumentStatus.ASSEMBLED,
            content_string=json_text,  # inline content for small documents
            created_at=self.now_fn(),
            updated_at=self.now_fn(),
        )
    )

    return new_document_id
596
+
597
def _validate_assembled_data(
    self,
    assembled_data: Dict[str, Any],
    assembly_specification: AssemblySpecification,
) -> None:
    """Check the assembled data against the specification's JSON schema.

    Args:
        assembled_data: The structure produced by the assembly iteration.
        assembly_specification: Specification whose ``jsonschema`` the data
            must satisfy.

    Raises:
        ValueError: If the data does not conform to the schema, or the
            schema itself is invalid. Both cases are logged at error level
            before raising.
    """
    try:
        jsonschema.validate(assembled_data, assembly_specification.jsonschema)
    except jsonschema.ValidationError as e:
        # Data problem: record where in the data and the schema it failed.
        failure_details = {
            "assembly_specification_id": (
                assembly_specification.assembly_specification_id
            ),
            "validation_error": str(e),
            "error_path": (list(e.absolute_path) if e.absolute_path else []),
            "schema_path": (list(e.schema_path) if e.schema_path else []),
        }
        logger.error("Assembled data validation failed", extra=failure_details)
        raise ValueError(
            f"Assembled data does not conform to JSON schema: {e.message}"
        )
    except jsonschema.SchemaError as e:
        # Specification problem: the schema itself is malformed.
        failure_details = {
            "assembly_specification_id": (
                assembly_specification.assembly_specification_id
            ),
            "schema_error": str(e),
        }
        logger.error("JSON schema is invalid", extra=failure_details)
        raise ValueError(
            f"Invalid JSON schema in assembly specification: {e.message}"
        )
    else:
        logger.debug(
            "Assembled data validation passed",
            extra={
                "assembly_specification_id": (
                    assembly_specification.assembly_specification_id
                ),
            },
        )
641
+
642
def _calculate_multihash_from_content(self, content_bytes: bytes) -> str:
    """Return the hex-encoded SHA-256 multihash of ``content_bytes``.

    The raw SHA-256 digest is wrapped with ``multihash.encode`` using the
    ``multihash.SHA2_256`` code, and the result is returned as a hex string.
    """
    digest = hashlib.sha256(content_bytes).digest()
    encoded = multihash.encode(digest, multihash.SHA2_256)
    return str(encoded.hex())