julee 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. julee/__init__.py +3 -0
  2. julee/api/__init__.py +20 -0
  3. julee/api/app.py +180 -0
  4. julee/api/dependencies.py +257 -0
  5. julee/api/requests.py +175 -0
  6. julee/api/responses.py +43 -0
  7. julee/api/routers/__init__.py +43 -0
  8. julee/api/routers/assembly_specifications.py +212 -0
  9. julee/api/routers/documents.py +182 -0
  10. julee/api/routers/knowledge_service_configs.py +79 -0
  11. julee/api/routers/knowledge_service_queries.py +293 -0
  12. julee/api/routers/system.py +137 -0
  13. julee/api/routers/workflows.py +234 -0
  14. julee/api/services/__init__.py +20 -0
  15. julee/api/services/system_initialization.py +214 -0
  16. julee/api/tests/__init__.py +14 -0
  17. julee/api/tests/routers/__init__.py +17 -0
  18. julee/api/tests/routers/test_assembly_specifications.py +749 -0
  19. julee/api/tests/routers/test_documents.py +301 -0
  20. julee/api/tests/routers/test_knowledge_service_configs.py +234 -0
  21. julee/api/tests/routers/test_knowledge_service_queries.py +738 -0
  22. julee/api/tests/routers/test_system.py +179 -0
  23. julee/api/tests/routers/test_workflows.py +393 -0
  24. julee/api/tests/test_app.py +285 -0
  25. julee/api/tests/test_dependencies.py +245 -0
  26. julee/api/tests/test_requests.py +250 -0
  27. julee/domain/__init__.py +22 -0
  28. julee/domain/models/__init__.py +49 -0
  29. julee/domain/models/assembly/__init__.py +17 -0
  30. julee/domain/models/assembly/assembly.py +103 -0
  31. julee/domain/models/assembly/tests/__init__.py +0 -0
  32. julee/domain/models/assembly/tests/factories.py +37 -0
  33. julee/domain/models/assembly/tests/test_assembly.py +430 -0
  34. julee/domain/models/assembly_specification/__init__.py +24 -0
  35. julee/domain/models/assembly_specification/assembly_specification.py +172 -0
  36. julee/domain/models/assembly_specification/knowledge_service_query.py +123 -0
  37. julee/domain/models/assembly_specification/tests/__init__.py +0 -0
  38. julee/domain/models/assembly_specification/tests/factories.py +78 -0
  39. julee/domain/models/assembly_specification/tests/test_assembly_specification.py +490 -0
  40. julee/domain/models/assembly_specification/tests/test_knowledge_service_query.py +310 -0
  41. julee/domain/models/custom_fields/__init__.py +0 -0
  42. julee/domain/models/custom_fields/content_stream.py +68 -0
  43. julee/domain/models/custom_fields/tests/__init__.py +0 -0
  44. julee/domain/models/custom_fields/tests/test_custom_fields.py +53 -0
  45. julee/domain/models/document/__init__.py +17 -0
  46. julee/domain/models/document/document.py +150 -0
  47. julee/domain/models/document/tests/__init__.py +0 -0
  48. julee/domain/models/document/tests/factories.py +76 -0
  49. julee/domain/models/document/tests/test_document.py +297 -0
  50. julee/domain/models/knowledge_service_config/__init__.py +17 -0
  51. julee/domain/models/knowledge_service_config/knowledge_service_config.py +86 -0
  52. julee/domain/models/policy/__init__.py +15 -0
  53. julee/domain/models/policy/document_policy_validation.py +220 -0
  54. julee/domain/models/policy/policy.py +203 -0
  55. julee/domain/models/policy/tests/__init__.py +0 -0
  56. julee/domain/models/policy/tests/factories.py +47 -0
  57. julee/domain/models/policy/tests/test_document_policy_validation.py +420 -0
  58. julee/domain/models/policy/tests/test_policy.py +546 -0
  59. julee/domain/repositories/__init__.py +27 -0
  60. julee/domain/repositories/assembly.py +45 -0
  61. julee/domain/repositories/assembly_specification.py +52 -0
  62. julee/domain/repositories/base.py +146 -0
  63. julee/domain/repositories/document.py +49 -0
  64. julee/domain/repositories/document_policy_validation.py +52 -0
  65. julee/domain/repositories/knowledge_service_config.py +54 -0
  66. julee/domain/repositories/knowledge_service_query.py +44 -0
  67. julee/domain/repositories/policy.py +49 -0
  68. julee/domain/use_cases/__init__.py +17 -0
  69. julee/domain/use_cases/decorators.py +107 -0
  70. julee/domain/use_cases/extract_assemble_data.py +649 -0
  71. julee/domain/use_cases/initialize_system_data.py +842 -0
  72. julee/domain/use_cases/tests/__init__.py +7 -0
  73. julee/domain/use_cases/tests/test_extract_assemble_data.py +548 -0
  74. julee/domain/use_cases/tests/test_initialize_system_data.py +455 -0
  75. julee/domain/use_cases/tests/test_validate_document.py +1228 -0
  76. julee/domain/use_cases/validate_document.py +736 -0
  77. julee/fixtures/assembly_specifications.yaml +70 -0
  78. julee/fixtures/documents.yaml +178 -0
  79. julee/fixtures/knowledge_service_configs.yaml +37 -0
  80. julee/fixtures/knowledge_service_queries.yaml +27 -0
  81. julee/repositories/__init__.py +17 -0
  82. julee/repositories/memory/__init__.py +31 -0
  83. julee/repositories/memory/assembly.py +84 -0
  84. julee/repositories/memory/assembly_specification.py +125 -0
  85. julee/repositories/memory/base.py +227 -0
  86. julee/repositories/memory/document.py +149 -0
  87. julee/repositories/memory/document_policy_validation.py +104 -0
  88. julee/repositories/memory/knowledge_service_config.py +123 -0
  89. julee/repositories/memory/knowledge_service_query.py +120 -0
  90. julee/repositories/memory/policy.py +87 -0
  91. julee/repositories/memory/tests/__init__.py +0 -0
  92. julee/repositories/memory/tests/test_document.py +212 -0
  93. julee/repositories/memory/tests/test_document_policy_validation.py +161 -0
  94. julee/repositories/memory/tests/test_policy.py +443 -0
  95. julee/repositories/minio/__init__.py +31 -0
  96. julee/repositories/minio/assembly.py +103 -0
  97. julee/repositories/minio/assembly_specification.py +170 -0
  98. julee/repositories/minio/client.py +570 -0
  99. julee/repositories/minio/document.py +530 -0
  100. julee/repositories/minio/document_policy_validation.py +120 -0
  101. julee/repositories/minio/knowledge_service_config.py +187 -0
  102. julee/repositories/minio/knowledge_service_query.py +211 -0
  103. julee/repositories/minio/policy.py +106 -0
  104. julee/repositories/minio/tests/__init__.py +0 -0
  105. julee/repositories/minio/tests/fake_client.py +213 -0
  106. julee/repositories/minio/tests/test_assembly.py +374 -0
  107. julee/repositories/minio/tests/test_assembly_specification.py +391 -0
  108. julee/repositories/minio/tests/test_client_protocol.py +57 -0
  109. julee/repositories/minio/tests/test_document.py +591 -0
  110. julee/repositories/minio/tests/test_document_policy_validation.py +192 -0
  111. julee/repositories/minio/tests/test_knowledge_service_config.py +374 -0
  112. julee/repositories/minio/tests/test_knowledge_service_query.py +438 -0
  113. julee/repositories/minio/tests/test_policy.py +559 -0
  114. julee/repositories/temporal/__init__.py +38 -0
  115. julee/repositories/temporal/activities.py +114 -0
  116. julee/repositories/temporal/activity_names.py +34 -0
  117. julee/repositories/temporal/proxies.py +159 -0
  118. julee/services/__init__.py +18 -0
  119. julee/services/knowledge_service/__init__.py +48 -0
  120. julee/services/knowledge_service/anthropic/__init__.py +12 -0
  121. julee/services/knowledge_service/anthropic/knowledge_service.py +331 -0
  122. julee/services/knowledge_service/anthropic/tests/test_knowledge_service.py +318 -0
  123. julee/services/knowledge_service/factory.py +138 -0
  124. julee/services/knowledge_service/knowledge_service.py +160 -0
  125. julee/services/knowledge_service/memory/__init__.py +13 -0
  126. julee/services/knowledge_service/memory/knowledge_service.py +278 -0
  127. julee/services/knowledge_service/memory/test_knowledge_service.py +345 -0
  128. julee/services/knowledge_service/test_factory.py +112 -0
  129. julee/services/temporal/__init__.py +38 -0
  130. julee/services/temporal/activities.py +86 -0
  131. julee/services/temporal/activity_names.py +22 -0
  132. julee/services/temporal/proxies.py +41 -0
  133. julee/util/__init__.py +0 -0
  134. julee/util/domain.py +119 -0
  135. julee/util/repos/__init__.py +0 -0
  136. julee/util/repos/minio/__init__.py +0 -0
  137. julee/util/repos/minio/file_storage.py +213 -0
  138. julee/util/repos/temporal/__init__.py +11 -0
  139. julee/util/repos/temporal/client_proxies/file_storage.py +68 -0
  140. julee/util/repos/temporal/data_converter.py +123 -0
  141. julee/util/repos/temporal/minio_file_storage.py +12 -0
  142. julee/util/repos/temporal/proxies/__init__.py +0 -0
  143. julee/util/repos/temporal/proxies/file_storage.py +58 -0
  144. julee/util/repositories.py +55 -0
  145. julee/util/temporal/__init__.py +22 -0
  146. julee/util/temporal/activities.py +123 -0
  147. julee/util/temporal/decorators.py +473 -0
  148. julee/util/tests/__init__.py +1 -0
  149. julee/util/tests/test_decorators.py +770 -0
  150. julee/util/validation/__init__.py +29 -0
  151. julee/util/validation/repository.py +100 -0
  152. julee/util/validation/type_guards.py +369 -0
  153. julee/worker.py +211 -0
  154. julee/workflows/__init__.py +26 -0
  155. julee/workflows/extract_assemble.py +215 -0
  156. julee/workflows/validate_document.py +228 -0
  157. julee-0.1.0.dist-info/METADATA +195 -0
  158. julee-0.1.0.dist-info/RECORD +161 -0
  159. julee-0.1.0.dist-info/WHEEL +5 -0
  160. julee-0.1.0.dist-info/licenses/LICENSE +674 -0
  161. julee-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,310 @@
1
+ """
2
+ Comprehensive tests for KnowledgeServiceQuery domain model.
3
+
4
+ This test module documents the design decisions made for the
5
+ KnowledgeServiceQuery domain model using table-based tests. It covers:
6
+
7
+ - KnowledgeServiceQuery instantiation with various field combinations
8
+ - JSON Pointer validation for schema_pointer field
9
+ - JSON serialization behavior
10
+ - Field validation for required fields
11
+
12
+ Design decisions documented:
13
+ - KnowledgeServiceQuery must have all required fields (query_id, name,
14
+ knowledge_service_id, prompt, schema_pointer)
15
+ - Schema pointer must be a valid JSON Pointer (RFC 6901)
16
+ - All text fields must be non-empty and non-whitespace
17
+ - Version field has a default but can be customized
18
+ - Status defaults to ACTIVE
19
+ """
20
+
21
+ import pytest
22
+
23
+ from julee.domain.models.assembly_specification import (
24
+ KnowledgeServiceQuery,
25
+ )
26
+ from .factories import KnowledgeServiceQueryFactory
27
+
28
+
29
class TestKnowledgeServiceQueryInstantiation:
    """Test KnowledgeServiceQuery creation with various field combinations.

    Each parametrized row supplies the four text fields passed to the
    constructor plus a flag stating whether construction should succeed.
    """

    @pytest.mark.parametrize(
        "query_id,name,knowledge_service_id,prompt,expected_success",
        [
            # Valid cases
            (
                "query-1",
                "Extract Attendees",
                "ragflow-service-1",
                "Extract list of meeting attendees with their roles",
                True,
            ),
            (
                "query-2",
                "Extract Action Items",
                "knowledge-service-2",
                "Extract action items with assignees and due dates",
                True,
            ),
            (
                "query-3",
                "Extract Meeting Metadata",
                "service-alpha",
                "Extract basic meeting information like title, date, time",
                True,
            ),
            # Invalid cases - empty required fields
            (
                "",
                "Test Query",
                "service-1",
                "Test prompt",
                False,
            ),  # Empty query_id
            (
                "query-4",
                "",
                "service-1",
                "Test prompt",
                False,
            ),  # Empty name
            (
                "query-5",
                "Test Query",
                "",
                "Test prompt",
                False,
            ),  # Empty knowledge_service_id
            (
                "query-6",
                "Test Query",
                "service-1",
                "",
                False,
            ),  # Empty prompt
            # Invalid cases - whitespace only
            (
                " ",
                "Test Query",
                "service-1",
                "Test prompt",
                False,
            ),  # Whitespace query_id
            (
                "query-8",
                " ",
                "service-1",
                "Test prompt",
                False,
            ),  # Whitespace name
            (
                "query-9",
                "Test Query",
                " ",
                "Test prompt",
                False,
            ),  # Whitespace knowledge_service_id
            (
                "query-10",
                "Test Query",
                "service-1",
                " ",
                False,
            ),  # Whitespace prompt
        ],
    )
    def test_knowledge_service_query_creation_validation(
        self,
        query_id: str,
        name: str,
        knowledge_service_id: str,
        prompt: str,
        expected_success: bool,
    ) -> None:
        """Test query creation with various field validation scenarios.

        Valid rows must construct and expose the stripped field values;
        invalid rows must be rejected with a validation error.
        """
        if expected_success:
            # Should create successfully
            query = KnowledgeServiceQuery(
                query_id=query_id,
                name=name,
                knowledge_service_id=knowledge_service_id,
                prompt=prompt,
            )
            assert query.query_id == query_id.strip()
            assert query.name == name.strip()
            assert query.knowledge_service_id == knowledge_service_id.strip()
            assert query.prompt == prompt.strip()
        else:
            # Pydantic's ValidationError subclasses ValueError, so this
            # covers both a plain ValueError and a ValidationError while
            # no longer letting unrelated failures (TypeError,
            # AttributeError, ...) pass the test by accident.
            with pytest.raises(ValueError):
                KnowledgeServiceQuery(
                    query_id=query_id,
                    name=name,
                    knowledge_service_id=knowledge_service_id,
                    prompt=prompt,
                )
147
+
148
+
149
class TestKnowledgeServiceQuerySerialization:
    """Check how KnowledgeServiceQuery behaves when dumped to JSON."""

    def test_knowledge_service_query_json_serialization(self) -> None:
        """The dumped JSON carries the same core field values as the model."""
        import json

        query = KnowledgeServiceQueryFactory.build(
            query_id="attendee-extractor",
            name="Meeting Attendee Extractor",
            knowledge_service_id="ragflow-primary",
            prompt="Extract meeting attendees with names, roles",
        )

        payload = json.loads(query.model_dump_json())

        # Every core field must be present and equal to the model attribute
        for field_name in ("query_id", "name", "knowledge_service_id", "prompt"):
            assert payload[field_name] == getattr(query, field_name)

    def test_knowledge_service_query_json_roundtrip(self) -> None:
        """A query dumped to JSON can be rebuilt with the same core fields."""
        import json

        original_query = KnowledgeServiceQueryFactory.build()

        # Dump to JSON text, parse it, and feed the result to the constructor
        payload = json.loads(original_query.model_dump_json())
        reconstructed_query = KnowledgeServiceQuery(**payload)

        # The rebuilt instance must agree with the source on each core field
        for field_name in ("query_id", "name", "knowledge_service_id", "prompt"):
            rebuilt_value = getattr(reconstructed_query, field_name)
            source_value = getattr(original_query, field_name)
            assert rebuilt_value == source_value
194
+
195
+
196
class TestKnowledgeServiceQueryDefaults:
    """Check the values a KnowledgeServiceQuery ends up with after creation."""

    def test_knowledge_service_query_default_values(self) -> None:
        """Timestamps are populated when only the required fields are given."""
        required_fields = {
            "query_id": "test-id",
            "name": "Test Query",
            "knowledge_service_id": "test-service",
            "prompt": "Test prompt",
        }
        minimal_query = KnowledgeServiceQuery(**required_fields)

        for timestamp in (minimal_query.created_at, minimal_query.updated_at):
            assert timestamp is not None

    def test_knowledge_service_query_custom_values(self) -> None:
        """Explicitly supplied identifiers are kept on the instance."""
        supplied_fields = {
            "query_id": "custom-id",
            "name": "Custom Query",
            "knowledge_service_id": "custom-service",
            "prompt": "Custom prompt",
        }
        custom_query = KnowledgeServiceQuery(**supplied_fields)

        assert custom_query.query_id == "custom-id"
        assert custom_query.name == "Custom Query"
222
+
223
+
224
class TestKnowledgeServiceQueryMetadata:
    """Exercise the query_metadata field of KnowledgeServiceQuery."""

    def test_query_metadata_defaults_to_empty_dict(self) -> None:
        """Omitting query_metadata yields an empty dict."""
        required_fields = {
            "query_id": "test-id",
            "name": "Test Query",
            "knowledge_service_id": "test-service",
            "prompt": "Test prompt",
        }
        query = KnowledgeServiceQuery(**required_fields)

        assert query.query_metadata == {}

    def test_query_metadata_accepts_custom_values(self) -> None:
        """A caller-supplied metadata dict is stored unchanged."""
        metadata = {
            "model": "claude-sonnet-4-20250514",
            "max_tokens": 4000,
            "temperature": 0.1,
        }

        query = KnowledgeServiceQuery(
            query_id="test-id",
            name="Test Query",
            knowledge_service_id="anthropic-service",
            prompt="Test prompt",
            query_metadata=metadata,
        )

        stored = query.query_metadata
        assert stored == metadata
        assert stored["model"] == "claude-sonnet-4-20250514"
        assert stored["max_tokens"] == 4000
        assert stored["temperature"] == 0.1

    def test_query_metadata_serialization(self) -> None:
        """Nested metadata appears intact in the dumped JSON."""
        import json

        metadata = {
            "model": "gpt-4",
            "temperature": 0.2,
            "top_p": 0.9,
            "custom_config": {"endpoint": "v2", "retries": 3},
        }

        query = KnowledgeServiceQuery(
            query_id="openai-query",
            name="OpenAI Query",
            knowledge_service_id="openai-service",
            prompt="Test prompt for OpenAI",
            query_metadata=metadata,
        )

        dumped_metadata = json.loads(query.model_dump_json())["query_metadata"]

        assert dumped_metadata == metadata
        assert dumped_metadata["model"] == "gpt-4"
        assert dumped_metadata["custom_config"]["endpoint"] == "v2"

    def test_query_metadata_roundtrip_serialization(self) -> None:
        """Metadata is unchanged after dumping to JSON and rebuilding."""
        import json

        metadata = {
            "model": "claude-sonnet-4-20250514",
            "max_tokens": 2000,
            "temperature": 0.0,
            "citations": True,
        }

        original = KnowledgeServiceQuery(
            query_id="roundtrip-test",
            name="Roundtrip Test",
            knowledge_service_id="test-service",
            prompt="Test roundtrip serialization",
            query_metadata=metadata,
        )

        # Dump to JSON text, parse it, and construct a new instance from it
        parsed = json.loads(original.model_dump_json())
        reconstructed = KnowledgeServiceQuery(**parsed)

        assert reconstructed.query_metadata == original.query_metadata
        assert reconstructed.query_metadata == metadata
File without changes
@@ -0,0 +1,68 @@
1
+ """
2
+ Custom Pydantic field types for the CEAP workflow domain.
3
+
4
+ This module contains custom field types that provide proper Pydantic
5
+ validation
6
+ for specialized data types used in the document processing workflow.
7
+ """
8
+
9
+ from pydantic import GetCoreSchemaHandler
10
+ from pydantic_core import core_schema
11
+ from typing import Any
12
+ import io
13
+
14
+
15
class ContentStream:
    """Wrapper for IO streams that provides proper Pydantic validation.

    This class wraps io.IOBase instances to provide proper Pydantic validation
    without requiring arbitrary_types_allowed. It ensures that only valid
    stream objects are accepted while providing a clean interface for
    stream operations (read, seek, tell).
    """

    def __init__(self, stream: io.IOBase):
        """Wrap ``stream``.

        Raises:
            ValueError: if ``stream`` is not an ``io.IOBase`` instance.
        """
        if not isinstance(stream, io.IOBase):
            raise ValueError(
                f"ContentStream requires an io.IOBase instance, got {type(stream)}"
            )
        self._stream = stream

    def read(self, size: int = -1) -> bytes:
        """Read from the underlying stream and always return ``bytes``.

        Text read from a text stream (e.g. StringIO) is encoded as UTF-8.
        Other bytes-like results (``bytearray``, ``memoryview``) are copied
        into ``bytes`` rather than being discarded.
        """
        result = self._stream.read(size)
        if isinstance(result, bytes):
            return result
        if isinstance(result, str):
            # Handle case where stream returns str (like StringIO)
            return result.encode("utf-8")
        if isinstance(result, (bytearray, memoryview)):
            # Bytes-like data must be preserved, not replaced by b""
            return bytes(result)
        # Fallback for anything else (for example a None result)
        return b""

    def seek(self, offset: int, whence: int = 0) -> int:
        """Seek in the underlying stream and return the new position."""
        return self._stream.seek(offset, whence)

    def tell(self) -> int:
        """Get current position in stream."""
        return self._stream.tell()

    @property
    def stream(self) -> io.IOBase:
        """Access the underlying stream."""
        return self._stream

    @classmethod
    def __get_pydantic_core_schema__(
        cls, source_type: type, handler: "GetCoreSchemaHandler"
    ) -> "core_schema.CoreSchema":
        """Define how Pydantic should validate this type.

        Validation is delegated entirely to ``_validate``.
        """
        return core_schema.no_info_plain_validator_function(cls._validate)

    @classmethod
    def _validate(cls, value: Any) -> "ContentStream":
        """Validate input and convert to ContentStream.

        An existing ContentStream is returned as-is; an ``io.IOBase``
        instance is wrapped.

        Raises:
            ValueError: for any other input type.
        """
        if isinstance(value, cls):
            return value
        if isinstance(value, io.IOBase):
            return cls(value)
        raise ValueError(f"ContentStream expects io.IOBase, got {type(value)}")
File without changes
@@ -0,0 +1,53 @@
1
+ """
2
+ Tests for custom Pydantic field types.
3
+
4
+ This test module validates the behavior of custom field types used in
5
+ the domain models, particularly focusing on ContentStream which wraps
6
+ io.IOBase instances for proper Pydantic validation.
7
+
8
+ Design decisions documented:
9
+ - ContentStream accepts any io.IOBase instance
10
+ - ContentStream provides read, seek, tell interface
11
+ - ContentStream validates input types at creation
12
+ - ContentStream works with Pydantic validation without arbitrary_types_allowed
13
+ """
14
+
15
+ import pytest
16
+ import io
17
+ from typing import Any
18
+
19
+ from julee.domain.models.custom_fields.content_stream import (
20
+ ContentStream,
21
+ )
22
+
23
+
24
@pytest.mark.parametrize(
    "stream_input,error_message",
    [
        # Accepted: anything that is an io.IOBase instance
        (io.BytesIO(b"test content"), None),
        (io.StringIO("text content"), None),
        (io.BufferedReader(io.BytesIO(b"buffered")), None),
        (ContentStream(io.BytesIO(b"nested content")).stream, None),
        # Rejected: values that are not io.IOBase instances
        *[
            (rejected, "ContentStream requires an io.IOBase instance")
            for rejected in (b"raw bytes", "string", 123, None, [], {})
        ],
    ],
)
def test_content_stream_validation(
    stream_input: Any, error_message: str | None
) -> None:
    """Check which constructor inputs ContentStream accepts and rejects.

    A row with no error message must wrap the given stream object itself;
    a row with a message must raise ValueError matching that message.
    """
    if error_message is not None:
        # Rejected input: construction must fail with the expected message
        with pytest.raises(ValueError, match=error_message):
            ContentStream(stream_input)
        return

    # Accepted input: the wrapper exposes the very same stream object
    wrapped = ContentStream(stream_input)
    assert wrapped.stream is stream_input
@@ -0,0 +1,17 @@
1
+ """
2
+ Document domain package for the Capture, Extract, Assemble, Publish workflow.
3
+
4
+ This package contains the Document domain object and its related functionality
5
+ for the CEAP workflow system.
6
+
7
+ Document represents complete document entities including content and metadata,
8
+ providing a stream-like interface for efficient handling of both small and
9
+ large documents.
10
+ """
11
+
12
+ from .document import Document, DocumentStatus
13
+
14
+ __all__ = [
15
+ "Document",
16
+ "DocumentStatus",
17
+ ]
@@ -0,0 +1,150 @@
1
+ """
2
+ Document domain models for the Capture, Extract, Assemble, Publish workflow.
3
+
4
+ This module contains the core document domain objects that represent
5
+ documents and their metadata in the CEAP workflow system.
6
+
7
+ All domain models use Pydantic BaseModel for validation, serialization,
8
+ and type safety, following the patterns established in the sample project.
9
+ """
10
+
11
+ from pydantic import BaseModel, Field, field_validator, model_validator
12
+ from pydantic import ValidationInfo
13
+ from typing import Callable, Optional, List, Dict, Any
14
+ from datetime import datetime, timezone
15
+ from enum import Enum
16
+ from julee.domain.models.custom_fields.content_stream import (
17
+ ContentStream,
18
+ )
19
+
20
+
21
def delegate_to_content(*method_names: str) -> Callable[[type], type]:
    """Class decorator that delegates IO methods to the ``content`` attribute.

    For every name in ``method_names`` a method of that name is set on the
    decorated class. Calling it forwards all positional and keyword
    arguments to the same-named method of ``self.content`` and returns the
    result.

    Args:
        *method_names: Names of the methods to forward (e.g. "read").

    Returns:
        A decorator that installs the methods and returns the same class.

    Raises:
        AttributeError: (from a delegated method, at call time) when
            ``self.content`` is None or lacks the requested method.
    """

    def make_delegated_method(owner: type, name: str) -> Callable[..., Any]:
        # Built through a factory so each method captures its own ``name``
        # instead of the loop variable (late-binding closure pitfall).
        def delegated_method(self: Any, *args: Any, **kwargs: Any) -> Any:
            target = self.content
            if target is None:
                # Same exception type a lookup on None would raise, but
                # with a message that states the actual problem.
                raise AttributeError(
                    f"{type(self).__name__} has no content stream; "
                    f"cannot delegate {name!r}"
                )
            return getattr(target, name)(*args, **kwargs)

        delegated_method.__name__ = name
        # Without this the qualified name would expose the closure path.
        delegated_method.__qualname__ = f"{owner.__qualname__}.{name}"
        delegated_method.__doc__ = f"Delegate {name} to content stream."
        return delegated_method

    def decorator(cls: type) -> type:
        for method_name in method_names:
            setattr(cls, method_name, make_delegated_method(cls, method_name))
        return cls

    return decorator
39
+
40
+
41
class DocumentStatus(str, Enum):
    """Processing status of a document in the Capture, Extract, Assemble,
    Publish pipeline.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    CAPTURED = "captured"

    # Registered with knowledge service
    REGISTERED = "registered"

    # Assembly specification types determined
    ASSEMBLY_SPECIFICATION_IDENTIFIED = "assembly_specification_identified"

    # Extractions completed
    EXTRACTED = "extracted"

    # Template rendered and policies applied
    ASSEMBLED = "assembled"

    PUBLISHED = "published"
    FAILED = "failed"
53
+
54
+
55
@delegate_to_content("read", "seek", "tell")
class Document(BaseModel):
    """A document together with its metadata and (optionally) its content.

    Content is supplied either as a ContentStream in ``content`` or as a
    small string in ``content_string``. ``read``, ``seek`` and ``tell`` are
    forwarded to ``content`` by the class decorator.

    The ``content`` field is declared with ``exclude=True`` so it is left
    out of serialized output; binary data should be served through separate
    content endpoints.
    """

    # --- Identification ---------------------------------------------------
    document_id: str
    original_filename: str
    content_type: str
    size_bytes: int = Field(gt=0, description="Size must be positive")
    content_multihash: str = Field(
        description="Multihash of document content for integrity verification"
    )

    # --- Processing state -------------------------------------------------
    status: DocumentStatus = DocumentStatus.CAPTURED
    knowledge_service_id: Optional[str] = None
    assembly_types: List[str] = Field(default_factory=list)

    # --- Timestamps (default: current time in UTC) -------------------------
    created_at: Optional[datetime] = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )
    updated_at: Optional[datetime] = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )

    # --- Extra metadata and content ----------------------------------------
    additional_metadata: Dict[str, Any] = Field(default_factory=dict)
    content: Optional[ContentStream] = Field(default=None, exclude=True)
    content_string: Optional[str] = Field(
        default=None,
        description="Small content as string (few KB max). Use for "
        "workflow-generated content to avoid ContentStream serialization "
        "issues. For larger content, ensure calling from concrete "
        "implementations (ie. outside workflows and use-cases) and use "
        "content field instead.",
    )

    @field_validator("document_id")
    @classmethod
    def document_id_must_not_be_empty(cls, v: str) -> str:
        """Reject an empty or whitespace-only ID; return it stripped."""
        stripped = v.strip() if v else ""
        if not stripped:
            raise ValueError("Document ID cannot be empty")
        return stripped

    @field_validator("original_filename")
    @classmethod
    def filename_must_not_be_empty(cls, v: str) -> str:
        """Reject an empty or whitespace-only filename; return it stripped."""
        stripped = v.strip() if v else ""
        if not stripped:
            raise ValueError("Original filename cannot be empty")
        return stripped

    @field_validator("content_type")
    @classmethod
    def content_type_must_not_be_empty(cls, v: str) -> str:
        """Reject an empty or whitespace-only content type; return it
        stripped."""
        stripped = v.strip() if v else ""
        if not stripped:
            raise ValueError("Content type cannot be empty")
        return stripped

    @field_validator("content_multihash")
    @classmethod
    def content_multihash_must_not_be_empty(cls, v: str) -> str:
        """Reject an empty or whitespace-only multihash; return it stripped.

        The hash is not checked against the content here.
        """
        # TODO: actually validate the multihash against the content?
        stripped = v.strip() if v else ""
        if not stripped:
            raise ValueError("Content multihash cannot be empty")
        return stripped

    @model_validator(mode="after")
    def validate_content_fields(self, info: ValidationInfo) -> "Document":
        """Require exactly one of ``content`` and ``content_string``.

        The check is skipped when the validation context carries a truthy
        ``temporal_validation`` entry (Temporal deserialization).
        """
        context = info.context
        if context and context.get("temporal_validation"):
            return self

        # Direct instantiation: exactly one content source must be set
        provided = (self.content is not None, self.content_string is not None)

        if all(provided):
            raise ValueError(
                "Document cannot have both content and content_string. "
                "Provide only one."
            )
        if not any(provided):
            raise ValueError(
                "Document must have either content or content_string. Provide one."
            )
        return self
File without changes
@@ -0,0 +1,76 @@
1
+ """
2
+ Test factories for Document domain objects using factory_boy.
3
+
4
+ This module provides factory_boy factories for creating test instances of
5
+ Document domain objects with sensible defaults.
6
+ """
7
+
8
+ import io
9
+ from datetime import datetime, timezone
10
+ from typing import Any
11
+ from factory.base import Factory
12
+ from factory.faker import Faker
13
+ from factory.declarations import LazyAttribute, LazyFunction
14
+
15
+ from julee.domain.models.document import Document, DocumentStatus
16
+ from julee.domain.models.custom_fields.content_stream import (
17
+ ContentStream,
18
+ )
19
+
20
+
21
+ # Helper functions to generate content bytes consistently
22
+ def _get_default_content_bytes() -> bytes:
23
+ """Generate the default content bytes for documents."""
24
+ return b"Test document content for testing purposes"
25
+
26
+
27
class ContentStreamFactory(Factory):
    """Factory producing ContentStream objects backed by an in-memory
    BytesIO.

    Pass ``content=<bytes>`` to choose the payload; otherwise a fixed test
    payload is used.
    """

    class Meta:
        model = ContentStream

    @classmethod
    def _build(cls, model_class: type[ContentStream], **kwargs: Any) -> ContentStream:
        # Wrap the requested (or default) bytes in a fresh BytesIO
        payload = kwargs["content"] if "content" in kwargs else b"Test stream content"
        return model_class(io.BytesIO(payload))

    @classmethod
    def _create(cls, model_class: type[ContentStream], **kwargs: Any) -> ContentStream:
        # Creating performs exactly the same steps as building
        return cls._build(model_class, **kwargs)
41
+
42
+
43
class DocumentFactory(Factory):
    """Factory for creating Document instances with sensible test defaults.

    Every field can be overridden by keyword when calling the factory.
    ``size_bytes`` defaults to the length of the default content bytes, and
    ``content`` defaults to a new ContentStream over those bytes.
    """

    class Meta:
        model = Document

    # Core document identification
    document_id = Faker("uuid4")
    original_filename = "test_document.txt"
    content_type = "text/plain"
    content_multihash = Faker("sha256")

    # Document processing state
    status = DocumentStatus.CAPTURED
    knowledge_service_id = None
    # A fresh list per instance: a literal [] here would be one object
    # handed to every Document the factory builds.
    assembly_types = LazyFunction(list)

    # Timestamps
    created_at = LazyFunction(lambda: datetime.now(timezone.utc))
    updated_at = LazyFunction(lambda: datetime.now(timezone.utc))

    # Additional data - a fresh dict per instance, for the same reason
    additional_metadata = LazyFunction(dict)

    # Content - using LazyAttribute to create fresh BytesIO for each instance
    @LazyAttribute
    def size_bytes(self) -> int:
        # Calculate size from the default content
        return len(_get_default_content_bytes())

    @LazyAttribute
    def content(self) -> ContentStream:
        # Create ContentStream with default content
        return ContentStream(io.BytesIO(_get_default_content_bytes()))