julee 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- julee/__init__.py +3 -0
- julee/api/__init__.py +20 -0
- julee/api/app.py +180 -0
- julee/api/dependencies.py +257 -0
- julee/api/requests.py +175 -0
- julee/api/responses.py +43 -0
- julee/api/routers/__init__.py +43 -0
- julee/api/routers/assembly_specifications.py +212 -0
- julee/api/routers/documents.py +182 -0
- julee/api/routers/knowledge_service_configs.py +79 -0
- julee/api/routers/knowledge_service_queries.py +293 -0
- julee/api/routers/system.py +137 -0
- julee/api/routers/workflows.py +234 -0
- julee/api/services/__init__.py +20 -0
- julee/api/services/system_initialization.py +214 -0
- julee/api/tests/__init__.py +14 -0
- julee/api/tests/routers/__init__.py +17 -0
- julee/api/tests/routers/test_assembly_specifications.py +749 -0
- julee/api/tests/routers/test_documents.py +301 -0
- julee/api/tests/routers/test_knowledge_service_configs.py +234 -0
- julee/api/tests/routers/test_knowledge_service_queries.py +738 -0
- julee/api/tests/routers/test_system.py +179 -0
- julee/api/tests/routers/test_workflows.py +393 -0
- julee/api/tests/test_app.py +285 -0
- julee/api/tests/test_dependencies.py +245 -0
- julee/api/tests/test_requests.py +250 -0
- julee/domain/__init__.py +22 -0
- julee/domain/models/__init__.py +49 -0
- julee/domain/models/assembly/__init__.py +17 -0
- julee/domain/models/assembly/assembly.py +103 -0
- julee/domain/models/assembly/tests/__init__.py +0 -0
- julee/domain/models/assembly/tests/factories.py +37 -0
- julee/domain/models/assembly/tests/test_assembly.py +430 -0
- julee/domain/models/assembly_specification/__init__.py +24 -0
- julee/domain/models/assembly_specification/assembly_specification.py +172 -0
- julee/domain/models/assembly_specification/knowledge_service_query.py +123 -0
- julee/domain/models/assembly_specification/tests/__init__.py +0 -0
- julee/domain/models/assembly_specification/tests/factories.py +78 -0
- julee/domain/models/assembly_specification/tests/test_assembly_specification.py +490 -0
- julee/domain/models/assembly_specification/tests/test_knowledge_service_query.py +310 -0
- julee/domain/models/custom_fields/__init__.py +0 -0
- julee/domain/models/custom_fields/content_stream.py +68 -0
- julee/domain/models/custom_fields/tests/__init__.py +0 -0
- julee/domain/models/custom_fields/tests/test_custom_fields.py +53 -0
- julee/domain/models/document/__init__.py +17 -0
- julee/domain/models/document/document.py +150 -0
- julee/domain/models/document/tests/__init__.py +0 -0
- julee/domain/models/document/tests/factories.py +76 -0
- julee/domain/models/document/tests/test_document.py +297 -0
- julee/domain/models/knowledge_service_config/__init__.py +17 -0
- julee/domain/models/knowledge_service_config/knowledge_service_config.py +86 -0
- julee/domain/models/policy/__init__.py +15 -0
- julee/domain/models/policy/document_policy_validation.py +220 -0
- julee/domain/models/policy/policy.py +203 -0
- julee/domain/models/policy/tests/__init__.py +0 -0
- julee/domain/models/policy/tests/factories.py +47 -0
- julee/domain/models/policy/tests/test_document_policy_validation.py +420 -0
- julee/domain/models/policy/tests/test_policy.py +546 -0
- julee/domain/repositories/__init__.py +27 -0
- julee/domain/repositories/assembly.py +45 -0
- julee/domain/repositories/assembly_specification.py +52 -0
- julee/domain/repositories/base.py +146 -0
- julee/domain/repositories/document.py +49 -0
- julee/domain/repositories/document_policy_validation.py +52 -0
- julee/domain/repositories/knowledge_service_config.py +54 -0
- julee/domain/repositories/knowledge_service_query.py +44 -0
- julee/domain/repositories/policy.py +49 -0
- julee/domain/use_cases/__init__.py +17 -0
- julee/domain/use_cases/decorators.py +107 -0
- julee/domain/use_cases/extract_assemble_data.py +649 -0
- julee/domain/use_cases/initialize_system_data.py +842 -0
- julee/domain/use_cases/tests/__init__.py +7 -0
- julee/domain/use_cases/tests/test_extract_assemble_data.py +548 -0
- julee/domain/use_cases/tests/test_initialize_system_data.py +455 -0
- julee/domain/use_cases/tests/test_validate_document.py +1228 -0
- julee/domain/use_cases/validate_document.py +736 -0
- julee/fixtures/assembly_specifications.yaml +70 -0
- julee/fixtures/documents.yaml +178 -0
- julee/fixtures/knowledge_service_configs.yaml +37 -0
- julee/fixtures/knowledge_service_queries.yaml +27 -0
- julee/repositories/__init__.py +17 -0
- julee/repositories/memory/__init__.py +31 -0
- julee/repositories/memory/assembly.py +84 -0
- julee/repositories/memory/assembly_specification.py +125 -0
- julee/repositories/memory/base.py +227 -0
- julee/repositories/memory/document.py +149 -0
- julee/repositories/memory/document_policy_validation.py +104 -0
- julee/repositories/memory/knowledge_service_config.py +123 -0
- julee/repositories/memory/knowledge_service_query.py +120 -0
- julee/repositories/memory/policy.py +87 -0
- julee/repositories/memory/tests/__init__.py +0 -0
- julee/repositories/memory/tests/test_document.py +212 -0
- julee/repositories/memory/tests/test_document_policy_validation.py +161 -0
- julee/repositories/memory/tests/test_policy.py +443 -0
- julee/repositories/minio/__init__.py +31 -0
- julee/repositories/minio/assembly.py +103 -0
- julee/repositories/minio/assembly_specification.py +170 -0
- julee/repositories/minio/client.py +570 -0
- julee/repositories/minio/document.py +530 -0
- julee/repositories/minio/document_policy_validation.py +120 -0
- julee/repositories/minio/knowledge_service_config.py +187 -0
- julee/repositories/minio/knowledge_service_query.py +211 -0
- julee/repositories/minio/policy.py +106 -0
- julee/repositories/minio/tests/__init__.py +0 -0
- julee/repositories/minio/tests/fake_client.py +213 -0
- julee/repositories/minio/tests/test_assembly.py +374 -0
- julee/repositories/minio/tests/test_assembly_specification.py +391 -0
- julee/repositories/minio/tests/test_client_protocol.py +57 -0
- julee/repositories/minio/tests/test_document.py +591 -0
- julee/repositories/minio/tests/test_document_policy_validation.py +192 -0
- julee/repositories/minio/tests/test_knowledge_service_config.py +374 -0
- julee/repositories/minio/tests/test_knowledge_service_query.py +438 -0
- julee/repositories/minio/tests/test_policy.py +559 -0
- julee/repositories/temporal/__init__.py +38 -0
- julee/repositories/temporal/activities.py +114 -0
- julee/repositories/temporal/activity_names.py +34 -0
- julee/repositories/temporal/proxies.py +159 -0
- julee/services/__init__.py +18 -0
- julee/services/knowledge_service/__init__.py +48 -0
- julee/services/knowledge_service/anthropic/__init__.py +12 -0
- julee/services/knowledge_service/anthropic/knowledge_service.py +331 -0
- julee/services/knowledge_service/anthropic/tests/test_knowledge_service.py +318 -0
- julee/services/knowledge_service/factory.py +138 -0
- julee/services/knowledge_service/knowledge_service.py +160 -0
- julee/services/knowledge_service/memory/__init__.py +13 -0
- julee/services/knowledge_service/memory/knowledge_service.py +278 -0
- julee/services/knowledge_service/memory/test_knowledge_service.py +345 -0
- julee/services/knowledge_service/test_factory.py +112 -0
- julee/services/temporal/__init__.py +38 -0
- julee/services/temporal/activities.py +86 -0
- julee/services/temporal/activity_names.py +22 -0
- julee/services/temporal/proxies.py +41 -0
- julee/util/__init__.py +0 -0
- julee/util/domain.py +119 -0
- julee/util/repos/__init__.py +0 -0
- julee/util/repos/minio/__init__.py +0 -0
- julee/util/repos/minio/file_storage.py +213 -0
- julee/util/repos/temporal/__init__.py +11 -0
- julee/util/repos/temporal/client_proxies/file_storage.py +68 -0
- julee/util/repos/temporal/data_converter.py +123 -0
- julee/util/repos/temporal/minio_file_storage.py +12 -0
- julee/util/repos/temporal/proxies/__init__.py +0 -0
- julee/util/repos/temporal/proxies/file_storage.py +58 -0
- julee/util/repositories.py +55 -0
- julee/util/temporal/__init__.py +22 -0
- julee/util/temporal/activities.py +123 -0
- julee/util/temporal/decorators.py +473 -0
- julee/util/tests/__init__.py +1 -0
- julee/util/tests/test_decorators.py +770 -0
- julee/util/validation/__init__.py +29 -0
- julee/util/validation/repository.py +100 -0
- julee/util/validation/type_guards.py +369 -0
- julee/worker.py +211 -0
- julee/workflows/__init__.py +26 -0
- julee/workflows/extract_assemble.py +215 -0
- julee/workflows/validate_document.py +228 -0
- julee-0.1.0.dist-info/METADATA +195 -0
- julee-0.1.0.dist-info/RECORD +161 -0
- julee-0.1.0.dist-info/WHEEL +5 -0
- julee-0.1.0.dist-info/licenses/LICENSE +674 -0
- julee-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,591 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Unit tests for MinioDocumentRepository.
|
|
3
|
+
|
|
4
|
+
These tests use a fake Minio client to test the repository implementation logic
|
|
5
|
+
without requiring a real MinIO instance. They follow the Clean Architecture
|
|
6
|
+
testing patterns and verify idempotency, error handling, and content.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import io
|
|
10
|
+
import pytest
|
|
11
|
+
import hashlib
|
|
12
|
+
import multihash
|
|
13
|
+
from typing import Any
|
|
14
|
+
from unittest.mock import Mock
|
|
15
|
+
from minio.error import S3Error
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
from julee.repositories.minio.document import MinioDocumentRepository
|
|
19
|
+
from julee.domain.models.document import Document, DocumentStatus
|
|
20
|
+
from julee.domain.models.custom_fields.content_stream import (
|
|
21
|
+
ContentStream,
|
|
22
|
+
)
|
|
23
|
+
from .fake_client import FakeMinioClient
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@pytest.fixture
def fake_minio_client() -> FakeMinioClient:
    """Return a fresh fake Minio client so each test starts from clean state."""
    client = FakeMinioClient()
    return client
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@pytest.fixture
def repository(fake_minio_client: FakeMinioClient) -> MinioDocumentRepository:
    """Return a document repository backed by the fake Minio client."""
    repo = MinioDocumentRepository(fake_minio_client)
    return repo
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@pytest.fixture
def sample_content() -> ContentStream:
    """Return a content stream wrapping a fixed test payload."""
    buffer = io.BytesIO(b"This is test content for document storage")
    return ContentStream(buffer)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@pytest.fixture
def sample_document(sample_content: ContentStream) -> Document:
    """Return a CAPTURED document whose multihash matches its content."""
    # Same payload as the stream passed in as ``sample_content``; the hash
    # and size below are derived from these bytes.
    payload = b"This is test content for document storage"
    digest = hashlib.sha256(payload).digest()
    encoded = multihash.encode(digest, multihash.SHA2_256)

    return Document(
        document_id="test-doc-123",
        original_filename="test.txt",
        content_type="text/plain",
        size_bytes=len(payload),
        content_multihash=str(encoded.hex()),
        status=DocumentStatus.CAPTURED,
        content=sample_content,
    )
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class TestMinioDocumentRepositoryInitialization:
    """Bucket setup performed when the repository is constructed."""

    def test_init_creates_buckets_when_missing(self) -> None:
        """Constructing the repository creates both buckets when absent."""
        client = FakeMinioClient()

        # Precondition: neither bucket exists yet.
        for bucket in ("documents", "documents-content"):
            assert not client.bucket_exists(bucket)

        # Constructing the repository is the action under test.
        MinioDocumentRepository(client)

        # Both buckets must now be present.
        for bucket in ("documents", "documents-content"):
            assert client.bucket_exists(bucket)

    def test_init_skips_existing_buckets(self) -> None:
        """Constructing the repository succeeds when the buckets exist."""
        client = FakeMinioClient()

        # Create both buckets up front.
        for bucket in ("documents", "documents-content"):
            client.make_bucket(bucket)

        # Must complete without raising.
        MinioDocumentRepository(client)

        # The pre-existing buckets are still there afterwards.
        for bucket in ("documents", "documents-content"):
            assert client.bucket_exists(bucket)

    def test_init_handles_bucket_creation_error(self) -> None:
        """An S3Error raised while creating a bucket propagates to the caller."""
        client = FakeMinioClient()

        # Only "documents" exists beforehand, so "documents-content" is the
        # bucket whose creation is made to fail below.
        client.make_bucket("documents")

        real_make_bucket = client.make_bucket

        def failing_make_bucket(bucket_name: str) -> None:
            # Delegate to the real implementation for every other bucket.
            if bucket_name != "documents-content":
                return real_make_bucket(bucket_name)
            raise S3Error(
                code="AccessDenied",
                message="Access denied",
                resource="AccessDenied",
                request_id="req123",
                host_id="host123",
                response=Mock(),
            )

        client.make_bucket = failing_make_bucket  # type: ignore[method-assign]

        with pytest.raises(S3Error):
            MinioDocumentRepository(client)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
class TestMinioDocumentRepositoryStore:
    """Saving documents: content, metadata, deduplication and failures."""

    async def test_store_new_document(
        self, fake_minio_client: FakeMinioClient, sample_document: Document
    ) -> None:
        """Saving a new document writes one content and one metadata object."""
        repo = MinioDocumentRepository(fake_minio_client)

        # Both buckets start out empty.
        for bucket in ("documents", "documents-content"):
            assert fake_minio_client.get_object_count(bucket) == 0

        await repo.save(sample_document)

        # Exactly one object landed in each bucket.
        for bucket in ("documents", "documents-content"):
            assert fake_minio_client.get_object_count(bucket) == 1

        # Content is keyed by the document's multihash.
        stored_content = fake_minio_client.get_stored_objects("documents-content")
        assert sample_document.content_multihash in stored_content

        # Metadata is keyed by the document ID.
        stored_metadata = fake_minio_client.get_stored_objects("documents")
        assert sample_document.document_id in stored_metadata

    async def test_store_document_with_existing_content_deduplication(
        self, fake_minio_client: FakeMinioClient, sample_document: Document
    ) -> None:
        """Two documents with identical content share one content object."""
        repo = MinioDocumentRepository(fake_minio_client)

        # First save establishes the content object.
        await repo.save(sample_document)
        shared_hash = sample_document.content_multihash

        # Copy the first document's bytes, rewinding before and after so the
        # original stream is left at its start.
        assert sample_document.content is not None
        sample_document.content.seek(0)
        raw = sample_document.content.read()
        sample_document.content.seek(0)

        # Same bytes and hash, different identity and filename.
        duplicate = Document(
            document_id="different-doc-456",
            original_filename="different.txt",
            content_type="text/plain",
            size_bytes=len(raw),
            content_multihash=shared_hash,
            status=DocumentStatus.CAPTURED,
            content=ContentStream(io.BytesIO(raw)),
        )

        # Second save should reuse the existing content object.
        await repo.save(duplicate)

        # Two metadata records, a single content object.
        assert fake_minio_client.get_object_count("documents") == 2
        assert fake_minio_client.get_object_count("documents-content") == 1

        # The one content object is stored under the shared hash.
        stored_content = fake_minio_client.get_stored_objects("documents-content")
        assert len(stored_content) == 1
        assert shared_hash in stored_content

        # Both documents point at that same hash.
        assert sample_document.content_multihash == shared_hash
        assert duplicate.content_multihash == shared_hash

    async def test_store_updates_multihash_when_different(
        self, fake_minio_client: FakeMinioClient, sample_document: Document
    ) -> None:
        """Saving replaces a wrong multihash with the calculated one."""
        repo = MinioDocumentRepository(fake_minio_client)

        # Remember the right hash, then corrupt the document's copy.
        expected_hash = sample_document.content_multihash
        sample_document.content_multihash = "incorrect_hash_12345"

        await repo.save(sample_document)

        # The document now carries the calculated hash again.
        assert sample_document.content_multihash == expected_hash
        assert sample_document.content_multihash != "incorrect_hash_12345"

        # Content is filed under the calculated hash only.
        stored_content = fake_minio_client.get_stored_objects("documents-content")
        assert expected_hash in stored_content
        assert "incorrect_hash_12345" not in stored_content

    async def test_store_handles_content_storage_error(
        self, fake_minio_client: FakeMinioClient, sample_document: Document
    ) -> None:
        """An S3Error while writing content propagates and stores nothing."""
        repo = MinioDocumentRepository(fake_minio_client)

        real_put_object = repo.client.put_object

        def failing_put_object(
            bucket_name: str,
            object_name: str,
            data: Any,
            length: int,
            **kwargs: Any,
        ) -> Any:
            # Only writes to the content bucket fail; others pass through.
            if bucket_name != "documents-content":
                return real_put_object(
                    bucket_name, object_name, data, length, **kwargs
                )
            raise S3Error(
                code="AccessDenied",
                message="Access denied",
                resource="AccessDenied",
                request_id="req123",
                host_id="host123",
                response=Mock(),
            )

        repo.client.put_object = failing_put_object  # type: ignore[method-assign, assignment]

        with pytest.raises(S3Error):
            await repo.save(sample_document)

        # Neither bucket received an object.
        for bucket in ("documents", "documents-content"):
            assert fake_minio_client.get_object_count(bucket) == 0
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
class TestMinioDocumentRepositoryGet:
    """Test document retrieval operations."""

    async def test_get_existing_document(
        self, repository: MinioDocumentRepository, sample_document: Document
    ) -> None:
        """Test retrieving an existing document with content."""
        # Store a document first
        await repository.save(sample_document)

        # Act - retrieve the document
        result = await repository.get(sample_document.document_id)

        # Assert
        assert result is not None
        assert result.document_id == sample_document.document_id
        assert result.original_filename == sample_document.original_filename
        assert result.content_type == sample_document.content_type
        assert result.size_bytes == sample_document.size_bytes

        # Verify content can be read
        assert result.content is not None
        retrieved_content = result.content.read()

        # Rewind the sample document's stream so its full content is read
        assert sample_document.content is not None
        sample_document.content.seek(0)
        original_content = sample_document.content.read()

        assert retrieved_content == original_content

    async def test_get_document_missing_content_multihash(
        self, repository: MinioDocumentRepository
    ) -> None:
        """Test handling document metadata without content_multihash."""
        # Manually store invalid metadata (missing content_multihash)
        invalid_metadata_json = (
            '{"document_id": "test-123", "original_filename": "test.txt"}'
        )
        # Encode once so the declared length is the byte count of the data
        # actually uploaded, not the character count of the source string.
        invalid_metadata_bytes = invalid_metadata_json.encode("utf-8")
        repository.client.put_object(
            "documents",
            "test-123",
            io.BytesIO(invalid_metadata_bytes),
            len(invalid_metadata_bytes),
            content_type="application/json",
        )

        # Act
        result = await repository.get("test-123")

        # Assert
        assert result is None

    async def test_get_document_with_missing_content(
        self, repository: MinioDocumentRepository
    ) -> None:
        """Test that missing content returns None."""
        # Store metadata but not content
        metadata_json = (
            '{"document_id": "test-123", "content_multihash": "missing_hash",'
            ' "original_filename": "test.txt", "content_type": "text/plain",'
            ' "size_bytes": 100, "status": "captured"}'
        )
        # Encode once so the declared length is the byte count of the data
        # actually uploaded, not the character count of the source string.
        metadata_bytes = metadata_json.encode("utf-8")
        repository.client.put_object(
            "documents",
            "test-123",
            io.BytesIO(metadata_bytes),
            len(metadata_bytes),
            content_type="application/json",
        )

        # Note: content with multihash "missing_hash" does not exist

        # Act
        result = await repository.get("test-123")

        # Assert - should return None when content is missing
        assert result is None

    async def test_get_nonexistent_document(
        self, repository: MinioDocumentRepository
    ) -> None:
        """Test retrieving a document that doesn't exist."""
        # Act - try to get a document that was never stored
        result = await repository.get("nonexistent-123")

        # Assert
        assert result is None
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
class TestMinioDocumentRepositoryUpdate:
    """Re-saving an already stored document."""

    async def test_update_document(
        self, repository: MinioDocumentRepository, sample_document: Document
    ) -> None:
        """Saving a modified document bumps updated_at and persists the change."""
        # Initial save, then remember the timestamp it produced.
        await repository.save(sample_document)
        previous_updated_at = sample_document.updated_at

        # Change the status and save again.
        sample_document.status = DocumentStatus.EXTRACTED
        await repository.save(sample_document)

        # The timestamp must differ; when both are set it must have advanced.
        assert sample_document.updated_at != previous_updated_at
        if previous_updated_at and sample_document.updated_at:
            assert sample_document.updated_at > previous_updated_at

        # Reading the document back shows the new status and timestamp.
        reloaded = await repository.get(sample_document.document_id)
        assert reloaded is not None
        assert reloaded.status == DocumentStatus.EXTRACTED
        assert reloaded.updated_at == sample_document.updated_at
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
class TestMinioDocumentRepositoryGenerateId:
    """Document ID generation."""

    async def test_generate_id(self, repository: MinioDocumentRepository) -> None:
        """Two consecutive generate_id calls yield distinct non-empty strings."""
        first_id = await repository.generate_id()
        second_id = await repository.generate_id()

        # Both are strings ...
        for generated in (first_id, second_id):
            assert isinstance(generated, str)

        # ... different from each other ...
        assert first_id != second_id

        # ... and non-empty.
        for generated in (first_id, second_id):
            assert len(generated) > 0
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
class TestMinioDocumentRepositoryMultihash:
    """Multihash calculation over content streams."""

    def test_calculate_multihash_from_stream(
        self, repository: MinioDocumentRepository
    ) -> None:
        """Hashing a stream gives a non-empty string and is repeatable."""
        stream = ContentStream(io.BytesIO(b"test content for hashing"))

        first_hash = repository._calculate_multihash_from_stream(stream)

        assert isinstance(first_hash, str)
        assert len(first_hash) > 0

        # Rewind and hash again: identical content must give an identical hash.
        stream.seek(0)
        second_hash = repository._calculate_multihash_from_stream(stream)
        assert first_hash == second_hash

    def test_calculate_multihash_from_empty_stream(
        self, repository: MinioDocumentRepository
    ) -> None:
        """Hashing an empty stream still gives a non-empty string."""
        empty_stream = ContentStream(io.BytesIO(b""))

        empty_hash = repository._calculate_multihash_from_stream(empty_stream)

        assert isinstance(empty_hash, str)
        assert len(empty_hash) > 0
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
class TestMinioDocumentRepositoryContentString:
    """Saving documents whose content is supplied as ``content_string``."""

    async def test_save_document_with_content_string(
        self, repository: MinioDocumentRepository
    ) -> None:
        """A document built from content_string round-trips through save/get."""
        text = '{"assembled": "document", "data": "test"}'

        # size_bytes and content_multihash are placeholders; the assertions
        # below check that the stored values differ from / replace them.
        doc = Document(
            document_id="test-doc-content-string",
            original_filename="assembled.json",
            content_type="application/json",
            size_bytes=100,
            content_multihash="placeholder",
            status=DocumentStatus.CAPTURED,
            content_string=text,
        )

        await repository.save(doc)

        # The stored document carries a real hash and the true byte size.
        loaded = await repository.get(doc.document_id)
        assert loaded is not None
        assert loaded.content_multihash != "placeholder"
        assert loaded.size_bytes == len(text.encode("utf-8"))

        # The content reads back as the original text.
        assert loaded.content is not None
        round_tripped = loaded.content.read().decode("utf-8")
        assert round_tripped == text

    async def test_save_document_with_content_string_unicode(
        self, repository: MinioDocumentRepository
    ) -> None:
        """Non-ASCII content_string survives a save/get round trip."""
        text = '{"title": "测试文档", "emoji": "🚀", "content": "éñ"}'

        doc = Document(
            document_id="test-doc-unicode",
            original_filename="unicode.json",
            content_type="application/json",
            size_bytes=100,
            content_multihash="placeholder",
            status=DocumentStatus.CAPTURED,
            content_string=text,
        )

        await repository.save(doc)
        loaded = await repository.get(doc.document_id)

        assert loaded is not None
        assert loaded.content is not None
        round_tripped = loaded.content.read().decode("utf-8")
        assert round_tripped == text

    # Note: Empty content test removed because domain model requires
    # size_bytes > 0

    async def test_save_excludes_content_string_from_metadata(
        self,
        repository: MinioDocumentRepository,
        fake_minio_client: FakeMinioClient,
    ) -> None:
        """The stored metadata JSON contains neither content nor content_string."""
        import json

        text = '{"test": "data that should not be in metadata"}'

        doc = Document(
            document_id="test-metadata-exclusion",
            original_filename="test.json",
            content_type="application/json",
            size_bytes=100,
            content_multihash="placeholder",
            status=DocumentStatus.CAPTURED,
            content_string=text,
        )

        await repository.save(doc)

        # Read the raw metadata object straight from the fake client.
        raw_response = fake_minio_client.get_object(
            bucket_name="documents", object_name="test-metadata-exclusion"
        )
        stored = json.loads(raw_response.read().decode("utf-8"))

        # Content fields must not be stored in the metadata.
        assert "content_string" not in stored
        assert "content" not in stored

        # The identifying fields are still there.
        assert stored["document_id"] == "test-metadata-exclusion"
        assert "content_multihash" in stored
        assert "status" in stored
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
class TestMinioDocumentRepositoryErrorHandling:
    """Behaviour when the underlying client raises."""

    async def test_store_handles_metadata_storage_error(
        self, fake_minio_client: FakeMinioClient, sample_document: Document
    ) -> None:
        """An S3Error while writing metadata propagates; content stays stored."""
        repo = MinioDocumentRepository(fake_minio_client)

        real_put_object = repo.client.put_object

        def failing_put_object(
            bucket_name: str,
            object_name: str,
            data: Any,
            length: int,
            **kwargs: Any,
        ) -> Any:
            # Only writes to the metadata bucket fail; others pass through.
            if bucket_name != "documents":
                return real_put_object(
                    bucket_name, object_name, data, length, **kwargs
                )
            raise S3Error(
                code="AccessDenied",
                message="Access denied",
                resource="AccessDenied",
                request_id="req123",
                host_id="host123",
                response=Mock(),
            )

        repo.client.put_object = failing_put_object  # type: ignore[method-assign, assignment]

        with pytest.raises(S3Error):
            await repo.save(sample_document)

        # The content write succeeded before the metadata write failed.
        assert fake_minio_client.get_object_count("documents-content") == 1
        assert fake_minio_client.get_object_count("documents") == 0

    async def test_get_handles_unexpected_error(
        self, repository: MinioDocumentRepository
    ) -> None:
        """A non-S3 exception during get yields None instead of propagating."""
        real_get_object = repository.client.get_object

        def failing_get_object(bucket_name: str, object_name: str) -> Any:
            # Only reads from the metadata bucket fail; others pass through.
            if bucket_name != "documents":
                return real_get_object(bucket_name, object_name)
            raise Exception("Unexpected error")

        repository.client.get_object = failing_get_object  # type: ignore[method-assign]

        outcome = await repository.get("test-123")

        # The exception was swallowed and reported as "not found".
        assert outcome is None
|