morphik 1.2.3__tar.gz → 1.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {morphik-1.2.3 → morphik-1.2.4}/.gitignore +0 -3
- {morphik-1.2.3 → morphik-1.2.4}/PKG-INFO +7 -6
- {morphik-1.2.3 → morphik-1.2.4}/README.md +6 -5
- {morphik-1.2.3 → morphik-1.2.4}/morphik/__init__.py +4 -2
- {morphik-1.2.3 → morphik-1.2.4}/morphik/_internal.py +0 -10
- {morphik-1.2.3 → morphik-1.2.4}/morphik/_scoped_ops.py +0 -2
- {morphik-1.2.3 → morphik-1.2.4}/morphik/_shared.py +54 -1
- {morphik-1.2.3 → morphik-1.2.4}/morphik/async_.py +233 -14
- {morphik-1.2.3 → morphik-1.2.4}/morphik/models.py +21 -0
- {morphik-1.2.3 → morphik-1.2.4}/morphik/sync.py +219 -13
- {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/test_scoped_ops_unit.py +55 -33
- {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/test_shared_helpers.py +36 -1
- {morphik-1.2.3 → morphik-1.2.4}/pyproject.toml +1 -1
- {morphik-1.2.3 → morphik-1.2.4}/morphik/exceptions.py +0 -0
- {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/README.md +0 -0
- {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/__init__.py +0 -0
- {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/example_usage.py +0 -0
- {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/test_app_ops.py +0 -0
- {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/test_async.py +0 -0
- {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/test_docs/sample1.txt +0 -0
- {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/test_docs/sample2.txt +0 -0
- {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/test_docs/sample3.txt +0 -0
- {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/test_sync.py +0 -0
- {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/test_update_document_metadata_rename.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: morphik
|
|
3
|
-
Version: 1.2.3
|
|
3
|
+
Version: 1.2.4
|
|
4
4
|
Summary: Morphik Python Client
|
|
5
5
|
Author-email: Morphik <founders@morphik.ai>
|
|
6
6
|
Requires-Python: >=3.8
|
|
@@ -91,6 +91,12 @@ response = db.query(
|
|
|
91
91
|
)
|
|
92
92
|
|
|
93
93
|
print(response.completion)
|
|
94
|
+
|
|
95
|
+
# Migrate this app's documents into another Morphik deployment.
|
|
96
|
+
# Run this from a machine that can reach both source and target, such as
|
|
97
|
+
# inside a customer's VPN for on-prem targets.
|
|
98
|
+
result = db.migrate(target_uri="morphik://owner_id:token@onprem.example.com", target_is_local=True)
|
|
99
|
+
print(result.created_count, result.skipped_count, result.failed_count)
|
|
94
100
|
```
|
|
95
101
|
|
|
96
102
|
### Nested Folders & Folder Depth
|
|
@@ -106,11 +112,6 @@ renamed = moved.rename("specs-v2")
|
|
|
106
112
|
# Scope queries to a path and include descendants with folder_depth=-1
|
|
107
113
|
chunks = folder.retrieve_chunks(query="design notes", folder_depth=-1)
|
|
108
114
|
docs = db.list_documents(folder_name="/projects/alpha", folder_depth=-1)
|
|
109
|
-
|
|
110
|
-
# List only the fields you need. The server reads and returns just those columns, so
|
|
111
|
-
# the full document text is never downloaded — fast for large corpora.
|
|
112
|
-
for doc in db.list_documents(fields=["metadata"]).documents:
|
|
113
|
-
print(doc.external_id, doc.metadata)
|
|
114
115
|
```
|
|
115
116
|
|
|
116
117
|
`Folder.full_path` is exposed on folder objects, and `Document.folder_path` mirrors server responses for tracing scope.
|
|
@@ -78,6 +78,12 @@ response = db.query(
|
|
|
78
78
|
)
|
|
79
79
|
|
|
80
80
|
print(response.completion)
|
|
81
|
+
|
|
82
|
+
# Migrate this app's documents into another Morphik deployment.
|
|
83
|
+
# Run this from a machine that can reach both source and target, such as
|
|
84
|
+
# inside a customer's VPN for on-prem targets.
|
|
85
|
+
result = db.migrate(target_uri="morphik://owner_id:token@onprem.example.com", target_is_local=True)
|
|
86
|
+
print(result.created_count, result.skipped_count, result.failed_count)
|
|
81
87
|
```
|
|
82
88
|
|
|
83
89
|
### Nested Folders & Folder Depth
|
|
@@ -93,11 +99,6 @@ renamed = moved.rename("specs-v2")
|
|
|
93
99
|
# Scope queries to a path and include descendants with folder_depth=-1
|
|
94
100
|
chunks = folder.retrieve_chunks(query="design notes", folder_depth=-1)
|
|
95
101
|
docs = db.list_documents(folder_name="/projects/alpha", folder_depth=-1)
|
|
96
|
-
|
|
97
|
-
# List only the fields you need. The server reads and returns just those columns, so
|
|
98
|
-
# the full document text is never downloaded — fast for large corpora.
|
|
99
|
-
for doc in db.list_documents(fields=["metadata"]).documents:
|
|
100
|
-
print(doc.external_id, doc.metadata)
|
|
101
102
|
```
|
|
102
103
|
|
|
103
104
|
`Folder.full_path` is exposed on folder objects, and `Document.folder_path` mirrors server responses for tracing scope.
|
|
@@ -3,7 +3,7 @@ Morphik Python SDK for document ingestion and querying.
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
from .async_ import AsyncMorphik
|
|
6
|
-
from .models import Document, DocumentQueryResponse, Summary
|
|
6
|
+
from .models import Document, DocumentQueryResponse, MigrationDocumentResult, MigrationResult, Summary
|
|
7
7
|
from .sync import Morphik
|
|
8
8
|
|
|
9
9
|
__all__ = [
|
|
@@ -12,6 +12,8 @@ __all__ = [
|
|
|
12
12
|
"Document",
|
|
13
13
|
"Summary",
|
|
14
14
|
"DocumentQueryResponse",
|
|
15
|
+
"MigrationDocumentResult",
|
|
16
|
+
"MigrationResult",
|
|
15
17
|
]
|
|
16
18
|
|
|
17
|
-
__version__ = "1.2.3"
|
|
19
|
+
__version__ = "1.2.4"
|
|
@@ -428,7 +428,6 @@ class _MorphikClientLogic:
|
|
|
428
428
|
completed_only: bool,
|
|
429
429
|
sort_by: Optional[str],
|
|
430
430
|
sort_direction: str,
|
|
431
|
-
fields: Optional[List[str]] = None,
|
|
432
431
|
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
|
|
433
432
|
"""Prepare request for list_docs endpoint"""
|
|
434
433
|
params = {}
|
|
@@ -451,15 +450,6 @@ class _MorphikClientLogic:
|
|
|
451
450
|
"sort_by": sort_by,
|
|
452
451
|
"sort_direction": sort_direction,
|
|
453
452
|
}
|
|
454
|
-
if fields:
|
|
455
|
-
# Always include the fields required to reconstruct a Document client-side, so
|
|
456
|
-
# projected responses still parse into Document objects. When any metadata field
|
|
457
|
-
# is requested, also pull metadata_types so typed values (datetime/date/decimal)
|
|
458
|
-
# are reconstructed instead of returned as raw strings.
|
|
459
|
-
projected = ["external_id", "content_type", *fields]
|
|
460
|
-
if any(field.split(".", 1)[0] == "metadata" for field in fields):
|
|
461
|
-
projected.append("metadata_types")
|
|
462
|
-
data["fields"] = list(dict.fromkeys(projected))
|
|
463
453
|
return params, data
|
|
464
454
|
|
|
465
455
|
def _prepare_batch_get_documents_request(
|
|
@@ -277,7 +277,6 @@ class _ScopedOperationsMixin:
|
|
|
277
277
|
completed_only: bool,
|
|
278
278
|
sort_by: Optional[str],
|
|
279
279
|
sort_direction: str,
|
|
280
|
-
fields: Optional[List[str]] = None,
|
|
281
280
|
):
|
|
282
281
|
params, data = self._logic._prepare_list_documents_request(
|
|
283
282
|
skip,
|
|
@@ -292,7 +291,6 @@ class _ScopedOperationsMixin:
|
|
|
292
291
|
completed_only,
|
|
293
292
|
sort_by,
|
|
294
293
|
sort_direction,
|
|
295
|
-
fields,
|
|
296
294
|
)
|
|
297
295
|
|
|
298
296
|
return self._execute_scoped_operation(
|
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import Any, Dict, Iterable, List, Optional, Union
|
|
5
|
+
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
|
|
6
6
|
from urllib.parse import quote
|
|
7
7
|
|
|
8
8
|
from pydantic import BaseModel
|
|
@@ -10,6 +10,17 @@ from pydantic import BaseModel
|
|
|
10
10
|
MAX_LIMIT = 500
|
|
11
11
|
MIN_LOG_HOURS = 0.1
|
|
12
12
|
MAX_LOG_HOURS = 168.0
|
|
13
|
+
MIGRATION_SOURCE_METADATA_KEY = "_morphik_migration"
|
|
14
|
+
MIGRATION_RESERVED_METADATA_FIELDS = {
|
|
15
|
+
"app_id",
|
|
16
|
+
"end_user_id",
|
|
17
|
+
"external_id",
|
|
18
|
+
"filename",
|
|
19
|
+
"folder_id",
|
|
20
|
+
"folder_name",
|
|
21
|
+
"folder_path",
|
|
22
|
+
"owner_id",
|
|
23
|
+
}
|
|
13
24
|
|
|
14
25
|
|
|
15
26
|
def merge_folders(
|
|
@@ -197,3 +208,45 @@ def normalize_additional_folders(
|
|
|
197
208
|
if additional_folders:
|
|
198
209
|
return list(additional_folders) + folder_list
|
|
199
210
|
return folder_list
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def build_migration_metadata(
|
|
214
|
+
document: Any,
|
|
215
|
+
*,
|
|
216
|
+
include_source_metadata: bool = True,
|
|
217
|
+
) -> Tuple[Dict[str, Any], Dict[str, str]]:
|
|
218
|
+
"""Prepare document metadata for migration ingestion.
|
|
219
|
+
|
|
220
|
+
The target API owns fields such as external_id, folder_name, and app_id, so
|
|
221
|
+
those values must travel through dedicated migration parameters instead of
|
|
222
|
+
user metadata.
|
|
223
|
+
"""
|
|
224
|
+
metadata = dict(getattr(document, "metadata", None) or {})
|
|
225
|
+
metadata_types = dict(getattr(document, "metadata_types", None) or {})
|
|
226
|
+
|
|
227
|
+
for field in MIGRATION_RESERVED_METADATA_FIELDS:
|
|
228
|
+
metadata.pop(field, None)
|
|
229
|
+
metadata_types.pop(field, None)
|
|
230
|
+
|
|
231
|
+
if include_source_metadata:
|
|
232
|
+
system_metadata = getattr(document, "system_metadata", None) or {}
|
|
233
|
+
source_info = {
|
|
234
|
+
"source_document_id": getattr(document, "external_id", None),
|
|
235
|
+
"source_app_id": getattr(document, "app_id", None),
|
|
236
|
+
"source_filename": getattr(document, "filename", None),
|
|
237
|
+
"source_created_at": system_metadata.get("created_at") if isinstance(system_metadata, dict) else None,
|
|
238
|
+
"source_updated_at": system_metadata.get("updated_at") if isinstance(system_metadata, dict) else None,
|
|
239
|
+
}
|
|
240
|
+
source_info = {key: value for key, value in source_info.items() if value is not None}
|
|
241
|
+
|
|
242
|
+
existing_source_info = metadata.get(MIGRATION_SOURCE_METADATA_KEY)
|
|
243
|
+
if isinstance(existing_source_info, dict):
|
|
244
|
+
existing_source_info = dict(existing_source_info)
|
|
245
|
+
for key, value in source_info.items():
|
|
246
|
+
existing_source_info.setdefault(key, value)
|
|
247
|
+
metadata[MIGRATION_SOURCE_METADATA_KEY] = existing_source_info
|
|
248
|
+
else:
|
|
249
|
+
metadata[MIGRATION_SOURCE_METADATA_KEY] = source_info
|
|
250
|
+
metadata_types.setdefault(MIGRATION_SOURCE_METADATA_KEY, "object")
|
|
251
|
+
|
|
252
|
+
return metadata, metadata_types
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import inspect
|
|
1
2
|
import json
|
|
2
3
|
import logging
|
|
3
4
|
import warnings
|
|
@@ -19,6 +20,7 @@ from ._shared import (
|
|
|
19
20
|
build_folder_rename_path,
|
|
20
21
|
build_list_apps_params,
|
|
21
22
|
build_logs_params,
|
|
23
|
+
build_migration_metadata,
|
|
22
24
|
build_rename_app_params,
|
|
23
25
|
build_requeue_payload,
|
|
24
26
|
build_rotate_app_params,
|
|
@@ -42,6 +44,8 @@ from .models import (
|
|
|
42
44
|
IngestTextRequest,
|
|
43
45
|
ListDocsResponse,
|
|
44
46
|
LogResponse,
|
|
47
|
+
MigrationDocumentResult,
|
|
48
|
+
MigrationResult,
|
|
45
49
|
QueryPromptOverrides,
|
|
46
50
|
RequeueIngestionJob,
|
|
47
51
|
RequeueIngestionResponse,
|
|
@@ -267,15 +271,8 @@ class _AsyncScopedClientOps:
|
|
|
267
271
|
completed_only: bool = False,
|
|
268
272
|
sort_by: Optional[str] = "updated_at",
|
|
269
273
|
sort_direction: str = "desc",
|
|
270
|
-
fields: Optional[List[str]] = None,
|
|
271
274
|
) -> ListDocsResponse:
|
|
272
|
-
"""List documents within this scope (async).
|
|
273
|
-
|
|
274
|
-
Args:
|
|
275
|
-
fields: Optional list of fields to return for each document (e.g. ["metadata"]).
|
|
276
|
-
Only those fields are read and returned, so the full document text is never
|
|
277
|
-
downloaded. external_id and content_type are always included.
|
|
278
|
-
"""
|
|
275
|
+
"""List documents within this scope (async)."""
|
|
279
276
|
effective_folder = self._merge_folders(additional_folders)
|
|
280
277
|
return await self._client._scoped_list_documents(
|
|
281
278
|
skip=skip,
|
|
@@ -290,7 +287,6 @@ class _AsyncScopedClientOps:
|
|
|
290
287
|
completed_only=completed_only,
|
|
291
288
|
sort_by=sort_by,
|
|
292
289
|
sort_direction=sort_direction,
|
|
293
|
-
fields=fields,
|
|
294
290
|
)
|
|
295
291
|
|
|
296
292
|
async def batch_get_documents(
|
|
@@ -1239,7 +1235,6 @@ class AsyncMorphik(_ScopedOperationsMixin):
|
|
|
1239
1235
|
completed_only: bool = False,
|
|
1240
1236
|
sort_by: Optional[str] = "updated_at",
|
|
1241
1237
|
sort_direction: str = "desc",
|
|
1242
|
-
fields: Optional[List[str]] = None,
|
|
1243
1238
|
) -> ListDocsResponse:
|
|
1244
1239
|
"""
|
|
1245
1240
|
List accessible documents.
|
|
@@ -1256,9 +1251,6 @@ class AsyncMorphik(_ScopedOperationsMixin):
|
|
|
1256
1251
|
completed_only: Only return completed documents
|
|
1257
1252
|
sort_by: Field to sort by (created_at, updated_at, filename, external_id)
|
|
1258
1253
|
sort_direction: Sort direction (asc, desc)
|
|
1259
|
-
fields: Optional list of fields to return for each document (e.g. ["metadata"]).
|
|
1260
|
-
Only those fields are read and returned, so the full document text is never
|
|
1261
|
-
downloaded. external_id and content_type are always included.
|
|
1262
1254
|
Returns:
|
|
1263
1255
|
ListDocsResponse: Response with documents and metadata
|
|
1264
1256
|
|
|
@@ -1276,7 +1268,234 @@ class AsyncMorphik(_ScopedOperationsMixin):
|
|
|
1276
1268
|
completed_only=completed_only,
|
|
1277
1269
|
sort_by=sort_by,
|
|
1278
1270
|
sort_direction=sort_direction,
|
|
1279
|
-
fields=fields,
|
|
1271
|
+
)
|
|
1272
|
+
|
|
1273
|
+
async def migrate(
|
|
1274
|
+
self,
|
|
1275
|
+
target_uri: str,
|
|
1276
|
+
*,
|
|
1277
|
+
filters: Optional[Dict[str, Any]] = None,
|
|
1278
|
+
folder_name: Optional[Union[str, List[str]]] = None,
|
|
1279
|
+
folder_depth: Optional[int] = None,
|
|
1280
|
+
end_user_id: Optional[str] = None,
|
|
1281
|
+
skip: int = 0,
|
|
1282
|
+
limit: Optional[int] = None,
|
|
1283
|
+
batch_size: int = 100,
|
|
1284
|
+
completed_only: bool = True,
|
|
1285
|
+
use_colpali: bool = True,
|
|
1286
|
+
preserve_folders: bool = True,
|
|
1287
|
+
preserve_end_user_id: bool = True,
|
|
1288
|
+
preserve_summaries: bool = True,
|
|
1289
|
+
include_source_metadata: bool = True,
|
|
1290
|
+
on_conflict: Literal["skip", "fail"] = "skip",
|
|
1291
|
+
continue_on_error: bool = True,
|
|
1292
|
+
target_timeout: Optional[int] = None,
|
|
1293
|
+
target_is_local: bool = False,
|
|
1294
|
+
progress_callback: Optional[Callable[[MigrationDocumentResult], Any]] = None,
|
|
1295
|
+
) -> MigrationResult:
|
|
1296
|
+
"""Migrate documents from this client into another Morphik URI.
|
|
1297
|
+
|
|
1298
|
+
Set target_is_local=True for local/on-prem targets that should use HTTP
|
|
1299
|
+
or skip TLS verification.
|
|
1300
|
+
"""
|
|
1301
|
+
if batch_size <= 0:
|
|
1302
|
+
raise ValueError("batch_size must be greater than 0")
|
|
1303
|
+
if limit is not None and limit < 0:
|
|
1304
|
+
raise ValueError("limit must be greater than or equal to 0")
|
|
1305
|
+
|
|
1306
|
+
target = AsyncMorphik(target_uri, timeout=target_timeout or self._logic._timeout, is_local=target_is_local)
|
|
1307
|
+
results: List[MigrationDocumentResult] = []
|
|
1308
|
+
total_source_count: Optional[int] = None
|
|
1309
|
+
current_skip = max(skip, 0)
|
|
1310
|
+
remaining = limit
|
|
1311
|
+
page_size = min(batch_size, 500)
|
|
1312
|
+
|
|
1313
|
+
try:
|
|
1314
|
+
while remaining is None or remaining > 0:
|
|
1315
|
+
current_limit = page_size if remaining is None else min(page_size, remaining)
|
|
1316
|
+
page = await self._scoped_list_documents(
|
|
1317
|
+
skip=current_skip,
|
|
1318
|
+
limit=current_limit,
|
|
1319
|
+
filters=filters,
|
|
1320
|
+
folder_name=folder_name,
|
|
1321
|
+
folder_depth=folder_depth,
|
|
1322
|
+
end_user_id=end_user_id,
|
|
1323
|
+
include_total_count=total_source_count is None,
|
|
1324
|
+
include_status_counts=False,
|
|
1325
|
+
include_folder_counts=False,
|
|
1326
|
+
completed_only=completed_only,
|
|
1327
|
+
sort_by="updated_at",
|
|
1328
|
+
sort_direction="desc",
|
|
1329
|
+
)
|
|
1330
|
+
if total_source_count is None:
|
|
1331
|
+
total_source_count = page.total_count
|
|
1332
|
+
if not page.documents:
|
|
1333
|
+
break
|
|
1334
|
+
|
|
1335
|
+
for source_document in page.documents:
|
|
1336
|
+
try:
|
|
1337
|
+
result = await self._migrate_single_document(
|
|
1338
|
+
target=target,
|
|
1339
|
+
source_document=source_document,
|
|
1340
|
+
use_colpali=use_colpali,
|
|
1341
|
+
preserve_folders=preserve_folders,
|
|
1342
|
+
preserve_end_user_id=preserve_end_user_id,
|
|
1343
|
+
preserve_summaries=preserve_summaries,
|
|
1344
|
+
include_source_metadata=include_source_metadata,
|
|
1345
|
+
on_conflict=on_conflict,
|
|
1346
|
+
)
|
|
1347
|
+
except Exception as exc: # noqa: BLE001
|
|
1348
|
+
result = MigrationDocumentResult(
|
|
1349
|
+
source_document_id=source_document.external_id,
|
|
1350
|
+
filename=source_document.filename,
|
|
1351
|
+
status="failed",
|
|
1352
|
+
error=str(exc),
|
|
1353
|
+
)
|
|
1354
|
+
if not continue_on_error:
|
|
1355
|
+
results.append(result)
|
|
1356
|
+
await self._emit_migration_progress(progress_callback, result)
|
|
1357
|
+
raise
|
|
1358
|
+
|
|
1359
|
+
results.append(result)
|
|
1360
|
+
await self._emit_migration_progress(progress_callback, result)
|
|
1361
|
+
|
|
1362
|
+
if remaining is not None:
|
|
1363
|
+
remaining -= len(page.documents)
|
|
1364
|
+
if not page.has_more:
|
|
1365
|
+
break
|
|
1366
|
+
next_skip = page.next_skip if page.next_skip is not None else current_skip + page.returned_count
|
|
1367
|
+
if next_skip <= current_skip:
|
|
1368
|
+
break
|
|
1369
|
+
current_skip = next_skip
|
|
1370
|
+
finally:
|
|
1371
|
+
await target.close()
|
|
1372
|
+
|
|
1373
|
+
return self._build_migration_result(results, total_source_count)
|
|
1374
|
+
|
|
1375
|
+
async def _migrate_single_document(
|
|
1376
|
+
self,
|
|
1377
|
+
*,
|
|
1378
|
+
target: "AsyncMorphik",
|
|
1379
|
+
source_document: Document,
|
|
1380
|
+
use_colpali: bool,
|
|
1381
|
+
preserve_folders: bool,
|
|
1382
|
+
preserve_end_user_id: bool,
|
|
1383
|
+
preserve_summaries: bool,
|
|
1384
|
+
include_source_metadata: bool,
|
|
1385
|
+
on_conflict: Literal["skip", "fail"],
|
|
1386
|
+
) -> MigrationDocumentResult:
|
|
1387
|
+
metadata, metadata_types = build_migration_metadata(
|
|
1388
|
+
source_document,
|
|
1389
|
+
include_source_metadata=include_source_metadata,
|
|
1390
|
+
)
|
|
1391
|
+
file_bytes = await self.get_document_file(source_document.external_id)
|
|
1392
|
+
folder = (source_document.folder_path or source_document.folder_name) if preserve_folders else None
|
|
1393
|
+
end_user = source_document.end_user_id if preserve_end_user_id else None
|
|
1394
|
+
|
|
1395
|
+
status, target_document = await target._ingest_migrated_document(
|
|
1396
|
+
source_document_id=source_document.external_id,
|
|
1397
|
+
file_content=file_bytes,
|
|
1398
|
+
filename=source_document.filename or source_document.external_id,
|
|
1399
|
+
content_type=source_document.content_type,
|
|
1400
|
+
metadata=metadata,
|
|
1401
|
+
metadata_types=metadata_types,
|
|
1402
|
+
folder_name=folder,
|
|
1403
|
+
end_user_id=end_user,
|
|
1404
|
+
use_colpali=use_colpali,
|
|
1405
|
+
on_conflict=on_conflict,
|
|
1406
|
+
)
|
|
1407
|
+
|
|
1408
|
+
if preserve_summaries and status == "created":
|
|
1409
|
+
await self._copy_document_summary(source_document.external_id, target, target_document.external_id)
|
|
1410
|
+
|
|
1411
|
+
return MigrationDocumentResult(
|
|
1412
|
+
source_document_id=source_document.external_id,
|
|
1413
|
+
target_document_id=target_document.external_id,
|
|
1414
|
+
filename=source_document.filename,
|
|
1415
|
+
status=status,
|
|
1416
|
+
)
|
|
1417
|
+
|
|
1418
|
+
async def _ingest_migrated_document(
|
|
1419
|
+
self,
|
|
1420
|
+
*,
|
|
1421
|
+
source_document_id: str,
|
|
1422
|
+
file_content: bytes,
|
|
1423
|
+
filename: str,
|
|
1424
|
+
content_type: Optional[str],
|
|
1425
|
+
metadata: Dict[str, Any],
|
|
1426
|
+
metadata_types: Optional[Dict[str, str]],
|
|
1427
|
+
folder_name: Optional[str],
|
|
1428
|
+
end_user_id: Optional[str],
|
|
1429
|
+
use_colpali: bool,
|
|
1430
|
+
on_conflict: Literal["skip", "fail"],
|
|
1431
|
+
) -> tuple[str, Document]:
|
|
1432
|
+
serialized_metadata, inferred_types = self._logic._serialize_metadata_map(metadata)
|
|
1433
|
+
metadata_type_payload = {**inferred_types, **(metadata_types or {})}
|
|
1434
|
+
metadata_type_payload = {
|
|
1435
|
+
key: value for key, value in metadata_type_payload.items() if key in serialized_metadata
|
|
1436
|
+
}
|
|
1437
|
+
|
|
1438
|
+
form_data: Dict[str, Any] = {
|
|
1439
|
+
"source_document_id": source_document_id,
|
|
1440
|
+
"metadata": json.dumps(serialized_metadata),
|
|
1441
|
+
"metadata_types": json.dumps(metadata_type_payload),
|
|
1442
|
+
"use_colpali": str(use_colpali).lower(),
|
|
1443
|
+
"on_conflict": on_conflict,
|
|
1444
|
+
}
|
|
1445
|
+
if folder_name:
|
|
1446
|
+
form_data["folder_name"] = folder_name
|
|
1447
|
+
if end_user_id:
|
|
1448
|
+
form_data["end_user_id"] = end_user_id
|
|
1449
|
+
|
|
1450
|
+
file_obj = BytesIO(file_content)
|
|
1451
|
+
files = {"file": (filename, file_obj, content_type or "application/octet-stream")}
|
|
1452
|
+
response = await self._request("POST", "migrate/document", data=form_data, files=files)
|
|
1453
|
+
document = self._logic._parse_document_response(response["document"])
|
|
1454
|
+
document._client = self
|
|
1455
|
+
return response.get("status", "created"), document
|
|
1456
|
+
|
|
1457
|
+
async def _copy_document_summary(
|
|
1458
|
+
self,
|
|
1459
|
+
source_document_id: str,
|
|
1460
|
+
target: "AsyncMorphik",
|
|
1461
|
+
target_document_id: str,
|
|
1462
|
+
) -> None:
|
|
1463
|
+
try:
|
|
1464
|
+
summary = await self.get_document_summary(source_document_id)
|
|
1465
|
+
except httpx.HTTPStatusError as exc:
|
|
1466
|
+
if exc.response.status_code == 404:
|
|
1467
|
+
return
|
|
1468
|
+
raise
|
|
1469
|
+
await target.upsert_document_summary(
|
|
1470
|
+
document_id=target_document_id,
|
|
1471
|
+
content=summary.content,
|
|
1472
|
+
versioning=False,
|
|
1473
|
+
overwrite_latest=True,
|
|
1474
|
+
)
|
|
1475
|
+
|
|
1476
|
+
@staticmethod
|
|
1477
|
+
async def _emit_migration_progress(
|
|
1478
|
+
progress_callback: Optional[Callable[[MigrationDocumentResult], Any]],
|
|
1479
|
+
result: MigrationDocumentResult,
|
|
1480
|
+
) -> None:
|
|
1481
|
+
if progress_callback is None:
|
|
1482
|
+
return
|
|
1483
|
+
callback_result = progress_callback(result)
|
|
1484
|
+
if inspect.isawaitable(callback_result):
|
|
1485
|
+
await callback_result
|
|
1486
|
+
|
|
1487
|
+
@staticmethod
|
|
1488
|
+
def _build_migration_result(
|
|
1489
|
+
results: List[MigrationDocumentResult],
|
|
1490
|
+
total_source_count: Optional[int],
|
|
1491
|
+
) -> MigrationResult:
|
|
1492
|
+
return MigrationResult(
|
|
1493
|
+
documents=results,
|
|
1494
|
+
total_source_count=total_source_count,
|
|
1495
|
+
attempted_count=len(results),
|
|
1496
|
+
created_count=sum(1 for item in results if item.status == "created"),
|
|
1497
|
+
skipped_count=sum(1 for item in results if item.status == "skipped"),
|
|
1498
|
+
failed_count=sum(1 for item in results if item.status == "failed"),
|
|
1280
1499
|
)
|
|
1281
1500
|
|
|
1282
1501
|
async def get_document(self, document_id: str) -> Document:
|
|
@@ -415,6 +415,27 @@ class ListDocsResponse(BaseModel):
|
|
|
415
415
|
folder_counts: Optional[List[FolderCount]] = Field(None, description="Document counts by folder")
|
|
416
416
|
|
|
417
417
|
|
|
418
|
+
class MigrationDocumentResult(BaseModel):
|
|
419
|
+
"""Per-document result from a migration run."""
|
|
420
|
+
|
|
421
|
+
source_document_id: str = Field(..., description="Document ID in the source Morphik app")
|
|
422
|
+
target_document_id: Optional[str] = Field(None, description="Document ID in the target Morphik app")
|
|
423
|
+
filename: Optional[str] = Field(None, description="Migrated filename")
|
|
424
|
+
status: Literal["created", "skipped", "failed"] = Field(..., description="Migration outcome")
|
|
425
|
+
error: Optional[str] = Field(None, description="Error message when status is failed")
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
class MigrationResult(BaseModel):
|
|
429
|
+
"""Summary returned by client.migrate."""
|
|
430
|
+
|
|
431
|
+
documents: List[MigrationDocumentResult] = Field(default_factory=list)
|
|
432
|
+
total_source_count: Optional[int] = Field(None, description="Total matching source documents, when available")
|
|
433
|
+
attempted_count: int = Field(..., description="Number of source documents attempted")
|
|
434
|
+
created_count: int = Field(..., description="Number of target documents created")
|
|
435
|
+
skipped_count: int = Field(..., description="Number of target documents skipped because they already existed")
|
|
436
|
+
failed_count: int = Field(..., description="Number of documents that failed migration")
|
|
437
|
+
|
|
438
|
+
|
|
418
439
|
class IngestTextRequest(BaseModel):
|
|
419
440
|
"""Request model for ingesting text content"""
|
|
420
441
|
|
|
@@ -19,6 +19,7 @@ from ._shared import (
|
|
|
19
19
|
build_folder_rename_path,
|
|
20
20
|
build_list_apps_params,
|
|
21
21
|
build_logs_params,
|
|
22
|
+
build_migration_metadata,
|
|
22
23
|
build_rename_app_params,
|
|
23
24
|
build_requeue_payload,
|
|
24
25
|
build_rotate_app_params,
|
|
@@ -42,6 +43,8 @@ from .models import (
|
|
|
42
43
|
IngestTextRequest,
|
|
43
44
|
ListDocsResponse,
|
|
44
45
|
LogResponse,
|
|
46
|
+
MigrationDocumentResult,
|
|
47
|
+
MigrationResult,
|
|
45
48
|
QueryPromptOverrides,
|
|
46
49
|
RequeueIngestionJob,
|
|
47
50
|
RequeueIngestionResponse,
|
|
@@ -283,16 +286,9 @@ class _ScopedClientOps:
|
|
|
283
286
|
completed_only: bool = False,
|
|
284
287
|
sort_by: Optional[str] = "updated_at",
|
|
285
288
|
sort_direction: str = "desc",
|
|
286
|
-
fields: Optional[List[str]] = None,
|
|
287
289
|
) -> ListDocsResponse:
|
|
288
290
|
"""
|
|
289
291
|
List documents within this scope.
|
|
290
|
-
|
|
291
|
-
Args:
|
|
292
|
-
fields: Optional list of fields to return for each document (e.g.
|
|
293
|
-
["metadata"]). Only those fields are read and returned, so the full
|
|
294
|
-
document text is never downloaded. external_id and content_type are
|
|
295
|
-
always included.
|
|
296
292
|
"""
|
|
297
293
|
effective_folder = self._merge_folders(additional_folders)
|
|
298
294
|
return self._client._scoped_list_documents(
|
|
@@ -308,7 +304,6 @@ class _ScopedClientOps:
|
|
|
308
304
|
completed_only=completed_only,
|
|
309
305
|
sort_by=sort_by,
|
|
310
306
|
sort_direction=sort_direction,
|
|
311
|
-
fields=fields,
|
|
312
307
|
)
|
|
313
308
|
|
|
314
309
|
def batch_get_documents(
|
|
@@ -1278,7 +1273,6 @@ class Morphik(_ScopedOperationsMixin):
|
|
|
1278
1273
|
completed_only: bool = False,
|
|
1279
1274
|
sort_by: Optional[str] = "updated_at",
|
|
1280
1275
|
sort_direction: str = "desc",
|
|
1281
|
-
fields: Optional[List[str]] = None,
|
|
1282
1276
|
) -> ListDocsResponse:
|
|
1283
1277
|
"""
|
|
1284
1278
|
List accessible documents.
|
|
@@ -1295,9 +1289,6 @@ class Morphik(_ScopedOperationsMixin):
|
|
|
1295
1289
|
completed_only: Only return completed documents
|
|
1296
1290
|
sort_by: Field to sort by (created_at, updated_at, filename, external_id)
|
|
1297
1291
|
sort_direction: Sort direction (asc, desc)
|
|
1298
|
-
fields: Optional list of fields to return for each document (e.g. ["metadata"]).
|
|
1299
|
-
Only those fields are read and returned, so the full document text is never
|
|
1300
|
-
downloaded. external_id and content_type are always included.
|
|
1301
1292
|
Returns:
|
|
1302
1293
|
ListDocsResponse: Response with documents and metadata
|
|
1303
1294
|
|
|
@@ -1315,7 +1306,222 @@ class Morphik(_ScopedOperationsMixin):
|
|
|
1315
1306
|
completed_only=completed_only,
|
|
1316
1307
|
sort_by=sort_by,
|
|
1317
1308
|
sort_direction=sort_direction,
|
|
1318
|
-
fields=fields,
|
|
1309
|
+
)
|
|
1310
|
+
|
|
1311
|
+
def migrate(
|
|
1312
|
+
self,
|
|
1313
|
+
target_uri: str,
|
|
1314
|
+
*,
|
|
1315
|
+
filters: Optional[Dict[str, Any]] = None,
|
|
1316
|
+
folder_name: Optional[Union[str, List[str]]] = None,
|
|
1317
|
+
folder_depth: Optional[int] = None,
|
|
1318
|
+
end_user_id: Optional[str] = None,
|
|
1319
|
+
skip: int = 0,
|
|
1320
|
+
limit: Optional[int] = None,
|
|
1321
|
+
batch_size: int = 100,
|
|
1322
|
+
completed_only: bool = True,
|
|
1323
|
+
use_colpali: bool = True,
|
|
1324
|
+
preserve_folders: bool = True,
|
|
1325
|
+
preserve_end_user_id: bool = True,
|
|
1326
|
+
preserve_summaries: bool = True,
|
|
1327
|
+
include_source_metadata: bool = True,
|
|
1328
|
+
on_conflict: Literal["skip", "fail"] = "skip",
|
|
1329
|
+
continue_on_error: bool = True,
|
|
1330
|
+
target_timeout: Optional[int] = None,
|
|
1331
|
+
target_is_local: bool = False,
|
|
1332
|
+
progress_callback: Optional[Callable[[MigrationDocumentResult], None]] = None,
|
|
1333
|
+
) -> MigrationResult:
|
|
1334
|
+
"""Migrate documents from this client into another Morphik URI.
|
|
1335
|
+
|
|
1336
|
+
The caller should run this from a network location that can reach both
|
|
1337
|
+
the source URI and the target URI, for example from inside a customer's
|
|
1338
|
+
VPN when the target deployment is on-prem. Set target_is_local=True for
|
|
1339
|
+
local/on-prem targets that should use HTTP or skip TLS verification.
|
|
1340
|
+
"""
|
|
1341
|
+
if batch_size <= 0:
|
|
1342
|
+
raise ValueError("batch_size must be greater than 0")
|
|
1343
|
+
if limit is not None and limit < 0:
|
|
1344
|
+
raise ValueError("limit must be greater than or equal to 0")
|
|
1345
|
+
|
|
1346
|
+
target = Morphik(target_uri, timeout=target_timeout or self._logic._timeout, is_local=target_is_local)
|
|
1347
|
+
results: List[MigrationDocumentResult] = []
|
|
1348
|
+
total_source_count: Optional[int] = None
|
|
1349
|
+
current_skip = max(skip, 0)
|
|
1350
|
+
remaining = limit
|
|
1351
|
+
page_size = min(batch_size, 500)
|
|
1352
|
+
|
|
1353
|
+
try:
|
|
1354
|
+
while remaining is None or remaining > 0:
|
|
1355
|
+
current_limit = page_size if remaining is None else min(page_size, remaining)
|
|
1356
|
+
page = self._scoped_list_documents(
|
|
1357
|
+
skip=current_skip,
|
|
1358
|
+
limit=current_limit,
|
|
1359
|
+
filters=filters,
|
|
1360
|
+
folder_name=folder_name,
|
|
1361
|
+
folder_depth=folder_depth,
|
|
1362
|
+
end_user_id=end_user_id,
|
|
1363
|
+
include_total_count=total_source_count is None,
|
|
1364
|
+
include_status_counts=False,
|
|
1365
|
+
include_folder_counts=False,
|
|
1366
|
+
completed_only=completed_only,
|
|
1367
|
+
sort_by="updated_at",
|
|
1368
|
+
sort_direction="desc",
|
|
1369
|
+
)
|
|
1370
|
+
if total_source_count is None:
|
|
1371
|
+
total_source_count = page.total_count
|
|
1372
|
+
if not page.documents:
|
|
1373
|
+
break
|
|
1374
|
+
|
|
1375
|
+
for source_document in page.documents:
|
|
1376
|
+
try:
|
|
1377
|
+
result = self._migrate_single_document(
|
|
1378
|
+
target=target,
|
|
1379
|
+
source_document=source_document,
|
|
1380
|
+
use_colpali=use_colpali,
|
|
1381
|
+
preserve_folders=preserve_folders,
|
|
1382
|
+
preserve_end_user_id=preserve_end_user_id,
|
|
1383
|
+
preserve_summaries=preserve_summaries,
|
|
1384
|
+
include_source_metadata=include_source_metadata,
|
|
1385
|
+
on_conflict=on_conflict,
|
|
1386
|
+
)
|
|
1387
|
+
except Exception as exc: # noqa: BLE001
|
|
1388
|
+
result = MigrationDocumentResult(
|
|
1389
|
+
source_document_id=source_document.external_id,
|
|
1390
|
+
filename=source_document.filename,
|
|
1391
|
+
status="failed",
|
|
1392
|
+
error=str(exc),
|
|
1393
|
+
)
|
|
1394
|
+
if not continue_on_error:
|
|
1395
|
+
results.append(result)
|
|
1396
|
+
if progress_callback:
|
|
1397
|
+
progress_callback(result)
|
|
1398
|
+
raise
|
|
1399
|
+
|
|
1400
|
+
results.append(result)
|
|
1401
|
+
if progress_callback:
|
|
1402
|
+
progress_callback(result)
|
|
1403
|
+
|
|
1404
|
+
if remaining is not None:
|
|
1405
|
+
remaining -= len(page.documents)
|
|
1406
|
+
if not page.has_more:
|
|
1407
|
+
break
|
|
1408
|
+
next_skip = page.next_skip if page.next_skip is not None else current_skip + page.returned_count
|
|
1409
|
+
if next_skip <= current_skip:
|
|
1410
|
+
break
|
|
1411
|
+
current_skip = next_skip
|
|
1412
|
+
finally:
|
|
1413
|
+
target.close()
|
|
1414
|
+
|
|
1415
|
+
return self._build_migration_result(results, total_source_count)
|
|
1416
|
+
|
|
1417
|
+
def _migrate_single_document(
    self,
    *,
    target: "Morphik",
    source_document: Document,
    use_colpali: bool,
    preserve_folders: bool,
    preserve_end_user_id: bool,
    preserve_summaries: bool,
    include_source_metadata: bool,
    on_conflict: Literal["skip", "fail"],
) -> MigrationDocumentResult:
    """Copy one document from this client to ``target`` and report the outcome.

    The source file is downloaded through ``self.get_document_file`` and
    re-submitted via ``target._ingest_migrated_document`` together with the
    metadata produced by ``build_migration_metadata``.

    Args:
        target: Client that receives the migrated document.
        source_document: Document to copy from this client.
        use_colpali: Forwarded unchanged to the target ingest call.
        preserve_folders: When true, forward the source ``folder_path``
            (falling back to ``folder_name``) as the target folder.
        preserve_end_user_id: When true, forward the source ``end_user_id``.
        preserve_summaries: When true, copy the source summary, but only if
            the target reported the document as ``"created"``.
        include_source_metadata: Forwarded to ``build_migration_metadata``.
        on_conflict: Forwarded unchanged to the target ingest call.

    Returns:
        A ``MigrationDocumentResult`` carrying the source and target document
        ids, the source filename and the status reported by the target.
    """
    migrated_metadata, migrated_types = build_migration_metadata(
        source_document,
        include_source_metadata=include_source_metadata,
    )
    source_id = source_document.external_id
    payload = self.get_document_file(source_id)

    destination_folder = None
    if preserve_folders:
        # Prefer the full path; fall back to the bare folder name.
        destination_folder = source_document.folder_path or source_document.folder_name

    owner = None
    if preserve_end_user_id:
        owner = source_document.end_user_id

    outcome, migrated = target._ingest_migrated_document(
        source_document_id=source_id,
        file_content=payload,
        # Documents without a filename are uploaded under their external id.
        filename=source_document.filename or source_id,
        content_type=source_document.content_type,
        metadata=migrated_metadata,
        metadata_types=migrated_types,
        folder_name=destination_folder,
        end_user_id=owner,
        use_colpali=use_colpali,
        on_conflict=on_conflict,
    )

    # Summaries are only copied for documents the target actually created.
    wants_summary = preserve_summaries and outcome == "created"
    if wants_summary:
        self._copy_document_summary(source_id, target, migrated.external_id)

    return MigrationDocumentResult(
        source_document_id=source_id,
        target_document_id=migrated.external_id,
        filename=source_document.filename,
        status=outcome,
    )
|
|
1459
|
+
|
|
1460
|
+
def _ingest_migrated_document(
    self,
    *,
    source_document_id: str,
    file_content: bytes,
    filename: str,
    content_type: Optional[str],
    metadata: Dict[str, Any],
    metadata_types: Optional[Dict[str, str]],
    folder_name: Optional[str],
    end_user_id: Optional[str],
    use_colpali: bool,
    on_conflict: Literal["skip", "fail"],
) -> tuple[str, Document]:
    """POST one migrated document to the ``migrate/document`` endpoint.

    Args:
        source_document_id: Id of the document in the source app, sent as a
            form field.
        file_content: Raw file bytes to upload.
        filename: Name used for the uploaded file part.
        content_type: MIME type of the file part; ``application/octet-stream``
            is used when it is missing or empty.
        metadata: Metadata to attach; serialized through
            ``self._logic._serialize_metadata_map`` before sending.
        metadata_types: Explicit type hints that override the inferred ones.
        folder_name: Target folder; omitted from the form when falsy.
        end_user_id: Target end user; omitted from the form when falsy.
        use_colpali: Sent as the lowercase string ``"true"``/``"false"``.
        on_conflict: Sent unchanged as a form field.

    Returns:
        ``(status, document)`` where ``status`` is the response's ``"status"``
        value (``"created"`` when the key is absent) and ``document`` is the
        parsed response document bound to this client.
    """
    wire_metadata, inferred = self._logic._serialize_metadata_map(metadata)

    # Explicit types win over inferred ones; types for keys that are not in
    # the serialized metadata are dropped.
    combined_types = dict(inferred)
    combined_types.update(metadata_types or {})
    wire_types = {name: kind for name, kind in combined_types.items() if name in wire_metadata}

    form_data: Dict[str, Any] = {
        "source_document_id": source_document_id,
        "metadata": json.dumps(wire_metadata),
        "metadata_types": json.dumps(wire_types),
        "use_colpali": str(use_colpali).lower(),
        "on_conflict": on_conflict,
    }
    # Optional scoping fields are only sent when they carry a value.
    for field_name, field_value in (("folder_name", folder_name), ("end_user_id", end_user_id)):
        if field_value:
            form_data[field_name] = field_value

    upload = (filename, BytesIO(file_content), content_type or "application/octet-stream")
    response = self._request("POST", "migrate/document", data=form_data, files={"file": upload})

    document = self._logic._parse_document_response(response["document"])
    document._client = self
    return response.get("status", "created"), document
|
|
1498
|
+
|
|
1499
|
+
def _copy_document_summary(self, source_document_id: str, target: "Morphik", target_document_id: str) -> None:
    """Copy the summary of a source document onto its migrated counterpart.

    The summary is read with ``self.get_document_summary`` and written with
    ``target.upsert_document_summary``. An HTTP 404 while reading is treated
    as nothing to copy (presumably the source has no summary) and the method
    returns quietly; any other ``httpx.HTTPStatusError`` is re-raised.

    Args:
        source_document_id: Id of the document on this (source) client.
        target: Client that owns the migrated document.
        target_document_id: Id of the migrated document on ``target``.
    """
    try:
        source_summary = self.get_document_summary(source_document_id)
    except httpx.HTTPStatusError as exc:
        if exc.response.status_code != 404:
            raise
        return
    else:
        target.upsert_document_summary(
            document_id=target_document_id,
            content=source_summary.content,
            versioning=False,
            overwrite_latest=True,
        )
|
|
1512
|
+
|
|
1513
|
+
@staticmethod
def _build_migration_result(
    results: List[MigrationDocumentResult],
    total_source_count: Optional[int],
) -> MigrationResult:
    """Aggregate per-document results into a ``MigrationResult``.

    Args:
        results: One entry per document that was attempted.
        total_source_count: Total number of source documents, or ``None``
            when it is not known.

    Returns:
        A ``MigrationResult`` holding the per-document entries, the number of
        attempts, and how many entries have status ``"created"``,
        ``"skipped"`` and ``"failed"``.
    """

    def tally(wanted: str) -> int:
        # Number of entries whose status equals ``wanted``.
        return len([entry for entry in results if entry.status == wanted])

    return MigrationResult(
        documents=results,
        total_source_count=total_source_count,
        attempted_count=len(results),
        created_count=tally("created"),
        skipped_count=tally("skipped"),
        failed_count=tally("failed"),
    )
|
|
1320
1526
|
|
|
1321
1527
|
def get_document(self, document_id: str) -> Document:
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
1
3
|
import httpx
|
|
2
4
|
import jwt
|
|
3
5
|
import pytest
|
|
@@ -117,39 +119,6 @@ def test_sync_list_documents_payloads_across_scopes():
|
|
|
117
119
|
client.close()
|
|
118
120
|
|
|
119
121
|
|
|
120
|
-
def test_sync_list_documents_fields_projection():
    """Check the ``fields`` payload that ``list_documents`` sends per projection."""
    projection_cases = [
        # external_id + content_type are always added so the response parses into a Document;
        # metadata_types is added so typed metadata values are reconstructed, not left as strings.
        (["metadata"], ["external_id", "content_type", "metadata", "metadata_types"]),
        # Already-included required fields are not duplicated; order is preserved.
        (
            ["external_id", "filename", "metadata"],
            ["external_id", "content_type", "filename", "metadata", "metadata_types"],
        ),
        # Nested metadata paths also trigger metadata_types.
        (["metadata.client"], ["external_id", "content_type", "metadata.client", "metadata_types"]),
        # Non-metadata projection does not pull metadata_types.
        (["filename"], ["external_id", "content_type", "filename"]),
    ]

    client, calls = _make_sync_client()
    try:
        for requested, expected in projection_cases:
            client.list_documents(fields=requested)
            assert calls.pop()["data"]["fields"] == expected

        # No fields -> no projection requested (full documents).
        client.list_documents()
        assert "fields" not in calls.pop()["data"]
    finally:
        client.close()
|
|
151
|
-
|
|
152
|
-
|
|
153
122
|
def test_async_client_http2_toggle(monkeypatch):
|
|
154
123
|
captured = []
|
|
155
124
|
|
|
@@ -338,6 +307,59 @@ def test_sync_get_document_by_filename_scoped_params_and_encoding():
|
|
|
338
307
|
client.close()
|
|
339
308
|
|
|
340
309
|
|
|
310
|
+
def test_sync_ingest_migrated_document_posts_migration_payload():
    """Verify the request ``_ingest_migrated_document`` builds and the values it returns."""
    recorded = []

    def fake_request(method, endpoint, data=None, files=None, params=None):
        # Capture the outgoing request instead of hitting the network.
        recorded.append(dict(method=method, endpoint=endpoint, data=data, files=files, params=params))
        canned_document = {
            "external_id": "source-doc-1",
            "content_type": "application/pdf",
            "filename": "report.pdf",
        }
        return {"status": "created", "document": canned_document}

    expected_metadata = {
        "category": "finance",
        "_morphik_migration": {"source_document_id": "source-doc-1"},
    }
    expected_types = {
        "category": "string",
        "_morphik_migration": "object",
    }

    client = Morphik()
    client._request = fake_request  # type: ignore[attr-defined]
    try:
        status, doc = client._ingest_migrated_document(
            source_document_id="source-doc-1",
            file_content=b"pdf-bytes",
            filename="report.pdf",
            content_type="application/pdf",
            metadata={"category": "finance", "_morphik_migration": {"source_document_id": "source-doc-1"}},
            metadata_types={"category": "string", "_morphik_migration": "object"},
            folder_name="/finance/reports",
            end_user_id="customer-1",
            use_colpali=True,
            on_conflict="skip",
        )

        call = recorded.pop()
        assert call["method"] == "POST"
        assert call["endpoint"] == "migrate/document"

        form = call["data"]
        assert form["source_document_id"] == "source-doc-1"
        assert form["folder_name"] == "/finance/reports"
        assert form["end_user_id"] == "customer-1"
        assert form["use_colpali"] == "true"
        assert form["on_conflict"] == "skip"
        assert json.loads(form["metadata"]) == expected_metadata
        assert json.loads(form["metadata_types"]) == expected_types

        file_part = call["files"]["file"]
        assert file_part[0] == "report.pdf"
        assert file_part[2] == "application/pdf"

        assert status == "created"
        assert doc.external_id == "source-doc-1"
    finally:
        client.close()
|
|
361
|
+
|
|
362
|
+
|
|
341
363
|
def test_sync_folder_get_document_by_filename_scoped():
|
|
342
364
|
client, calls = _make_sync_client()
|
|
343
365
|
try:
|
|
@@ -7,6 +7,7 @@ from morphik._shared import (
|
|
|
7
7
|
build_document_by_filename_params,
|
|
8
8
|
build_list_apps_params,
|
|
9
9
|
build_logs_params,
|
|
10
|
+
build_migration_metadata,
|
|
10
11
|
build_rename_app_params,
|
|
11
12
|
build_requeue_payload,
|
|
12
13
|
build_rotate_app_params,
|
|
@@ -14,7 +15,7 @@ from morphik._shared import (
|
|
|
14
15
|
merge_folders,
|
|
15
16
|
normalize_additional_folders,
|
|
16
17
|
)
|
|
17
|
-
from morphik.models import RequeueIngestionJob
|
|
18
|
+
from morphik.models import Document, RequeueIngestionJob
|
|
18
19
|
|
|
19
20
|
|
|
20
21
|
def test_merge_folders_variants():
|
|
@@ -119,3 +120,37 @@ def test_normalize_additional_folders_alias():
|
|
|
119
120
|
assert normalize_additional_folders(None, "b") == ["b"]
|
|
120
121
|
assert normalize_additional_folders(["a"], "b") == ["a", "b"]
|
|
121
122
|
assert normalize_additional_folders(["a"], ["b", "c"]) == ["a", "b", "c"]
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def test_build_migration_metadata_strips_managed_fields_and_records_source():
    """Managed keys are dropped from migrated metadata and the source is recorded."""
    managed_keys = ("external_id", "folder_name", "end_user_id")
    source_metadata = {
        "external_id": "source-doc-1",
        "folder_name": "/finance/reports",
        "end_user_id": "customer-1",
        "category": "finance",
    }
    doc = Document(
        external_id="source-doc-1",
        content_type="application/pdf",
        filename="report.pdf",
        app_id="source-app",
        folder_path="/finance/reports",
        end_user_id="customer-1",
        metadata=source_metadata,
        # Every source key is typed as a plain string.
        metadata_types={key: "string" for key in source_metadata},
        system_metadata={"created_at": "2026-01-01T00:00:00Z", "updated_at": "2026-01-02T00:00:00Z"},
    )

    metadata, metadata_types = build_migration_metadata(doc)

    assert metadata["category"] == "finance"
    for managed in managed_keys:
        assert managed not in metadata

    provenance = metadata["_morphik_migration"]
    assert provenance["source_document_id"] == "source-doc-1"
    assert provenance["source_app_id"] == "source-app"
    assert metadata_types == {"category": "string", "_morphik_migration": "object"}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|