morphik 1.2.3__tar.gz → 1.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24) hide show
  1. {morphik-1.2.3 → morphik-1.2.4}/.gitignore +0 -3
  2. {morphik-1.2.3 → morphik-1.2.4}/PKG-INFO +7 -6
  3. {morphik-1.2.3 → morphik-1.2.4}/README.md +6 -5
  4. {morphik-1.2.3 → morphik-1.2.4}/morphik/__init__.py +4 -2
  5. {morphik-1.2.3 → morphik-1.2.4}/morphik/_internal.py +0 -10
  6. {morphik-1.2.3 → morphik-1.2.4}/morphik/_scoped_ops.py +0 -2
  7. {morphik-1.2.3 → morphik-1.2.4}/morphik/_shared.py +54 -1
  8. {morphik-1.2.3 → morphik-1.2.4}/morphik/async_.py +233 -14
  9. {morphik-1.2.3 → morphik-1.2.4}/morphik/models.py +21 -0
  10. {morphik-1.2.3 → morphik-1.2.4}/morphik/sync.py +219 -13
  11. {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/test_scoped_ops_unit.py +55 -33
  12. {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/test_shared_helpers.py +36 -1
  13. {morphik-1.2.3 → morphik-1.2.4}/pyproject.toml +1 -1
  14. {morphik-1.2.3 → morphik-1.2.4}/morphik/exceptions.py +0 -0
  15. {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/README.md +0 -0
  16. {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/__init__.py +0 -0
  17. {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/example_usage.py +0 -0
  18. {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/test_app_ops.py +0 -0
  19. {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/test_async.py +0 -0
  20. {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/test_docs/sample1.txt +0 -0
  21. {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/test_docs/sample2.txt +0 -0
  22. {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/test_docs/sample3.txt +0 -0
  23. {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/test_sync.py +0 -0
  24. {morphik-1.2.3 → morphik-1.2.4}/morphik/tests/test_update_document_metadata_rename.py +0 -0
@@ -58,6 +58,3 @@ multi_vector_embeddings_*.json
58
58
  # Rust build artifacts
59
59
  morphik_rust/target/
60
60
  morphik_rust/Cargo.lock
61
- DOCS_BY_FACILITIES/
62
- eval_freshworks/
63
- iep_test/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: morphik
3
- Version: 1.2.3
3
+ Version: 1.2.4
4
4
  Summary: Morphik Python Client
5
5
  Author-email: Morphik <founders@morphik.ai>
6
6
  Requires-Python: >=3.8
@@ -91,6 +91,12 @@ response = db.query(
91
91
  )
92
92
 
93
93
  print(response.completion)
94
+
95
+ # Migrate this app's documents into another Morphik deployment.
96
+ # Run this from a machine that can reach both source and target, such as
97
+ # inside a customer's VPN for on-prem targets.
98
+ result = db.migrate(target_uri="morphik://owner_id:token@onprem.example.com", target_is_local=True)
99
+ print(result.created_count, result.skipped_count, result.failed_count)
94
100
  ```
95
101
 
96
102
  ### Nested Folders & Folder Depth
@@ -106,11 +112,6 @@ renamed = moved.rename("specs-v2")
106
112
  # Scope queries to a path and include descendants with folder_depth=-1
107
113
  chunks = folder.retrieve_chunks(query="design notes", folder_depth=-1)
108
114
  docs = db.list_documents(folder_name="/projects/alpha", folder_depth=-1)
109
-
110
- # List only the fields you need. The server reads and returns just those columns, so
111
- # the full document text is never downloaded — fast for large corpora.
112
- for doc in db.list_documents(fields=["metadata"]).documents:
113
- print(doc.external_id, doc.metadata)
114
115
  ```
115
116
 
116
117
  `Folder.full_path` is exposed on folder objects, and `Document.folder_path` mirrors server responses for tracing scope.
@@ -78,6 +78,12 @@ response = db.query(
78
78
  )
79
79
 
80
80
  print(response.completion)
81
+
82
+ # Migrate this app's documents into another Morphik deployment.
83
+ # Run this from a machine that can reach both source and target, such as
84
+ # inside a customer's VPN for on-prem targets.
85
+ result = db.migrate(target_uri="morphik://owner_id:token@onprem.example.com", target_is_local=True)
86
+ print(result.created_count, result.skipped_count, result.failed_count)
81
87
  ```
82
88
 
83
89
  ### Nested Folders & Folder Depth
@@ -93,11 +99,6 @@ renamed = moved.rename("specs-v2")
93
99
  # Scope queries to a path and include descendants with folder_depth=-1
94
100
  chunks = folder.retrieve_chunks(query="design notes", folder_depth=-1)
95
101
  docs = db.list_documents(folder_name="/projects/alpha", folder_depth=-1)
96
-
97
- # List only the fields you need. The server reads and returns just those columns, so
98
- # the full document text is never downloaded — fast for large corpora.
99
- for doc in db.list_documents(fields=["metadata"]).documents:
100
- print(doc.external_id, doc.metadata)
101
102
  ```
102
103
 
103
104
  `Folder.full_path` is exposed on folder objects, and `Document.folder_path` mirrors server responses for tracing scope.
@@ -3,7 +3,7 @@ Morphik Python SDK for document ingestion and querying.
3
3
  """
4
4
 
5
5
  from .async_ import AsyncMorphik
6
- from .models import Document, DocumentQueryResponse, Summary
6
+ from .models import Document, DocumentQueryResponse, MigrationDocumentResult, MigrationResult, Summary
7
7
  from .sync import Morphik
8
8
 
9
9
  __all__ = [
@@ -12,6 +12,8 @@ __all__ = [
12
12
  "Document",
13
13
  "Summary",
14
14
  "DocumentQueryResponse",
15
+ "MigrationDocumentResult",
16
+ "MigrationResult",
15
17
  ]
16
18
 
17
- __version__ = "1.2.3"
19
+ __version__ = "1.2.4"
@@ -428,7 +428,6 @@ class _MorphikClientLogic:
428
428
  completed_only: bool,
429
429
  sort_by: Optional[str],
430
430
  sort_direction: str,
431
- fields: Optional[List[str]] = None,
432
431
  ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
433
432
  """Prepare request for list_docs endpoint"""
434
433
  params = {}
@@ -451,15 +450,6 @@ class _MorphikClientLogic:
451
450
  "sort_by": sort_by,
452
451
  "sort_direction": sort_direction,
453
452
  }
454
- if fields:
455
- # Always include the fields required to reconstruct a Document client-side, so
456
- # projected responses still parse into Document objects. When any metadata field
457
- # is requested, also pull metadata_types so typed values (datetime/date/decimal)
458
- # are reconstructed instead of returned as raw strings.
459
- projected = ["external_id", "content_type", *fields]
460
- if any(field.split(".", 1)[0] == "metadata" for field in fields):
461
- projected.append("metadata_types")
462
- data["fields"] = list(dict.fromkeys(projected))
463
453
  return params, data
464
454
 
465
455
  def _prepare_batch_get_documents_request(
@@ -277,7 +277,6 @@ class _ScopedOperationsMixin:
277
277
  completed_only: bool,
278
278
  sort_by: Optional[str],
279
279
  sort_direction: str,
280
- fields: Optional[List[str]] = None,
281
280
  ):
282
281
  params, data = self._logic._prepare_list_documents_request(
283
282
  skip,
@@ -292,7 +291,6 @@ class _ScopedOperationsMixin:
292
291
  completed_only,
293
292
  sort_by,
294
293
  sort_direction,
295
- fields,
296
294
  )
297
295
 
298
296
  return self._execute_scoped_operation(
@@ -2,7 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  import json
4
4
  from pathlib import Path
5
- from typing import Any, Dict, Iterable, List, Optional, Union
5
+ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
6
6
  from urllib.parse import quote
7
7
 
8
8
  from pydantic import BaseModel
@@ -10,6 +10,17 @@ from pydantic import BaseModel
10
10
  MAX_LIMIT = 500
11
11
  MIN_LOG_HOURS = 0.1
12
12
  MAX_LOG_HOURS = 168.0
13
+ MIGRATION_SOURCE_METADATA_KEY = "_morphik_migration"
14
+ MIGRATION_RESERVED_METADATA_FIELDS = {
15
+ "app_id",
16
+ "end_user_id",
17
+ "external_id",
18
+ "filename",
19
+ "folder_id",
20
+ "folder_name",
21
+ "folder_path",
22
+ "owner_id",
23
+ }
13
24
 
14
25
 
15
26
  def merge_folders(
@@ -197,3 +208,45 @@ def normalize_additional_folders(
197
208
  if additional_folders:
198
209
  return list(additional_folders) + folder_list
199
210
  return folder_list
211
+
212
+
213
+ def build_migration_metadata(
214
+ document: Any,
215
+ *,
216
+ include_source_metadata: bool = True,
217
+ ) -> Tuple[Dict[str, Any], Dict[str, str]]:
218
+ """Prepare document metadata for migration ingestion.
219
+
220
+ The target API owns fields such as external_id, folder_name, and app_id, so
221
+ those values must travel through dedicated migration parameters instead of
222
+ user metadata.
223
+ """
224
+ metadata = dict(getattr(document, "metadata", None) or {})
225
+ metadata_types = dict(getattr(document, "metadata_types", None) or {})
226
+
227
+ for field in MIGRATION_RESERVED_METADATA_FIELDS:
228
+ metadata.pop(field, None)
229
+ metadata_types.pop(field, None)
230
+
231
+ if include_source_metadata:
232
+ system_metadata = getattr(document, "system_metadata", None) or {}
233
+ source_info = {
234
+ "source_document_id": getattr(document, "external_id", None),
235
+ "source_app_id": getattr(document, "app_id", None),
236
+ "source_filename": getattr(document, "filename", None),
237
+ "source_created_at": system_metadata.get("created_at") if isinstance(system_metadata, dict) else None,
238
+ "source_updated_at": system_metadata.get("updated_at") if isinstance(system_metadata, dict) else None,
239
+ }
240
+ source_info = {key: value for key, value in source_info.items() if value is not None}
241
+
242
+ existing_source_info = metadata.get(MIGRATION_SOURCE_METADATA_KEY)
243
+ if isinstance(existing_source_info, dict):
244
+ existing_source_info = dict(existing_source_info)
245
+ for key, value in source_info.items():
246
+ existing_source_info.setdefault(key, value)
247
+ metadata[MIGRATION_SOURCE_METADATA_KEY] = existing_source_info
248
+ else:
249
+ metadata[MIGRATION_SOURCE_METADATA_KEY] = source_info
250
+ metadata_types.setdefault(MIGRATION_SOURCE_METADATA_KEY, "object")
251
+
252
+ return metadata, metadata_types
@@ -1,3 +1,4 @@
1
+ import inspect
1
2
  import json
2
3
  import logging
3
4
  import warnings
@@ -19,6 +20,7 @@ from ._shared import (
19
20
  build_folder_rename_path,
20
21
  build_list_apps_params,
21
22
  build_logs_params,
23
+ build_migration_metadata,
22
24
  build_rename_app_params,
23
25
  build_requeue_payload,
24
26
  build_rotate_app_params,
@@ -42,6 +44,8 @@ from .models import (
42
44
  IngestTextRequest,
43
45
  ListDocsResponse,
44
46
  LogResponse,
47
+ MigrationDocumentResult,
48
+ MigrationResult,
45
49
  QueryPromptOverrides,
46
50
  RequeueIngestionJob,
47
51
  RequeueIngestionResponse,
@@ -267,15 +271,8 @@ class _AsyncScopedClientOps:
267
271
  completed_only: bool = False,
268
272
  sort_by: Optional[str] = "updated_at",
269
273
  sort_direction: str = "desc",
270
- fields: Optional[List[str]] = None,
271
274
  ) -> ListDocsResponse:
272
- """List documents within this scope (async).
273
-
274
- Args:
275
- fields: Optional list of fields to return for each document (e.g. ["metadata"]).
276
- Only those fields are read and returned, so the full document text is never
277
- downloaded. external_id and content_type are always included.
278
- """
275
+ """List documents within this scope (async)."""
279
276
  effective_folder = self._merge_folders(additional_folders)
280
277
  return await self._client._scoped_list_documents(
281
278
  skip=skip,
@@ -290,7 +287,6 @@ class _AsyncScopedClientOps:
290
287
  completed_only=completed_only,
291
288
  sort_by=sort_by,
292
289
  sort_direction=sort_direction,
293
- fields=fields,
294
290
  )
295
291
 
296
292
  async def batch_get_documents(
@@ -1239,7 +1235,6 @@ class AsyncMorphik(_ScopedOperationsMixin):
1239
1235
  completed_only: bool = False,
1240
1236
  sort_by: Optional[str] = "updated_at",
1241
1237
  sort_direction: str = "desc",
1242
- fields: Optional[List[str]] = None,
1243
1238
  ) -> ListDocsResponse:
1244
1239
  """
1245
1240
  List accessible documents.
@@ -1256,9 +1251,6 @@ class AsyncMorphik(_ScopedOperationsMixin):
1256
1251
  completed_only: Only return completed documents
1257
1252
  sort_by: Field to sort by (created_at, updated_at, filename, external_id)
1258
1253
  sort_direction: Sort direction (asc, desc)
1259
- fields: Optional list of fields to return for each document (e.g. ["metadata"]).
1260
- Only those fields are read and returned, so the full document text is never
1261
- downloaded. external_id and content_type are always included.
1262
1254
  Returns:
1263
1255
  ListDocsResponse: Response with documents and metadata
1264
1256
 
@@ -1276,7 +1268,234 @@ class AsyncMorphik(_ScopedOperationsMixin):
1276
1268
  completed_only=completed_only,
1277
1269
  sort_by=sort_by,
1278
1270
  sort_direction=sort_direction,
1279
- fields=fields,
1271
+ )
1272
+
1273
+ async def migrate(
1274
+ self,
1275
+ target_uri: str,
1276
+ *,
1277
+ filters: Optional[Dict[str, Any]] = None,
1278
+ folder_name: Optional[Union[str, List[str]]] = None,
1279
+ folder_depth: Optional[int] = None,
1280
+ end_user_id: Optional[str] = None,
1281
+ skip: int = 0,
1282
+ limit: Optional[int] = None,
1283
+ batch_size: int = 100,
1284
+ completed_only: bool = True,
1285
+ use_colpali: bool = True,
1286
+ preserve_folders: bool = True,
1287
+ preserve_end_user_id: bool = True,
1288
+ preserve_summaries: bool = True,
1289
+ include_source_metadata: bool = True,
1290
+ on_conflict: Literal["skip", "fail"] = "skip",
1291
+ continue_on_error: bool = True,
1292
+ target_timeout: Optional[int] = None,
1293
+ target_is_local: bool = False,
1294
+ progress_callback: Optional[Callable[[MigrationDocumentResult], Any]] = None,
1295
+ ) -> MigrationResult:
1296
+ """Migrate documents from this client into another Morphik URI.
1297
+
1298
+ Set target_is_local=True for local/on-prem targets that should use HTTP
1299
+ or skip TLS verification.
1300
+ """
1301
+ if batch_size <= 0:
1302
+ raise ValueError("batch_size must be greater than 0")
1303
+ if limit is not None and limit < 0:
1304
+ raise ValueError("limit must be greater than or equal to 0")
1305
+
1306
+ target = AsyncMorphik(target_uri, timeout=target_timeout or self._logic._timeout, is_local=target_is_local)
1307
+ results: List[MigrationDocumentResult] = []
1308
+ total_source_count: Optional[int] = None
1309
+ current_skip = max(skip, 0)
1310
+ remaining = limit
1311
+ page_size = min(batch_size, 500)
1312
+
1313
+ try:
1314
+ while remaining is None or remaining > 0:
1315
+ current_limit = page_size if remaining is None else min(page_size, remaining)
1316
+ page = await self._scoped_list_documents(
1317
+ skip=current_skip,
1318
+ limit=current_limit,
1319
+ filters=filters,
1320
+ folder_name=folder_name,
1321
+ folder_depth=folder_depth,
1322
+ end_user_id=end_user_id,
1323
+ include_total_count=total_source_count is None,
1324
+ include_status_counts=False,
1325
+ include_folder_counts=False,
1326
+ completed_only=completed_only,
1327
+ sort_by="updated_at",
1328
+ sort_direction="desc",
1329
+ )
1330
+ if total_source_count is None:
1331
+ total_source_count = page.total_count
1332
+ if not page.documents:
1333
+ break
1334
+
1335
+ for source_document in page.documents:
1336
+ try:
1337
+ result = await self._migrate_single_document(
1338
+ target=target,
1339
+ source_document=source_document,
1340
+ use_colpali=use_colpali,
1341
+ preserve_folders=preserve_folders,
1342
+ preserve_end_user_id=preserve_end_user_id,
1343
+ preserve_summaries=preserve_summaries,
1344
+ include_source_metadata=include_source_metadata,
1345
+ on_conflict=on_conflict,
1346
+ )
1347
+ except Exception as exc: # noqa: BLE001
1348
+ result = MigrationDocumentResult(
1349
+ source_document_id=source_document.external_id,
1350
+ filename=source_document.filename,
1351
+ status="failed",
1352
+ error=str(exc),
1353
+ )
1354
+ if not continue_on_error:
1355
+ results.append(result)
1356
+ await self._emit_migration_progress(progress_callback, result)
1357
+ raise
1358
+
1359
+ results.append(result)
1360
+ await self._emit_migration_progress(progress_callback, result)
1361
+
1362
+ if remaining is not None:
1363
+ remaining -= len(page.documents)
1364
+ if not page.has_more:
1365
+ break
1366
+ next_skip = page.next_skip if page.next_skip is not None else current_skip + page.returned_count
1367
+ if next_skip <= current_skip:
1368
+ break
1369
+ current_skip = next_skip
1370
+ finally:
1371
+ await target.close()
1372
+
1373
+ return self._build_migration_result(results, total_source_count)
1374
+
1375
+ async def _migrate_single_document(
1376
+ self,
1377
+ *,
1378
+ target: "AsyncMorphik",
1379
+ source_document: Document,
1380
+ use_colpali: bool,
1381
+ preserve_folders: bool,
1382
+ preserve_end_user_id: bool,
1383
+ preserve_summaries: bool,
1384
+ include_source_metadata: bool,
1385
+ on_conflict: Literal["skip", "fail"],
1386
+ ) -> MigrationDocumentResult:
1387
+ metadata, metadata_types = build_migration_metadata(
1388
+ source_document,
1389
+ include_source_metadata=include_source_metadata,
1390
+ )
1391
+ file_bytes = await self.get_document_file(source_document.external_id)
1392
+ folder = (source_document.folder_path or source_document.folder_name) if preserve_folders else None
1393
+ end_user = source_document.end_user_id if preserve_end_user_id else None
1394
+
1395
+ status, target_document = await target._ingest_migrated_document(
1396
+ source_document_id=source_document.external_id,
1397
+ file_content=file_bytes,
1398
+ filename=source_document.filename or source_document.external_id,
1399
+ content_type=source_document.content_type,
1400
+ metadata=metadata,
1401
+ metadata_types=metadata_types,
1402
+ folder_name=folder,
1403
+ end_user_id=end_user,
1404
+ use_colpali=use_colpali,
1405
+ on_conflict=on_conflict,
1406
+ )
1407
+
1408
+ if preserve_summaries and status == "created":
1409
+ await self._copy_document_summary(source_document.external_id, target, target_document.external_id)
1410
+
1411
+ return MigrationDocumentResult(
1412
+ source_document_id=source_document.external_id,
1413
+ target_document_id=target_document.external_id,
1414
+ filename=source_document.filename,
1415
+ status=status,
1416
+ )
1417
+
1418
+ async def _ingest_migrated_document(
1419
+ self,
1420
+ *,
1421
+ source_document_id: str,
1422
+ file_content: bytes,
1423
+ filename: str,
1424
+ content_type: Optional[str],
1425
+ metadata: Dict[str, Any],
1426
+ metadata_types: Optional[Dict[str, str]],
1427
+ folder_name: Optional[str],
1428
+ end_user_id: Optional[str],
1429
+ use_colpali: bool,
1430
+ on_conflict: Literal["skip", "fail"],
1431
+ ) -> tuple[str, Document]:
1432
+ serialized_metadata, inferred_types = self._logic._serialize_metadata_map(metadata)
1433
+ metadata_type_payload = {**inferred_types, **(metadata_types or {})}
1434
+ metadata_type_payload = {
1435
+ key: value for key, value in metadata_type_payload.items() if key in serialized_metadata
1436
+ }
1437
+
1438
+ form_data: Dict[str, Any] = {
1439
+ "source_document_id": source_document_id,
1440
+ "metadata": json.dumps(serialized_metadata),
1441
+ "metadata_types": json.dumps(metadata_type_payload),
1442
+ "use_colpali": str(use_colpali).lower(),
1443
+ "on_conflict": on_conflict,
1444
+ }
1445
+ if folder_name:
1446
+ form_data["folder_name"] = folder_name
1447
+ if end_user_id:
1448
+ form_data["end_user_id"] = end_user_id
1449
+
1450
+ file_obj = BytesIO(file_content)
1451
+ files = {"file": (filename, file_obj, content_type or "application/octet-stream")}
1452
+ response = await self._request("POST", "migrate/document", data=form_data, files=files)
1453
+ document = self._logic._parse_document_response(response["document"])
1454
+ document._client = self
1455
+ return response.get("status", "created"), document
1456
+
1457
+ async def _copy_document_summary(
1458
+ self,
1459
+ source_document_id: str,
1460
+ target: "AsyncMorphik",
1461
+ target_document_id: str,
1462
+ ) -> None:
1463
+ try:
1464
+ summary = await self.get_document_summary(source_document_id)
1465
+ except httpx.HTTPStatusError as exc:
1466
+ if exc.response.status_code == 404:
1467
+ return
1468
+ raise
1469
+ await target.upsert_document_summary(
1470
+ document_id=target_document_id,
1471
+ content=summary.content,
1472
+ versioning=False,
1473
+ overwrite_latest=True,
1474
+ )
1475
+
1476
+ @staticmethod
1477
+ async def _emit_migration_progress(
1478
+ progress_callback: Optional[Callable[[MigrationDocumentResult], Any]],
1479
+ result: MigrationDocumentResult,
1480
+ ) -> None:
1481
+ if progress_callback is None:
1482
+ return
1483
+ callback_result = progress_callback(result)
1484
+ if inspect.isawaitable(callback_result):
1485
+ await callback_result
1486
+
1487
+ @staticmethod
1488
+ def _build_migration_result(
1489
+ results: List[MigrationDocumentResult],
1490
+ total_source_count: Optional[int],
1491
+ ) -> MigrationResult:
1492
+ return MigrationResult(
1493
+ documents=results,
1494
+ total_source_count=total_source_count,
1495
+ attempted_count=len(results),
1496
+ created_count=sum(1 for item in results if item.status == "created"),
1497
+ skipped_count=sum(1 for item in results if item.status == "skipped"),
1498
+ failed_count=sum(1 for item in results if item.status == "failed"),
1280
1499
  )
1281
1500
 
1282
1501
  async def get_document(self, document_id: str) -> Document:
@@ -415,6 +415,27 @@ class ListDocsResponse(BaseModel):
415
415
  folder_counts: Optional[List[FolderCount]] = Field(None, description="Document counts by folder")
416
416
 
417
417
 
418
+ class MigrationDocumentResult(BaseModel):
419
+ """Per-document result from a migration run."""
420
+
421
+ source_document_id: str = Field(..., description="Document ID in the source Morphik app")
422
+ target_document_id: Optional[str] = Field(None, description="Document ID in the target Morphik app")
423
+ filename: Optional[str] = Field(None, description="Migrated filename")
424
+ status: Literal["created", "skipped", "failed"] = Field(..., description="Migration outcome")
425
+ error: Optional[str] = Field(None, description="Error message when status is failed")
426
+
427
+
428
+ class MigrationResult(BaseModel):
429
+ """Summary returned by client.migrate."""
430
+
431
+ documents: List[MigrationDocumentResult] = Field(default_factory=list)
432
+ total_source_count: Optional[int] = Field(None, description="Total matching source documents, when available")
433
+ attempted_count: int = Field(..., description="Number of source documents attempted")
434
+ created_count: int = Field(..., description="Number of target documents created")
435
+ skipped_count: int = Field(..., description="Number of target documents skipped because they already existed")
436
+ failed_count: int = Field(..., description="Number of documents that failed migration")
437
+
438
+
418
439
  class IngestTextRequest(BaseModel):
419
440
  """Request model for ingesting text content"""
420
441
 
@@ -19,6 +19,7 @@ from ._shared import (
19
19
  build_folder_rename_path,
20
20
  build_list_apps_params,
21
21
  build_logs_params,
22
+ build_migration_metadata,
22
23
  build_rename_app_params,
23
24
  build_requeue_payload,
24
25
  build_rotate_app_params,
@@ -42,6 +43,8 @@ from .models import (
42
43
  IngestTextRequest,
43
44
  ListDocsResponse,
44
45
  LogResponse,
46
+ MigrationDocumentResult,
47
+ MigrationResult,
45
48
  QueryPromptOverrides,
46
49
  RequeueIngestionJob,
47
50
  RequeueIngestionResponse,
@@ -283,16 +286,9 @@ class _ScopedClientOps:
283
286
  completed_only: bool = False,
284
287
  sort_by: Optional[str] = "updated_at",
285
288
  sort_direction: str = "desc",
286
- fields: Optional[List[str]] = None,
287
289
  ) -> ListDocsResponse:
288
290
  """
289
291
  List documents within this scope.
290
-
291
- Args:
292
- fields: Optional list of fields to return for each document (e.g.
293
- ["metadata"]). Only those fields are read and returned, so the full
294
- document text is never downloaded. external_id and content_type are
295
- always included.
296
292
  """
297
293
  effective_folder = self._merge_folders(additional_folders)
298
294
  return self._client._scoped_list_documents(
@@ -308,7 +304,6 @@ class _ScopedClientOps:
308
304
  completed_only=completed_only,
309
305
  sort_by=sort_by,
310
306
  sort_direction=sort_direction,
311
- fields=fields,
312
307
  )
313
308
 
314
309
  def batch_get_documents(
@@ -1278,7 +1273,6 @@ class Morphik(_ScopedOperationsMixin):
1278
1273
  completed_only: bool = False,
1279
1274
  sort_by: Optional[str] = "updated_at",
1280
1275
  sort_direction: str = "desc",
1281
- fields: Optional[List[str]] = None,
1282
1276
  ) -> ListDocsResponse:
1283
1277
  """
1284
1278
  List accessible documents.
@@ -1295,9 +1289,6 @@ class Morphik(_ScopedOperationsMixin):
1295
1289
  completed_only: Only return completed documents
1296
1290
  sort_by: Field to sort by (created_at, updated_at, filename, external_id)
1297
1291
  sort_direction: Sort direction (asc, desc)
1298
- fields: Optional list of fields to return for each document (e.g. ["metadata"]).
1299
- Only those fields are read and returned, so the full document text is never
1300
- downloaded. external_id and content_type are always included.
1301
1292
  Returns:
1302
1293
  ListDocsResponse: Response with documents and metadata
1303
1294
 
@@ -1315,7 +1306,222 @@ class Morphik(_ScopedOperationsMixin):
1315
1306
  completed_only=completed_only,
1316
1307
  sort_by=sort_by,
1317
1308
  sort_direction=sort_direction,
1318
- fields=fields,
1309
+ )
1310
+
1311
+ def migrate(
1312
+ self,
1313
+ target_uri: str,
1314
+ *,
1315
+ filters: Optional[Dict[str, Any]] = None,
1316
+ folder_name: Optional[Union[str, List[str]]] = None,
1317
+ folder_depth: Optional[int] = None,
1318
+ end_user_id: Optional[str] = None,
1319
+ skip: int = 0,
1320
+ limit: Optional[int] = None,
1321
+ batch_size: int = 100,
1322
+ completed_only: bool = True,
1323
+ use_colpali: bool = True,
1324
+ preserve_folders: bool = True,
1325
+ preserve_end_user_id: bool = True,
1326
+ preserve_summaries: bool = True,
1327
+ include_source_metadata: bool = True,
1328
+ on_conflict: Literal["skip", "fail"] = "skip",
1329
+ continue_on_error: bool = True,
1330
+ target_timeout: Optional[int] = None,
1331
+ target_is_local: bool = False,
1332
+ progress_callback: Optional[Callable[[MigrationDocumentResult], None]] = None,
1333
+ ) -> MigrationResult:
1334
+ """Migrate documents from this client into another Morphik URI.
1335
+
1336
+ The caller should run this from a network location that can reach both
1337
+ the source URI and the target URI, for example from inside a customer's
1338
+ VPN when the target deployment is on-prem. Set target_is_local=True for
1339
+ local/on-prem targets that should use HTTP or skip TLS verification.
1340
+ """
1341
+ if batch_size <= 0:
1342
+ raise ValueError("batch_size must be greater than 0")
1343
+ if limit is not None and limit < 0:
1344
+ raise ValueError("limit must be greater than or equal to 0")
1345
+
1346
+ target = Morphik(target_uri, timeout=target_timeout or self._logic._timeout, is_local=target_is_local)
1347
+ results: List[MigrationDocumentResult] = []
1348
+ total_source_count: Optional[int] = None
1349
+ current_skip = max(skip, 0)
1350
+ remaining = limit
1351
+ page_size = min(batch_size, 500)
1352
+
1353
+ try:
1354
+ while remaining is None or remaining > 0:
1355
+ current_limit = page_size if remaining is None else min(page_size, remaining)
1356
+ page = self._scoped_list_documents(
1357
+ skip=current_skip,
1358
+ limit=current_limit,
1359
+ filters=filters,
1360
+ folder_name=folder_name,
1361
+ folder_depth=folder_depth,
1362
+ end_user_id=end_user_id,
1363
+ include_total_count=total_source_count is None,
1364
+ include_status_counts=False,
1365
+ include_folder_counts=False,
1366
+ completed_only=completed_only,
1367
+ sort_by="updated_at",
1368
+ sort_direction="desc",
1369
+ )
1370
+ if total_source_count is None:
1371
+ total_source_count = page.total_count
1372
+ if not page.documents:
1373
+ break
1374
+
1375
+ for source_document in page.documents:
1376
+ try:
1377
+ result = self._migrate_single_document(
1378
+ target=target,
1379
+ source_document=source_document,
1380
+ use_colpali=use_colpali,
1381
+ preserve_folders=preserve_folders,
1382
+ preserve_end_user_id=preserve_end_user_id,
1383
+ preserve_summaries=preserve_summaries,
1384
+ include_source_metadata=include_source_metadata,
1385
+ on_conflict=on_conflict,
1386
+ )
1387
+ except Exception as exc: # noqa: BLE001
1388
+ result = MigrationDocumentResult(
1389
+ source_document_id=source_document.external_id,
1390
+ filename=source_document.filename,
1391
+ status="failed",
1392
+ error=str(exc),
1393
+ )
1394
+ if not continue_on_error:
1395
+ results.append(result)
1396
+ if progress_callback:
1397
+ progress_callback(result)
1398
+ raise
1399
+
1400
+ results.append(result)
1401
+ if progress_callback:
1402
+ progress_callback(result)
1403
+
1404
+ if remaining is not None:
1405
+ remaining -= len(page.documents)
1406
+ if not page.has_more:
1407
+ break
1408
+ next_skip = page.next_skip if page.next_skip is not None else current_skip + page.returned_count
1409
+ if next_skip <= current_skip:
1410
+ break
1411
+ current_skip = next_skip
1412
+ finally:
1413
+ target.close()
1414
+
1415
+ return self._build_migration_result(results, total_source_count)
1416
+
1417
+ def _migrate_single_document(
1418
+ self,
1419
+ *,
1420
+ target: "Morphik",
1421
+ source_document: Document,
1422
+ use_colpali: bool,
1423
+ preserve_folders: bool,
1424
+ preserve_end_user_id: bool,
1425
+ preserve_summaries: bool,
1426
+ include_source_metadata: bool,
1427
+ on_conflict: Literal["skip", "fail"],
1428
+ ) -> MigrationDocumentResult:
1429
+ metadata, metadata_types = build_migration_metadata(
1430
+ source_document,
1431
+ include_source_metadata=include_source_metadata,
1432
+ )
1433
+ file_bytes = self.get_document_file(source_document.external_id)
1434
+ folder = (source_document.folder_path or source_document.folder_name) if preserve_folders else None
1435
+ end_user = source_document.end_user_id if preserve_end_user_id else None
1436
+
1437
+ status, target_document = target._ingest_migrated_document(
1438
+ source_document_id=source_document.external_id,
1439
+ file_content=file_bytes,
1440
+ filename=source_document.filename or source_document.external_id,
1441
+ content_type=source_document.content_type,
1442
+ metadata=metadata,
1443
+ metadata_types=metadata_types,
1444
+ folder_name=folder,
1445
+ end_user_id=end_user,
1446
+ use_colpali=use_colpali,
1447
+ on_conflict=on_conflict,
1448
+ )
1449
+
1450
+ if preserve_summaries and status == "created":
1451
+ self._copy_document_summary(source_document.external_id, target, target_document.external_id)
1452
+
1453
+ return MigrationDocumentResult(
1454
+ source_document_id=source_document.external_id,
1455
+ target_document_id=target_document.external_id,
1456
+ filename=source_document.filename,
1457
+ status=status,
1458
+ )
1459
+
1460
+ def _ingest_migrated_document(
1461
+ self,
1462
+ *,
1463
+ source_document_id: str,
1464
+ file_content: bytes,
1465
+ filename: str,
1466
+ content_type: Optional[str],
1467
+ metadata: Dict[str, Any],
1468
+ metadata_types: Optional[Dict[str, str]],
1469
+ folder_name: Optional[str],
1470
+ end_user_id: Optional[str],
1471
+ use_colpali: bool,
1472
+ on_conflict: Literal["skip", "fail"],
1473
+ ) -> tuple[str, Document]:
1474
+ serialized_metadata, inferred_types = self._logic._serialize_metadata_map(metadata)
1475
+ metadata_type_payload = {**inferred_types, **(metadata_types or {})}
1476
+ metadata_type_payload = {
1477
+ key: value for key, value in metadata_type_payload.items() if key in serialized_metadata
1478
+ }
1479
+
1480
+ form_data: Dict[str, Any] = {
1481
+ "source_document_id": source_document_id,
1482
+ "metadata": json.dumps(serialized_metadata),
1483
+ "metadata_types": json.dumps(metadata_type_payload),
1484
+ "use_colpali": str(use_colpali).lower(),
1485
+ "on_conflict": on_conflict,
1486
+ }
1487
+ if folder_name:
1488
+ form_data["folder_name"] = folder_name
1489
+ if end_user_id:
1490
+ form_data["end_user_id"] = end_user_id
1491
+
1492
+ file_obj = BytesIO(file_content)
1493
+ files = {"file": (filename, file_obj, content_type or "application/octet-stream")}
1494
+ response = self._request("POST", "migrate/document", data=form_data, files=files)
1495
+ document = self._logic._parse_document_response(response["document"])
1496
+ document._client = self
1497
+ return response.get("status", "created"), document
1498
+
1499
+ def _copy_document_summary(self, source_document_id: str, target: "Morphik", target_document_id: str) -> None:
1500
+ try:
1501
+ summary = self.get_document_summary(source_document_id)
1502
+ except httpx.HTTPStatusError as exc:
1503
+ if exc.response.status_code == 404:
1504
+ return
1505
+ raise
1506
+ target.upsert_document_summary(
1507
+ document_id=target_document_id,
1508
+ content=summary.content,
1509
+ versioning=False,
1510
+ overwrite_latest=True,
1511
+ )
1512
+
1513
+ @staticmethod
1514
+ def _build_migration_result(
1515
+ results: List[MigrationDocumentResult],
1516
+ total_source_count: Optional[int],
1517
+ ) -> MigrationResult:
1518
+ return MigrationResult(
1519
+ documents=results,
1520
+ total_source_count=total_source_count,
1521
+ attempted_count=len(results),
1522
+ created_count=sum(1 for item in results if item.status == "created"),
1523
+ skipped_count=sum(1 for item in results if item.status == "skipped"),
1524
+ failed_count=sum(1 for item in results if item.status == "failed"),
1319
1525
  )
1320
1526
 
1321
1527
  def get_document(self, document_id: str) -> Document:
@@ -1,3 +1,5 @@
1
+ import json
2
+
1
3
  import httpx
2
4
  import jwt
3
5
  import pytest
@@ -117,39 +119,6 @@ def test_sync_list_documents_payloads_across_scopes():
117
119
  client.close()
118
120
 
119
121
 
120
- def test_sync_list_documents_fields_projection():
121
- client, calls = _make_sync_client()
122
- try:
123
- # external_id + content_type are always added so the response parses into a Document;
124
- # metadata_types is added so typed metadata values are reconstructed, not left as strings.
125
- client.list_documents(fields=["metadata"])
126
- assert calls.pop()["data"]["fields"] == ["external_id", "content_type", "metadata", "metadata_types"]
127
-
128
- # Already-included required fields are not duplicated; order is preserved.
129
- client.list_documents(fields=["external_id", "filename", "metadata"])
130
- assert calls.pop()["data"]["fields"] == [
131
- "external_id",
132
- "content_type",
133
- "filename",
134
- "metadata",
135
- "metadata_types",
136
- ]
137
-
138
- # Nested metadata paths also trigger metadata_types.
139
- client.list_documents(fields=["metadata.client"])
140
- assert calls.pop()["data"]["fields"] == ["external_id", "content_type", "metadata.client", "metadata_types"]
141
-
142
- # Non-metadata projection does not pull metadata_types.
143
- client.list_documents(fields=["filename"])
144
- assert calls.pop()["data"]["fields"] == ["external_id", "content_type", "filename"]
145
-
146
- # No fields -> no projection requested (full documents).
147
- client.list_documents()
148
- assert "fields" not in calls.pop()["data"]
149
- finally:
150
- client.close()
151
-
152
-
153
122
  def test_async_client_http2_toggle(monkeypatch):
154
123
  captured = []
155
124
 
@@ -338,6 +307,59 @@ def test_sync_get_document_by_filename_scoped_params_and_encoding():
338
307
  client.close()
339
308
 
340
309
 
310
def test_sync_ingest_migrated_document_posts_migration_payload():
    """Check the request built by ``_ingest_migrated_document`` and its return value.

    ``client._request`` is replaced by a recorder that stores each call's
    arguments and returns a canned ``{"status", "document"}`` response. The
    test then inspects the recorded form fields, the JSON-encoded metadata
    payloads, the uploaded file tuple, and the ``(status, document)`` result.
    """
    expected_metadata = {
        "category": "finance",
        "_morphik_migration": {"source_document_id": "source-doc-1"},
    }
    expected_metadata_types = {
        "category": "string",
        "_morphik_migration": "object",
    }

    client = Morphik()
    recorded = []

    def fake_request(method, endpoint, data=None, files=None, params=None):
        # Capture every argument so the assertions below can inspect the call.
        recorded.append({"method": method, "endpoint": endpoint, "data": data, "files": files, "params": params})
        canned_document = {
            "external_id": "source-doc-1",
            "content_type": "application/pdf",
            "filename": "report.pdf",
        }
        return {"status": "created", "document": canned_document}

    client._request = fake_request  # type: ignore[attr-defined]
    try:
        status, doc = client._ingest_migrated_document(
            source_document_id="source-doc-1",
            file_content=b"pdf-bytes",
            filename="report.pdf",
            content_type="application/pdf",
            metadata={"category": "finance", "_morphik_migration": {"source_document_id": "source-doc-1"}},
            metadata_types={"category": "string", "_morphik_migration": "object"},
            folder_name="/finance/reports",
            end_user_id="customer-1",
            use_colpali=True,
            on_conflict="skip",
        )

        sent = recorded.pop()
        assert sent["method"] == "POST"
        assert sent["endpoint"] == "migrate/document"

        # Plain form fields.
        assert sent["data"]["source_document_id"] == "source-doc-1"
        assert sent["data"]["folder_name"] == "/finance/reports"
        assert sent["data"]["end_user_id"] == "customer-1"
        assert sent["data"]["use_colpali"] == "true"
        assert sent["data"]["on_conflict"] == "skip"

        # Metadata and its type map travel as JSON strings inside the form.
        assert json.loads(sent["data"]["metadata"]) == expected_metadata
        assert json.loads(sent["data"]["metadata_types"]) == expected_metadata_types

        # The upload tuple is (filename, file object, content type).
        assert sent["files"]["file"][0] == "report.pdf"
        assert sent["files"]["file"][2] == "application/pdf"

        assert status == "created"
        assert doc.external_id == "source-doc-1"
    finally:
        client.close()
361
+
362
+
341
363
  def test_sync_folder_get_document_by_filename_scoped():
342
364
  client, calls = _make_sync_client()
343
365
  try:
@@ -7,6 +7,7 @@ from morphik._shared import (
7
7
  build_document_by_filename_params,
8
8
  build_list_apps_params,
9
9
  build_logs_params,
10
+ build_migration_metadata,
10
11
  build_rename_app_params,
11
12
  build_requeue_payload,
12
13
  build_rotate_app_params,
@@ -14,7 +15,7 @@ from morphik._shared import (
14
15
  merge_folders,
15
16
  normalize_additional_folders,
16
17
  )
17
- from morphik.models import RequeueIngestionJob
18
+ from morphik.models import Document, RequeueIngestionJob
18
19
 
19
20
 
20
21
  def test_merge_folders_variants():
@@ -119,3 +120,37 @@ def test_normalize_additional_folders_alias():
119
120
  assert normalize_additional_folders(None, "b") == ["b"]
120
121
  assert normalize_additional_folders(["a"], "b") == ["a", "b"]
121
122
  assert normalize_additional_folders(["a"], ["b", "c"]) == ["a", "b", "c"]
123
+
124
+
125
def test_build_migration_metadata_strips_managed_fields_and_records_source():
    """Check which keys ``build_migration_metadata`` keeps, drops and adds.

    The source document's metadata carries ``external_id``, ``folder_name``
    and ``end_user_id`` next to a user key (``category``). The test asserts
    that the three managed keys are absent from the result, that ``category``
    survives, and that a ``_morphik_migration`` entry records the source
    document ID and app ID. The returned type map must contain exactly
    ``category`` and ``_morphik_migration``.
    """
    source_metadata = {
        "external_id": "source-doc-1",
        "folder_name": "/finance/reports",
        "end_user_id": "customer-1",
        "category": "finance",
    }
    source_metadata_types = {
        "external_id": "string",
        "folder_name": "string",
        "end_user_id": "string",
        "category": "string",
    }
    source_doc = Document(
        external_id="source-doc-1",
        content_type="application/pdf",
        filename="report.pdf",
        app_id="source-app",
        folder_path="/finance/reports",
        end_user_id="customer-1",
        metadata=source_metadata,
        metadata_types=source_metadata_types,
        system_metadata={"created_at": "2026-01-01T00:00:00Z", "updated_at": "2026-01-02T00:00:00Z"},
    )

    metadata, metadata_types = build_migration_metadata(source_doc)

    # The user-supplied key is kept.
    assert metadata["category"] == "finance"

    # Managed keys must not leak into the migrated metadata.
    for managed_key in ("external_id", "folder_name", "end_user_id"):
        assert managed_key not in metadata

    # Provenance of the migrated document is recorded under one reserved key.
    migration_info = metadata["_morphik_migration"]
    assert migration_info["source_document_id"] == "source-doc-1"
    assert migration_info["source_app_id"] == "source-app"

    assert metadata_types == {"category": "string", "_morphik_migration": "object"}
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "morphik"
7
- version = "1.2.3"
7
+ version = "1.2.4"
8
8
  authors = [
9
9
  { name = "Morphik", email = "founders@morphik.ai" },
10
10
  ]
File without changes
File without changes