h2ogpte 1.6.41rc5__py3-none-any.whl → 1.6.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. h2ogpte/__init__.py +1 -1
  2. h2ogpte/cli/__init__.py +0 -0
  3. h2ogpte/cli/commands/__init__.py +0 -0
  4. h2ogpte/cli/commands/command_handlers/__init__.py +0 -0
  5. h2ogpte/cli/commands/command_handlers/agent.py +41 -0
  6. h2ogpte/cli/commands/command_handlers/chat.py +37 -0
  7. h2ogpte/cli/commands/command_handlers/clear.py +8 -0
  8. h2ogpte/cli/commands/command_handlers/collection.py +67 -0
  9. h2ogpte/cli/commands/command_handlers/config.py +113 -0
  10. h2ogpte/cli/commands/command_handlers/disconnect.py +36 -0
  11. h2ogpte/cli/commands/command_handlers/exit.py +37 -0
  12. h2ogpte/cli/commands/command_handlers/help.py +8 -0
  13. h2ogpte/cli/commands/command_handlers/history.py +29 -0
  14. h2ogpte/cli/commands/command_handlers/rag.py +146 -0
  15. h2ogpte/cli/commands/command_handlers/research_agent.py +45 -0
  16. h2ogpte/cli/commands/command_handlers/session.py +77 -0
  17. h2ogpte/cli/commands/command_handlers/status.py +33 -0
  18. h2ogpte/cli/commands/dispatcher.py +79 -0
  19. h2ogpte/cli/core/__init__.py +0 -0
  20. h2ogpte/cli/core/app.py +105 -0
  21. h2ogpte/cli/core/config.py +199 -0
  22. h2ogpte/cli/core/encryption.py +104 -0
  23. h2ogpte/cli/core/session.py +171 -0
  24. h2ogpte/cli/integrations/__init__.py +0 -0
  25. h2ogpte/cli/integrations/agent.py +338 -0
  26. h2ogpte/cli/integrations/rag.py +442 -0
  27. h2ogpte/cli/main.py +90 -0
  28. h2ogpte/cli/ui/__init__.py +0 -0
  29. h2ogpte/cli/ui/hbot_prompt.py +435 -0
  30. h2ogpte/cli/ui/prompts.py +129 -0
  31. h2ogpte/cli/ui/status_bar.py +133 -0
  32. h2ogpte/cli/utils/__init__.py +0 -0
  33. h2ogpte/cli/utils/file_manager.py +411 -0
  34. h2ogpte/connectors.py +11 -0
  35. h2ogpte/h2ogpte.py +619 -69
  36. h2ogpte/h2ogpte_async.py +631 -70
  37. h2ogpte/h2ogpte_sync_base.py +8 -1
  38. h2ogpte/rest_async/__init__.py +8 -3
  39. h2ogpte/rest_async/api/chat_api.py +29 -0
  40. h2ogpte/rest_async/api/collections_api.py +293 -0
  41. h2ogpte/rest_async/api/document_ingestion_api.py +1365 -436
  42. h2ogpte/rest_async/api/extractors_api.py +2874 -70
  43. h2ogpte/rest_async/api/prompt_templates_api.py +32 -32
  44. h2ogpte/rest_async/api_client.py +1 -1
  45. h2ogpte/rest_async/configuration.py +1 -1
  46. h2ogpte/rest_async/models/__init__.py +7 -2
  47. h2ogpte/rest_async/models/chat_completion.py +4 -2
  48. h2ogpte/rest_async/models/chat_completion_delta.py +5 -3
  49. h2ogpte/rest_async/models/chat_completion_request.py +1 -1
  50. h2ogpte/rest_async/models/chat_session.py +4 -2
  51. h2ogpte/rest_async/models/chat_settings.py +1 -1
  52. h2ogpte/rest_async/models/collection.py +4 -2
  53. h2ogpte/rest_async/models/collection_create_request.py +4 -2
  54. h2ogpte/rest_async/models/confluence_credentials.py +89 -0
  55. h2ogpte/rest_async/models/create_chat_session_request.py +87 -0
  56. h2ogpte/rest_async/models/extraction_request.py +1 -1
  57. h2ogpte/rest_async/models/extractor.py +4 -2
  58. h2ogpte/rest_async/models/guardrails_settings.py +8 -4
  59. h2ogpte/rest_async/models/guardrails_settings_create_request.py +1 -1
  60. h2ogpte/rest_async/models/ingest_from_confluence_body.py +97 -0
  61. h2ogpte/rest_async/models/process_document_job_request.py +1 -1
  62. h2ogpte/rest_async/models/question_request.py +1 -1
  63. h2ogpte/rest_async/models/{reset_and_share_prompt_template_request.py → reset_and_share_request.py} +6 -6
  64. h2ogpte/{rest_sync/models/reset_and_share_prompt_template_with_groups_request.py → rest_async/models/reset_and_share_with_groups_request.py} +6 -6
  65. h2ogpte/rest_async/models/summarize_request.py +1 -1
  66. h2ogpte/rest_async/models/update_collection_privacy_request.py +6 -4
  67. h2ogpte/rest_async/models/update_collection_workspace_request.py +87 -0
  68. h2ogpte/rest_async/models/update_extractor_privacy_request.py +87 -0
  69. h2ogpte/rest_sync/__init__.py +8 -3
  70. h2ogpte/rest_sync/api/chat_api.py +29 -0
  71. h2ogpte/rest_sync/api/collections_api.py +293 -0
  72. h2ogpte/rest_sync/api/document_ingestion_api.py +1365 -436
  73. h2ogpte/rest_sync/api/extractors_api.py +2874 -70
  74. h2ogpte/rest_sync/api/prompt_templates_api.py +32 -32
  75. h2ogpte/rest_sync/api_client.py +1 -1
  76. h2ogpte/rest_sync/configuration.py +1 -1
  77. h2ogpte/rest_sync/models/__init__.py +7 -2
  78. h2ogpte/rest_sync/models/chat_completion.py +4 -2
  79. h2ogpte/rest_sync/models/chat_completion_delta.py +5 -3
  80. h2ogpte/rest_sync/models/chat_completion_request.py +1 -1
  81. h2ogpte/rest_sync/models/chat_session.py +4 -2
  82. h2ogpte/rest_sync/models/chat_settings.py +1 -1
  83. h2ogpte/rest_sync/models/collection.py +4 -2
  84. h2ogpte/rest_sync/models/collection_create_request.py +4 -2
  85. h2ogpte/rest_sync/models/confluence_credentials.py +89 -0
  86. h2ogpte/rest_sync/models/create_chat_session_request.py +87 -0
  87. h2ogpte/rest_sync/models/extraction_request.py +1 -1
  88. h2ogpte/rest_sync/models/extractor.py +4 -2
  89. h2ogpte/rest_sync/models/guardrails_settings.py +8 -4
  90. h2ogpte/rest_sync/models/guardrails_settings_create_request.py +1 -1
  91. h2ogpte/rest_sync/models/ingest_from_confluence_body.py +97 -0
  92. h2ogpte/rest_sync/models/process_document_job_request.py +1 -1
  93. h2ogpte/rest_sync/models/question_request.py +1 -1
  94. h2ogpte/rest_sync/models/{reset_and_share_prompt_template_request.py → reset_and_share_request.py} +6 -6
  95. h2ogpte/{rest_async/models/reset_and_share_prompt_template_with_groups_request.py → rest_sync/models/reset_and_share_with_groups_request.py} +6 -6
  96. h2ogpte/rest_sync/models/summarize_request.py +1 -1
  97. h2ogpte/rest_sync/models/update_collection_privacy_request.py +6 -4
  98. h2ogpte/rest_sync/models/update_collection_workspace_request.py +87 -0
  99. h2ogpte/rest_sync/models/update_extractor_privacy_request.py +87 -0
  100. h2ogpte/session.py +14 -2
  101. h2ogpte/session_async.py +33 -6
  102. h2ogpte/types.py +9 -1
  103. {h2ogpte-1.6.41rc5.dist-info → h2ogpte-1.6.43.dist-info}/METADATA +5 -1
  104. {h2ogpte-1.6.41rc5.dist-info → h2ogpte-1.6.43.dist-info}/RECORD +107 -64
  105. h2ogpte-1.6.43.dist-info/entry_points.txt +2 -0
  106. {h2ogpte-1.6.41rc5.dist-info → h2ogpte-1.6.43.dist-info}/WHEEL +0 -0
  107. {h2ogpte-1.6.41rc5.dist-info → h2ogpte-1.6.43.dist-info}/top_level.txt +0 -0
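The headline API change in this release is a new Confluence ingestion endpoint: DocumentIngestionApi gains create_ingest_from_confluence_job, backed by the new IngestFromConfluenceBody and ConfluenceCredentials models and posting to /ingest/confluence/job (sub-pages of an ingested page are ingested as well), with the same additions presumably mirrored in rest_sync. The release also adds an interactive CLI package (h2ogpte/cli/) and a new console entry point. Below is a minimal sketch of calling the new job through the generated async client; the server URL, API key, and the IngestFromConfluenceBody field names are placeholders and assumptions, since the model file is added in this diff but its schema is not shown.

import asyncio

from h2ogpte.rest_async import ApiClient, Configuration
from h2ogpte.rest_async.api.document_ingestion_api import DocumentIngestionApi
from h2ogpte.rest_async.models.ingest_from_confluence_body import IngestFromConfluenceBody


async def main() -> None:
    # Placeholder server address and API key -- both are assumptions.
    config = Configuration(host="https://h2ogpte.example.com/api/v1")
    config.access_token = "sk-XXXX"  # bearerAuth, per _auth_settings in the diff

    async with ApiClient(config) as client:
        api = DocumentIngestionApi(client)

        # IngestFromConfluenceBody's fields are not visible in this diff;
        # the key below is illustrative only.
        body = IngestFromConfluenceBody.from_dict(
            {"url": "https://example.atlassian.net/wiki/spaces/DOCS/pages/12345"}
        )

        # Signature as added in document_ingestion_api.py: the body is the request
        # payload, the remaining arguments are sent as query parameters.
        job = await api.create_ingest_from_confluence_job(
            collection_id="<collection-id>",
            ingest_from_confluence_body=body,
            gen_doc_summaries=False,
            gen_doc_questions=False,
        )
        print(job)


asyncio.run(main())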
@@ -20,6 +20,7 @@ from pydantic import Field, StrictBool, StrictBytes, StrictFloat, StrictInt, Str
  from typing import List, Optional, Tuple, Union
  from typing_extensions import Annotated
  from h2ogpte.rest_async.models.ingest_from_azure_blob_storage_body import IngestFromAzureBlobStorageBody
+ from h2ogpte.rest_async.models.ingest_from_confluence_body import IngestFromConfluenceBody
  from h2ogpte.rest_async.models.ingest_from_file_system_body import IngestFromFileSystemBody
  from h2ogpte.rest_async.models.ingest_from_gcs_body import IngestFromGcsBody
  from h2ogpte.rest_async.models.ingest_from_s3_body import IngestFromS3Body
@@ -982,10 +983,10 @@ class DocumentIngestionApi:


  @validate_call
- async def create_ingest_from_file_system_job(
+ async def create_ingest_from_confluence_job(
  self,
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
- ingest_from_file_system_body: IngestFromFileSystemBody,
+ ingest_from_confluence_body: IngestFromConfluenceBody,
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
  audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
@@ -1009,14 +1010,14 @@ class DocumentIngestionApi:
  _headers: Optional[Dict[StrictStr, Any]] = None,
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
  ) -> JobDetails:
- """Creates a job to add files from the local system into a collection.
+ """Creates a job to ingest confluence pages into collection.

- Creates a job to add files from the local system into a collection.
+ Creates a job to confluence pages into collection. If an ingested page has sub-pages, the subpages are also ingested.

  :param collection_id: String id of the collection to add the ingested documents into. (required)
  :type collection_id: str
- :param ingest_from_file_system_body: (required)
- :type ingest_from_file_system_body: IngestFromFileSystemBody
+ :param ingest_from_confluence_body: (required)
+ :type ingest_from_confluence_body: IngestFromConfluenceBody
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
  :type gen_doc_summaries: bool
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
@@ -1059,9 +1060,9 @@ class DocumentIngestionApi:
  :return: Returns the result object.
  """ # noqa: E501

- _param = self._create_ingest_from_file_system_job_serialize(
+ _param = self._create_ingest_from_confluence_job_serialize(
  collection_id=collection_id,
- ingest_from_file_system_body=ingest_from_file_system_body,
+ ingest_from_confluence_body=ingest_from_confluence_body,
  gen_doc_summaries=gen_doc_summaries,
  gen_doc_questions=gen_doc_questions,
  audio_input_language=audio_input_language,
@@ -1094,10 +1095,10 @@ class DocumentIngestionApi:


  @validate_call
- async def create_ingest_from_file_system_job_with_http_info(
+ async def create_ingest_from_confluence_job_with_http_info(
  self,
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
- ingest_from_file_system_body: IngestFromFileSystemBody,
+ ingest_from_confluence_body: IngestFromConfluenceBody,
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
  audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
@@ -1121,14 +1122,14 @@ class DocumentIngestionApi:
  _headers: Optional[Dict[StrictStr, Any]] = None,
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
  ) -> ApiResponse[JobDetails]:
- """Creates a job to add files from the local system into a collection.
+ """Creates a job to ingest confluence pages into collection.

- Creates a job to add files from the local system into a collection.
+ Creates a job to confluence pages into collection. If an ingested page has sub-pages, the subpages are also ingested.

  :param collection_id: String id of the collection to add the ingested documents into. (required)
  :type collection_id: str
- :param ingest_from_file_system_body: (required)
- :type ingest_from_file_system_body: IngestFromFileSystemBody
+ :param ingest_from_confluence_body: (required)
+ :type ingest_from_confluence_body: IngestFromConfluenceBody
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
  :type gen_doc_summaries: bool
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
@@ -1171,9 +1172,9 @@ class DocumentIngestionApi:
  :return: Returns the result object.
  """ # noqa: E501

- _param = self._create_ingest_from_file_system_job_serialize(
+ _param = self._create_ingest_from_confluence_job_serialize(
  collection_id=collection_id,
- ingest_from_file_system_body=ingest_from_file_system_body,
+ ingest_from_confluence_body=ingest_from_confluence_body,
  gen_doc_summaries=gen_doc_summaries,
  gen_doc_questions=gen_doc_questions,
  audio_input_language=audio_input_language,
@@ -1206,10 +1207,10 @@ class DocumentIngestionApi:
1206
1207
 
1207
1208
 
1208
1209
  @validate_call
1209
- async def create_ingest_from_file_system_job_without_preload_content(
1210
+ async def create_ingest_from_confluence_job_without_preload_content(
1210
1211
  self,
1211
1212
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
1212
- ingest_from_file_system_body: IngestFromFileSystemBody,
1213
+ ingest_from_confluence_body: IngestFromConfluenceBody,
1213
1214
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
1214
1215
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
1215
1216
  audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
@@ -1233,14 +1234,14 @@ class DocumentIngestionApi:
1233
1234
  _headers: Optional[Dict[StrictStr, Any]] = None,
1234
1235
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
1235
1236
  ) -> RESTResponseType:
1236
- """Creates a job to add files from the local system into a collection.
1237
+ """Creates a job to ingest confluence pages into collection.
1237
1238
 
1238
- Creates a job to add files from the local system into a collection.
1239
+ Creates a job to confluence pages into collection. If an ingested page has sub-pages, the subpages are also ingested.
1239
1240
 
1240
1241
  :param collection_id: String id of the collection to add the ingested documents into. (required)
1241
1242
  :type collection_id: str
1242
- :param ingest_from_file_system_body: (required)
1243
- :type ingest_from_file_system_body: IngestFromFileSystemBody
1243
+ :param ingest_from_confluence_body: (required)
1244
+ :type ingest_from_confluence_body: IngestFromConfluenceBody
1244
1245
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
1245
1246
  :type gen_doc_summaries: bool
1246
1247
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
@@ -1283,9 +1284,9 @@ class DocumentIngestionApi:
1283
1284
  :return: Returns the result object.
1284
1285
  """ # noqa: E501
1285
1286
 
1286
- _param = self._create_ingest_from_file_system_job_serialize(
1287
+ _param = self._create_ingest_from_confluence_job_serialize(
1287
1288
  collection_id=collection_id,
1288
- ingest_from_file_system_body=ingest_from_file_system_body,
1289
+ ingest_from_confluence_body=ingest_from_confluence_body,
1289
1290
  gen_doc_summaries=gen_doc_summaries,
1290
1291
  gen_doc_questions=gen_doc_questions,
1291
1292
  audio_input_language=audio_input_language,
@@ -1313,10 +1314,10 @@ class DocumentIngestionApi:
  return response_data.response


- def _create_ingest_from_file_system_job_serialize(
+ def _create_ingest_from_confluence_job_serialize(
  self,
  collection_id,
- ingest_from_file_system_body,
+ ingest_from_confluence_body,
  gen_doc_summaries,
  gen_doc_questions,
  audio_input_language,
@@ -1396,8 +1397,8 @@ class DocumentIngestionApi:
  # process the header parameters
  # process the form parameters
  # process the body parameter
- if ingest_from_file_system_body is not None:
- _body_params = ingest_from_file_system_body
+ if ingest_from_confluence_body is not None:
+ _body_params = ingest_from_confluence_body


  # set the HTTP header `Accept`
@@ -1429,7 +1430,7 @@ class DocumentIngestionApi:

  return self.api_client.param_serialize(
  method='POST',
- resource_path='/ingest/file_system/job',
+ resource_path='/ingest/confluence/job',
  path_params=_path_params,
  query_params=_query_params,
  header_params=_header_params,
@@ -1446,10 +1447,10 @@ class DocumentIngestionApi:
1446
1447
 
1447
1448
 
1448
1449
  @validate_call
1449
- async def create_ingest_from_gcs_job(
1450
+ async def create_ingest_from_file_system_job(
1450
1451
  self,
1451
1452
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
1452
- ingest_from_gcs_body: IngestFromGcsBody,
1453
+ ingest_from_file_system_body: IngestFromFileSystemBody,
1453
1454
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
1454
1455
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
1455
1456
  audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
@@ -1473,14 +1474,14 @@ class DocumentIngestionApi:
1473
1474
  _headers: Optional[Dict[StrictStr, Any]] = None,
1474
1475
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
1475
1476
  ) -> JobDetails:
1476
- """Creates a job to add files from the Google Cloud Storage into a collection.
1477
+ """Creates a job to add files from the local system into a collection.
1477
1478
 
1478
- Creates a job to add files from the Google Cloud Storage into a collection.
1479
+ Creates a job to add files from the local system into a collection.
1479
1480
 
1480
1481
  :param collection_id: String id of the collection to add the ingested documents into. (required)
1481
1482
  :type collection_id: str
1482
- :param ingest_from_gcs_body: (required)
1483
- :type ingest_from_gcs_body: IngestFromGcsBody
1483
+ :param ingest_from_file_system_body: (required)
1484
+ :type ingest_from_file_system_body: IngestFromFileSystemBody
1484
1485
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
1485
1486
  :type gen_doc_summaries: bool
1486
1487
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
@@ -1523,9 +1524,9 @@ class DocumentIngestionApi:
1523
1524
  :return: Returns the result object.
1524
1525
  """ # noqa: E501
1525
1526
 
1526
- _param = self._create_ingest_from_gcs_job_serialize(
1527
+ _param = self._create_ingest_from_file_system_job_serialize(
1527
1528
  collection_id=collection_id,
1528
- ingest_from_gcs_body=ingest_from_gcs_body,
1529
+ ingest_from_file_system_body=ingest_from_file_system_body,
1529
1530
  gen_doc_summaries=gen_doc_summaries,
1530
1531
  gen_doc_questions=gen_doc_questions,
1531
1532
  audio_input_language=audio_input_language,
@@ -1558,10 +1559,10 @@ class DocumentIngestionApi:
1558
1559
 
1559
1560
 
1560
1561
  @validate_call
1561
- async def create_ingest_from_gcs_job_with_http_info(
1562
+ async def create_ingest_from_file_system_job_with_http_info(
1562
1563
  self,
1563
1564
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
1564
- ingest_from_gcs_body: IngestFromGcsBody,
1565
+ ingest_from_file_system_body: IngestFromFileSystemBody,
1565
1566
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
1566
1567
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
1567
1568
  audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
@@ -1585,14 +1586,14 @@ class DocumentIngestionApi:
1585
1586
  _headers: Optional[Dict[StrictStr, Any]] = None,
1586
1587
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
1587
1588
  ) -> ApiResponse[JobDetails]:
1588
- """Creates a job to add files from the Google Cloud Storage into a collection.
1589
+ """Creates a job to add files from the local system into a collection.
1589
1590
 
1590
- Creates a job to add files from the Google Cloud Storage into a collection.
1591
+ Creates a job to add files from the local system into a collection.
1591
1592
 
1592
1593
  :param collection_id: String id of the collection to add the ingested documents into. (required)
1593
1594
  :type collection_id: str
1594
- :param ingest_from_gcs_body: (required)
1595
- :type ingest_from_gcs_body: IngestFromGcsBody
1595
+ :param ingest_from_file_system_body: (required)
1596
+ :type ingest_from_file_system_body: IngestFromFileSystemBody
1596
1597
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
1597
1598
  :type gen_doc_summaries: bool
1598
1599
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
@@ -1635,9 +1636,9 @@ class DocumentIngestionApi:
1635
1636
  :return: Returns the result object.
1636
1637
  """ # noqa: E501
1637
1638
 
1638
- _param = self._create_ingest_from_gcs_job_serialize(
1639
+ _param = self._create_ingest_from_file_system_job_serialize(
1639
1640
  collection_id=collection_id,
1640
- ingest_from_gcs_body=ingest_from_gcs_body,
1641
+ ingest_from_file_system_body=ingest_from_file_system_body,
1641
1642
  gen_doc_summaries=gen_doc_summaries,
1642
1643
  gen_doc_questions=gen_doc_questions,
1643
1644
  audio_input_language=audio_input_language,
@@ -1670,10 +1671,10 @@ class DocumentIngestionApi:
1670
1671
 
1671
1672
 
1672
1673
  @validate_call
1673
- async def create_ingest_from_gcs_job_without_preload_content(
1674
+ async def create_ingest_from_file_system_job_without_preload_content(
1674
1675
  self,
1675
1676
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
1676
- ingest_from_gcs_body: IngestFromGcsBody,
1677
+ ingest_from_file_system_body: IngestFromFileSystemBody,
1677
1678
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
1678
1679
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
1679
1680
  audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
@@ -1697,14 +1698,14 @@ class DocumentIngestionApi:
1697
1698
  _headers: Optional[Dict[StrictStr, Any]] = None,
1698
1699
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
1699
1700
  ) -> RESTResponseType:
1700
- """Creates a job to add files from the Google Cloud Storage into a collection.
1701
+ """Creates a job to add files from the local system into a collection.
1701
1702
 
1702
- Creates a job to add files from the Google Cloud Storage into a collection.
1703
+ Creates a job to add files from the local system into a collection.
1703
1704
 
1704
1705
  :param collection_id: String id of the collection to add the ingested documents into. (required)
1705
1706
  :type collection_id: str
1706
- :param ingest_from_gcs_body: (required)
1707
- :type ingest_from_gcs_body: IngestFromGcsBody
1707
+ :param ingest_from_file_system_body: (required)
1708
+ :type ingest_from_file_system_body: IngestFromFileSystemBody
1708
1709
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
1709
1710
  :type gen_doc_summaries: bool
1710
1711
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
@@ -1747,9 +1748,9 @@ class DocumentIngestionApi:
1747
1748
  :return: Returns the result object.
1748
1749
  """ # noqa: E501
1749
1750
 
1750
- _param = self._create_ingest_from_gcs_job_serialize(
1751
+ _param = self._create_ingest_from_file_system_job_serialize(
1751
1752
  collection_id=collection_id,
1752
- ingest_from_gcs_body=ingest_from_gcs_body,
1753
+ ingest_from_file_system_body=ingest_from_file_system_body,
1753
1754
  gen_doc_summaries=gen_doc_summaries,
1754
1755
  gen_doc_questions=gen_doc_questions,
1755
1756
  audio_input_language=audio_input_language,
@@ -1777,10 +1778,10 @@ class DocumentIngestionApi:
1777
1778
  return response_data.response
1778
1779
 
1779
1780
 
1780
- def _create_ingest_from_gcs_job_serialize(
1781
+ def _create_ingest_from_file_system_job_serialize(
1781
1782
  self,
1782
1783
  collection_id,
1783
- ingest_from_gcs_body,
1784
+ ingest_from_file_system_body,
1784
1785
  gen_doc_summaries,
1785
1786
  gen_doc_questions,
1786
1787
  audio_input_language,
@@ -1860,8 +1861,8 @@ class DocumentIngestionApi:
1860
1861
  # process the header parameters
1861
1862
  # process the form parameters
1862
1863
  # process the body parameter
1863
- if ingest_from_gcs_body is not None:
1864
- _body_params = ingest_from_gcs_body
1864
+ if ingest_from_file_system_body is not None:
1865
+ _body_params = ingest_from_file_system_body
1865
1866
 
1866
1867
 
1867
1868
  # set the HTTP header `Accept`
@@ -1893,7 +1894,7 @@ class DocumentIngestionApi:
1893
1894
 
1894
1895
  return self.api_client.param_serialize(
1895
1896
  method='POST',
1896
- resource_path='/ingest/gcs/job',
1897
+ resource_path='/ingest/file_system/job',
1897
1898
  path_params=_path_params,
1898
1899
  query_params=_query_params,
1899
1900
  header_params=_header_params,
@@ -1910,14 +1911,19 @@ class DocumentIngestionApi:
1910
1911
 
1911
1912
 
1912
1913
  @validate_call
1913
- async def create_ingest_from_plain_text_job(
1914
+ async def create_ingest_from_gcs_job(
1914
1915
  self,
1915
1916
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
1916
- file_name: Annotated[StrictStr, Field(description="String of the file name to use for the document.")],
1917
- body: Annotated[StrictStr, Field(description="The text that will ingested into a collection.")],
1917
+ ingest_from_gcs_body: IngestFromGcsBody,
1918
1918
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
1919
1919
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
1920
- metadata: Annotated[Optional[StrictStr], Field(description="String with json-encoded metadata for the document.")] = None,
1920
+ audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
1921
+ ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
1922
+ tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
1923
+ keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
1924
+ chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
1925
+ handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
1926
+ ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
1921
1927
  timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
1922
1928
  _request_timeout: Union[
1923
1929
  None,
@@ -1932,22 +1938,32 @@ class DocumentIngestionApi:
1932
1938
  _headers: Optional[Dict[StrictStr, Any]] = None,
1933
1939
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
1934
1940
  ) -> JobDetails:
1935
- """Creates a job to add plain text to a collection.
1941
+ """Creates a job to add files from the Google Cloud Storage into a collection.
1936
1942
 
1937
- Creates a job to add plain text to a collection.
1943
+ Creates a job to add files from the Google Cloud Storage into a collection.
1938
1944
 
1939
1945
  :param collection_id: String id of the collection to add the ingested documents into. (required)
1940
1946
  :type collection_id: str
1941
- :param file_name: String of the file name to use for the document. (required)
1942
- :type file_name: str
1943
- :param body: The text that will ingested into a collection. (required)
1944
- :type body: str
1947
+ :param ingest_from_gcs_body: (required)
1948
+ :type ingest_from_gcs_body: IngestFromGcsBody
1945
1949
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
1946
1950
  :type gen_doc_summaries: bool
1947
1951
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
1948
1952
  :type gen_doc_questions: bool
1949
- :param metadata: String with json-encoded metadata for the document.
1950
- :type metadata: str
1953
+ :param audio_input_language: Language of audio files.
1954
+ :type audio_input_language: str
1955
+ :param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
1956
+ :type ocr_model: str
1957
+ :param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
1958
+ :type tesseract_lang: str
1959
+ :param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
1960
+ :type keep_tables_as_one_chunk: bool
1961
+ :param chunk_by_page: Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.
1962
+ :type chunk_by_page: bool
1963
+ :param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
1964
+ :type handwriting_check: bool
1965
+ :param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
1966
+ :type ingest_mode: str
1951
1967
  :param timeout: Timeout in seconds
1952
1968
  :type timeout: float
1953
1969
  :param _request_timeout: timeout setting for this request. If one
@@ -1972,13 +1988,18 @@ class DocumentIngestionApi:
1972
1988
  :return: Returns the result object.
1973
1989
  """ # noqa: E501
1974
1990
 
1975
- _param = self._create_ingest_from_plain_text_job_serialize(
1991
+ _param = self._create_ingest_from_gcs_job_serialize(
1976
1992
  collection_id=collection_id,
1977
- file_name=file_name,
1978
- body=body,
1993
+ ingest_from_gcs_body=ingest_from_gcs_body,
1979
1994
  gen_doc_summaries=gen_doc_summaries,
1980
1995
  gen_doc_questions=gen_doc_questions,
1981
- metadata=metadata,
1996
+ audio_input_language=audio_input_language,
1997
+ ocr_model=ocr_model,
1998
+ tesseract_lang=tesseract_lang,
1999
+ keep_tables_as_one_chunk=keep_tables_as_one_chunk,
2000
+ chunk_by_page=chunk_by_page,
2001
+ handwriting_check=handwriting_check,
2002
+ ingest_mode=ingest_mode,
1982
2003
  timeout=timeout,
1983
2004
  _request_auth=_request_auth,
1984
2005
  _content_type=_content_type,
@@ -2002,14 +2023,19 @@ class DocumentIngestionApi:
2002
2023
 
2003
2024
 
2004
2025
  @validate_call
2005
- async def create_ingest_from_plain_text_job_with_http_info(
2026
+ async def create_ingest_from_gcs_job_with_http_info(
2006
2027
  self,
2007
2028
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
2008
- file_name: Annotated[StrictStr, Field(description="String of the file name to use for the document.")],
2009
- body: Annotated[StrictStr, Field(description="The text that will ingested into a collection.")],
2029
+ ingest_from_gcs_body: IngestFromGcsBody,
2010
2030
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
2011
2031
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
2012
- metadata: Annotated[Optional[StrictStr], Field(description="String with json-encoded metadata for the document.")] = None,
2032
+ audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
2033
+ ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
2034
+ tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
2035
+ keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
2036
+ chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
2037
+ handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
2038
+ ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
2013
2039
  timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
2014
2040
  _request_timeout: Union[
2015
2041
  None,
@@ -2024,22 +2050,32 @@ class DocumentIngestionApi:
2024
2050
  _headers: Optional[Dict[StrictStr, Any]] = None,
2025
2051
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
2026
2052
  ) -> ApiResponse[JobDetails]:
2027
- """Creates a job to add plain text to a collection.
2053
+ """Creates a job to add files from the Google Cloud Storage into a collection.
2028
2054
 
2029
- Creates a job to add plain text to a collection.
2055
+ Creates a job to add files from the Google Cloud Storage into a collection.
2030
2056
 
2031
2057
  :param collection_id: String id of the collection to add the ingested documents into. (required)
2032
2058
  :type collection_id: str
2033
- :param file_name: String of the file name to use for the document. (required)
2034
- :type file_name: str
2035
- :param body: The text that will ingested into a collection. (required)
2036
- :type body: str
2059
+ :param ingest_from_gcs_body: (required)
2060
+ :type ingest_from_gcs_body: IngestFromGcsBody
2037
2061
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
2038
2062
  :type gen_doc_summaries: bool
2039
2063
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
2040
2064
  :type gen_doc_questions: bool
2041
- :param metadata: String with json-encoded metadata for the document.
2042
- :type metadata: str
2065
+ :param audio_input_language: Language of audio files.
2066
+ :type audio_input_language: str
2067
+ :param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
2068
+ :type ocr_model: str
2069
+ :param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
2070
+ :type tesseract_lang: str
2071
+ :param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
2072
+ :type keep_tables_as_one_chunk: bool
2073
+ :param chunk_by_page: Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.
2074
+ :type chunk_by_page: bool
2075
+ :param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
2076
+ :type handwriting_check: bool
2077
+ :param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
2078
+ :type ingest_mode: str
2043
2079
  :param timeout: Timeout in seconds
2044
2080
  :type timeout: float
2045
2081
  :param _request_timeout: timeout setting for this request. If one
@@ -2064,13 +2100,18 @@ class DocumentIngestionApi:
2064
2100
  :return: Returns the result object.
2065
2101
  """ # noqa: E501
2066
2102
 
2067
- _param = self._create_ingest_from_plain_text_job_serialize(
2103
+ _param = self._create_ingest_from_gcs_job_serialize(
2068
2104
  collection_id=collection_id,
2069
- file_name=file_name,
2070
- body=body,
2105
+ ingest_from_gcs_body=ingest_from_gcs_body,
2071
2106
  gen_doc_summaries=gen_doc_summaries,
2072
2107
  gen_doc_questions=gen_doc_questions,
2073
- metadata=metadata,
2108
+ audio_input_language=audio_input_language,
2109
+ ocr_model=ocr_model,
2110
+ tesseract_lang=tesseract_lang,
2111
+ keep_tables_as_one_chunk=keep_tables_as_one_chunk,
2112
+ chunk_by_page=chunk_by_page,
2113
+ handwriting_check=handwriting_check,
2114
+ ingest_mode=ingest_mode,
2074
2115
  timeout=timeout,
2075
2116
  _request_auth=_request_auth,
2076
2117
  _content_type=_content_type,
@@ -2094,14 +2135,19 @@ class DocumentIngestionApi:
2094
2135
 
2095
2136
 
2096
2137
  @validate_call
2097
- async def create_ingest_from_plain_text_job_without_preload_content(
2138
+ async def create_ingest_from_gcs_job_without_preload_content(
2098
2139
  self,
2099
2140
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
2100
- file_name: Annotated[StrictStr, Field(description="String of the file name to use for the document.")],
2101
- body: Annotated[StrictStr, Field(description="The text that will ingested into a collection.")],
2141
+ ingest_from_gcs_body: IngestFromGcsBody,
2102
2142
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
2103
2143
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
2104
- metadata: Annotated[Optional[StrictStr], Field(description="String with json-encoded metadata for the document.")] = None,
2144
+ audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
2145
+ ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
2146
+ tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
2147
+ keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
2148
+ chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
2149
+ handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
2150
+ ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
2105
2151
  timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
2106
2152
  _request_timeout: Union[
2107
2153
  None,
@@ -2116,22 +2162,32 @@ class DocumentIngestionApi:
2116
2162
  _headers: Optional[Dict[StrictStr, Any]] = None,
2117
2163
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
2118
2164
  ) -> RESTResponseType:
2119
- """Creates a job to add plain text to a collection.
2165
+ """Creates a job to add files from the Google Cloud Storage into a collection.
2120
2166
 
2121
- Creates a job to add plain text to a collection.
2167
+ Creates a job to add files from the Google Cloud Storage into a collection.
2122
2168
 
2123
2169
  :param collection_id: String id of the collection to add the ingested documents into. (required)
2124
2170
  :type collection_id: str
2125
- :param file_name: String of the file name to use for the document. (required)
2126
- :type file_name: str
2127
- :param body: The text that will ingested into a collection. (required)
2128
- :type body: str
2171
+ :param ingest_from_gcs_body: (required)
2172
+ :type ingest_from_gcs_body: IngestFromGcsBody
2129
2173
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
2130
2174
  :type gen_doc_summaries: bool
2131
2175
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
2132
2176
  :type gen_doc_questions: bool
2133
- :param metadata: String with json-encoded metadata for the document.
2134
- :type metadata: str
2177
+ :param audio_input_language: Language of audio files.
2178
+ :type audio_input_language: str
2179
+ :param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
2180
+ :type ocr_model: str
2181
+ :param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
2182
+ :type tesseract_lang: str
2183
+ :param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
2184
+ :type keep_tables_as_one_chunk: bool
2185
+ :param chunk_by_page: Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.
2186
+ :type chunk_by_page: bool
2187
+ :param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
2188
+ :type handwriting_check: bool
2189
+ :param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
2190
+ :type ingest_mode: str
2135
2191
  :param timeout: Timeout in seconds
2136
2192
  :type timeout: float
2137
2193
  :param _request_timeout: timeout setting for this request. If one
@@ -2156,13 +2212,18 @@ class DocumentIngestionApi:
2156
2212
  :return: Returns the result object.
2157
2213
  """ # noqa: E501
2158
2214
 
2159
- _param = self._create_ingest_from_plain_text_job_serialize(
2215
+ _param = self._create_ingest_from_gcs_job_serialize(
2160
2216
  collection_id=collection_id,
2161
- file_name=file_name,
2162
- body=body,
2217
+ ingest_from_gcs_body=ingest_from_gcs_body,
2163
2218
  gen_doc_summaries=gen_doc_summaries,
2164
2219
  gen_doc_questions=gen_doc_questions,
2165
- metadata=metadata,
2220
+ audio_input_language=audio_input_language,
2221
+ ocr_model=ocr_model,
2222
+ tesseract_lang=tesseract_lang,
2223
+ keep_tables_as_one_chunk=keep_tables_as_one_chunk,
2224
+ chunk_by_page=chunk_by_page,
2225
+ handwriting_check=handwriting_check,
2226
+ ingest_mode=ingest_mode,
2166
2227
  timeout=timeout,
2167
2228
  _request_auth=_request_auth,
2168
2229
  _content_type=_content_type,
@@ -2181,14 +2242,418 @@ class DocumentIngestionApi:
2181
2242
  return response_data.response
2182
2243
 
2183
2244
 
2184
- def _create_ingest_from_plain_text_job_serialize(
2245
+ def _create_ingest_from_gcs_job_serialize(
2185
2246
  self,
2186
2247
  collection_id,
2187
- file_name,
2188
- body,
2248
+ ingest_from_gcs_body,
2189
2249
  gen_doc_summaries,
2190
2250
  gen_doc_questions,
2191
- metadata,
2251
+ audio_input_language,
2252
+ ocr_model,
2253
+ tesseract_lang,
2254
+ keep_tables_as_one_chunk,
2255
+ chunk_by_page,
2256
+ handwriting_check,
2257
+ ingest_mode,
2258
+ timeout,
2259
+ _request_auth,
2260
+ _content_type,
2261
+ _headers,
2262
+ _host_index,
2263
+ ) -> RequestSerialized:
2264
+
2265
+ _host = None
2266
+
2267
+ _collection_formats: Dict[str, str] = {
2268
+ }
2269
+
2270
+ _path_params: Dict[str, str] = {}
2271
+ _query_params: List[Tuple[str, str]] = []
2272
+ _header_params: Dict[str, Optional[str]] = _headers or {}
2273
+ _form_params: List[Tuple[str, str]] = []
2274
+ _files: Dict[
2275
+ str, Union[str, bytes, List[str], List[bytes], List[Tuple[str, bytes]]]
2276
+ ] = {}
2277
+ _body_params: Optional[bytes] = None
2278
+
2279
+ # process the path parameters
2280
+ # process the query parameters
2281
+ if collection_id is not None:
2282
+
2283
+ _query_params.append(('collection_id', collection_id))
2284
+
2285
+ if gen_doc_summaries is not None:
2286
+
2287
+ _query_params.append(('gen_doc_summaries', gen_doc_summaries))
2288
+
2289
+ if gen_doc_questions is not None:
2290
+
2291
+ _query_params.append(('gen_doc_questions', gen_doc_questions))
2292
+
2293
+ if audio_input_language is not None:
2294
+
2295
+ _query_params.append(('audio_input_language', audio_input_language))
2296
+
2297
+ if ocr_model is not None:
2298
+
2299
+ _query_params.append(('ocr_model', ocr_model))
2300
+
2301
+ if tesseract_lang is not None:
2302
+
2303
+ _query_params.append(('tesseract_lang', tesseract_lang))
2304
+
2305
+ if keep_tables_as_one_chunk is not None:
2306
+
2307
+ _query_params.append(('keep_tables_as_one_chunk', keep_tables_as_one_chunk))
2308
+
2309
+ if chunk_by_page is not None:
2310
+
2311
+ _query_params.append(('chunk_by_page', chunk_by_page))
2312
+
2313
+ if handwriting_check is not None:
2314
+
2315
+ _query_params.append(('handwriting_check', handwriting_check))
2316
+
2317
+ if ingest_mode is not None:
2318
+
2319
+ _query_params.append(('ingest_mode', ingest_mode))
2320
+
2321
+ if timeout is not None:
2322
+
2323
+ _query_params.append(('timeout', timeout))
2324
+
2325
+ # process the header parameters
2326
+ # process the form parameters
2327
+ # process the body parameter
2328
+ if ingest_from_gcs_body is not None:
2329
+ _body_params = ingest_from_gcs_body
2330
+
2331
+
2332
+ # set the HTTP header `Accept`
2333
+ if 'Accept' not in _header_params:
2334
+ _header_params['Accept'] = self.api_client.select_header_accept(
2335
+ [
2336
+ 'application/json'
2337
+ ]
2338
+ )
2339
+
2340
+ # set the HTTP header `Content-Type`
2341
+ if _content_type:
2342
+ _header_params['Content-Type'] = _content_type
2343
+ else:
2344
+ _default_content_type = (
2345
+ self.api_client.select_header_content_type(
2346
+ [
2347
+ 'application/json'
2348
+ ]
2349
+ )
2350
+ )
2351
+ if _default_content_type is not None:
2352
+ _header_params['Content-Type'] = _default_content_type
2353
+
2354
+ # authentication setting
2355
+ _auth_settings: List[str] = [
2356
+ 'bearerAuth'
2357
+ ]
2358
+
2359
+ return self.api_client.param_serialize(
2360
+ method='POST',
2361
+ resource_path='/ingest/gcs/job',
2362
+ path_params=_path_params,
2363
+ query_params=_query_params,
2364
+ header_params=_header_params,
2365
+ body=_body_params,
2366
+ post_params=_form_params,
2367
+ files=_files,
2368
+ auth_settings=_auth_settings,
2369
+ collection_formats=_collection_formats,
2370
+ _host=_host,
2371
+ _request_auth=_request_auth
2372
+ )
2373
+
2374
+
2375
+
2376
+
2377
+ @validate_call
2378
+ async def create_ingest_from_plain_text_job(
2379
+ self,
2380
+ collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
2381
+ file_name: Annotated[StrictStr, Field(description="String of the file name to use for the document.")],
2382
+ body: Annotated[StrictStr, Field(description="The text that will ingested into a collection.")],
2383
+ gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
2384
+ gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
2385
+ metadata: Annotated[Optional[StrictStr], Field(description="String with json-encoded metadata for the document.")] = None,
2386
+ timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
2387
+ _request_timeout: Union[
2388
+ None,
2389
+ Annotated[StrictFloat, Field(gt=0)],
2390
+ Tuple[
2391
+ Annotated[StrictFloat, Field(gt=0)],
2392
+ Annotated[StrictFloat, Field(gt=0)]
2393
+ ]
2394
+ ] = None,
2395
+ _request_auth: Optional[Dict[StrictStr, Any]] = None,
2396
+ _content_type: Optional[StrictStr] = None,
2397
+ _headers: Optional[Dict[StrictStr, Any]] = None,
2398
+ _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
2399
+ ) -> JobDetails:
2400
+ """Creates a job to add plain text to a collection.
2401
+
2402
+ Creates a job to add plain text to a collection.
2403
+
2404
+ :param collection_id: String id of the collection to add the ingested documents into. (required)
2405
+ :type collection_id: str
2406
+ :param file_name: String of the file name to use for the document. (required)
2407
+ :type file_name: str
2408
+ :param body: The text that will ingested into a collection. (required)
2409
+ :type body: str
2410
+ :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
2411
+ :type gen_doc_summaries: bool
2412
+ :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
2413
+ :type gen_doc_questions: bool
2414
+ :param metadata: String with json-encoded metadata for the document.
2415
+ :type metadata: str
2416
+ :param timeout: Timeout in seconds
2417
+ :type timeout: float
2418
+ :param _request_timeout: timeout setting for this request. If one
2419
+ number provided, it will be total request
2420
+ timeout. It can also be a pair (tuple) of
2421
+ (connection, read) timeouts.
2422
+ :type _request_timeout: int, tuple(int, int), optional
2423
+ :param _request_auth: set to override the auth_settings for an a single
2424
+ request; this effectively ignores the
2425
+ authentication in the spec for a single request.
2426
+ :type _request_auth: dict, optional
2427
+ :param _content_type: force content-type for the request.
2428
+ :type _content_type: str, Optional
2429
+ :param _headers: set to override the headers for a single
2430
+ request; this effectively ignores the headers
2431
+ in the spec for a single request.
2432
+ :type _headers: dict, optional
2433
+ :param _host_index: set to override the host_index for a single
2434
+ request; this effectively ignores the host_index
2435
+ in the spec for a single request.
2436
+ :type _host_index: int, optional
2437
+ :return: Returns the result object.
2438
+ """ # noqa: E501
2439
+
2440
+ _param = self._create_ingest_from_plain_text_job_serialize(
2441
+ collection_id=collection_id,
2442
+ file_name=file_name,
2443
+ body=body,
2444
+ gen_doc_summaries=gen_doc_summaries,
2445
+ gen_doc_questions=gen_doc_questions,
2446
+ metadata=metadata,
2447
+ timeout=timeout,
2448
+ _request_auth=_request_auth,
2449
+ _content_type=_content_type,
2450
+ _headers=_headers,
2451
+ _host_index=_host_index
2452
+ )
2453
+
2454
+ _response_types_map: Dict[str, Optional[str]] = {
2455
+ '201': "JobDetails",
2456
+ '401': "EndpointError",
2457
+ }
2458
+ response_data = await self.api_client.call_api(
2459
+ *_param,
2460
+ _request_timeout=_request_timeout
2461
+ )
2462
+ await response_data.read()
2463
+ return self.api_client.response_deserialize(
2464
+ response_data=response_data,
2465
+ response_types_map=_response_types_map,
2466
+ ).data
2467
+
2468
+
2469
+ @validate_call
2470
+ async def create_ingest_from_plain_text_job_with_http_info(
2471
+ self,
2472
+ collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
2473
+ file_name: Annotated[StrictStr, Field(description="String of the file name to use for the document.")],
2474
+ body: Annotated[StrictStr, Field(description="The text that will ingested into a collection.")],
2475
+ gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
2476
+ gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
2477
+ metadata: Annotated[Optional[StrictStr], Field(description="String with json-encoded metadata for the document.")] = None,
2478
+ timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
2479
+ _request_timeout: Union[
2480
+ None,
2481
+ Annotated[StrictFloat, Field(gt=0)],
2482
+ Tuple[
2483
+ Annotated[StrictFloat, Field(gt=0)],
2484
+ Annotated[StrictFloat, Field(gt=0)]
2485
+ ]
2486
+ ] = None,
2487
+ _request_auth: Optional[Dict[StrictStr, Any]] = None,
2488
+ _content_type: Optional[StrictStr] = None,
2489
+ _headers: Optional[Dict[StrictStr, Any]] = None,
2490
+ _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
2491
+ ) -> ApiResponse[JobDetails]:
2492
+ """Creates a job to add plain text to a collection.
2493
+
2494
+ Creates a job to add plain text to a collection.
2495
+
2496
+ :param collection_id: String id of the collection to add the ingested documents into. (required)
2497
+ :type collection_id: str
2498
+ :param file_name: String of the file name to use for the document. (required)
2499
+ :type file_name: str
2500
+ :param body: The text that will ingested into a collection. (required)
2501
+ :type body: str
2502
+ :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
2503
+ :type gen_doc_summaries: bool
2504
+ :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
2505
+ :type gen_doc_questions: bool
2506
+ :param metadata: String with json-encoded metadata for the document.
2507
+ :type metadata: str
2508
+ :param timeout: Timeout in seconds
2509
+ :type timeout: float
2510
+ :param _request_timeout: timeout setting for this request. If one
2511
+ number provided, it will be total request
2512
+ timeout. It can also be a pair (tuple) of
2513
+ (connection, read) timeouts.
2514
+ :type _request_timeout: int, tuple(int, int), optional
2515
+ :param _request_auth: set to override the auth_settings for a single
2516
+ request; this effectively ignores the
2517
+ authentication in the spec for a single request.
2518
+ :type _request_auth: dict, optional
2519
+ :param _content_type: force content-type for the request.
2520
+ :type _content_type: str, optional
2521
+ :param _headers: set to override the headers for a single
2522
+ request; this effectively ignores the headers
2523
+ in the spec for a single request.
2524
+ :type _headers: dict, optional
2525
+ :param _host_index: set to override the host_index for a single
2526
+ request; this effectively ignores the host_index
2527
+ in the spec for a single request.
2528
+ :type _host_index: int, optional
2529
+ :return: Returns the result object.
2530
+ """ # noqa: E501
2531
+
2532
+ _param = self._create_ingest_from_plain_text_job_serialize(
2533
+ collection_id=collection_id,
2534
+ file_name=file_name,
2535
+ body=body,
2536
+ gen_doc_summaries=gen_doc_summaries,
2537
+ gen_doc_questions=gen_doc_questions,
2538
+ metadata=metadata,
2539
+ timeout=timeout,
2540
+ _request_auth=_request_auth,
2541
+ _content_type=_content_type,
2542
+ _headers=_headers,
2543
+ _host_index=_host_index
2544
+ )
2545
+
2546
+ _response_types_map: Dict[str, Optional[str]] = {
2547
+ '201': "JobDetails",
2548
+ '401': "EndpointError",
2549
+ }
2550
+ response_data = await self.api_client.call_api(
2551
+ *_param,
2552
+ _request_timeout=_request_timeout
2553
+ )
2554
+ await response_data.read()
2555
+ return self.api_client.response_deserialize(
2556
+ response_data=response_data,
2557
+ response_types_map=_response_types_map,
2558
+ )
2559
+
2560
+
2561
+ @validate_call
2562
+ async def create_ingest_from_plain_text_job_without_preload_content(
2563
+ self,
2564
+ collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
2565
+ file_name: Annotated[StrictStr, Field(description="String of the file name to use for the document.")],
2566
+ body: Annotated[StrictStr, Field(description="The text that will be ingested into a collection.")],
2567
+ gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
2568
+ gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
2569
+ metadata: Annotated[Optional[StrictStr], Field(description="String with json-encoded metadata for the document.")] = None,
2570
+ timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
2571
+ _request_timeout: Union[
2572
+ None,
2573
+ Annotated[StrictFloat, Field(gt=0)],
2574
+ Tuple[
2575
+ Annotated[StrictFloat, Field(gt=0)],
2576
+ Annotated[StrictFloat, Field(gt=0)]
2577
+ ]
2578
+ ] = None,
2579
+ _request_auth: Optional[Dict[StrictStr, Any]] = None,
2580
+ _content_type: Optional[StrictStr] = None,
2581
+ _headers: Optional[Dict[StrictStr, Any]] = None,
2582
+ _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
2583
+ ) -> RESTResponseType:
2584
+ """Creates a job to add plain text to a collection.
2585
+
2586
+ Creates a job to add plain text to a collection.
2587
+
2588
+ :param collection_id: String id of the collection to add the ingested documents into. (required)
2589
+ :type collection_id: str
2590
+ :param file_name: String of the file name to use for the document. (required)
2591
+ :type file_name: str
2592
+ :param body: The text that will be ingested into a collection. (required)
2593
+ :type body: str
2594
+ :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
2595
+ :type gen_doc_summaries: bool
2596
+ :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
2597
+ :type gen_doc_questions: bool
2598
+ :param metadata: String with json-encoded metadata for the document.
2599
+ :type metadata: str
2600
+ :param timeout: Timeout in seconds
2601
+ :type timeout: float
2602
+ :param _request_timeout: timeout setting for this request. If one
2603
+ number provided, it will be total request
2604
+ timeout. It can also be a pair (tuple) of
2605
+ (connection, read) timeouts.
2606
+ :type _request_timeout: int, tuple(int, int), optional
2607
+ :param _request_auth: set to override the auth_settings for a single
2608
+ request; this effectively ignores the
2609
+ authentication in the spec for a single request.
2610
+ :type _request_auth: dict, optional
2611
+ :param _content_type: force content-type for the request.
2612
+ :type _content_type: str, optional
2613
+ :param _headers: set to override the headers for a single
2614
+ request; this effectively ignores the headers
2615
+ in the spec for a single request.
2616
+ :type _headers: dict, optional
2617
+ :param _host_index: set to override the host_index for a single
2618
+ request; this effectively ignores the host_index
2619
+ in the spec for a single request.
2620
+ :type _host_index: int, optional
2621
+ :return: Returns the result object.
2622
+ """ # noqa: E501
2623
+
2624
+ _param = self._create_ingest_from_plain_text_job_serialize(
2625
+ collection_id=collection_id,
2626
+ file_name=file_name,
2627
+ body=body,
2628
+ gen_doc_summaries=gen_doc_summaries,
2629
+ gen_doc_questions=gen_doc_questions,
2630
+ metadata=metadata,
2631
+ timeout=timeout,
2632
+ _request_auth=_request_auth,
2633
+ _content_type=_content_type,
2634
+ _headers=_headers,
2635
+ _host_index=_host_index
2636
+ )
2637
+
2638
+ _response_types_map: Dict[str, Optional[str]] = {
2639
+ '201': "JobDetails",
2640
+ '401': "EndpointError",
2641
+ }
2642
+ response_data = await self.api_client.call_api(
2643
+ *_param,
2644
+ _request_timeout=_request_timeout
2645
+ )
2646
+ return response_data.response
2647
+
2648
+
2649
+ def _create_ingest_from_plain_text_job_serialize(
2650
+ self,
2651
+ collection_id,
2652
+ file_name,
2653
+ body,
2654
+ gen_doc_summaries,
2655
+ gen_doc_questions,
2656
+ metadata,
2192
2657
  timeout,
2193
2658
  _request_auth,
2194
2659
  _content_type,
@@ -2218,7 +2683,447 @@ class DocumentIngestionApi:
2218
2683
 
2219
2684
  if file_name is not None:
2220
2685
 
2221
- _query_params.append(('file_name', file_name))
2686
+ _query_params.append(('file_name', file_name))
2687
+
2688
+ if gen_doc_summaries is not None:
2689
+
2690
+ _query_params.append(('gen_doc_summaries', gen_doc_summaries))
2691
+
2692
+ if gen_doc_questions is not None:
2693
+
2694
+ _query_params.append(('gen_doc_questions', gen_doc_questions))
2695
+
2696
+ if metadata is not None:
2697
+
2698
+ _query_params.append(('metadata', metadata))
2699
+
2700
+ if timeout is not None:
2701
+
2702
+ _query_params.append(('timeout', timeout))
2703
+
2704
+ # process the header parameters
2705
+ # process the form parameters
2706
+ # process the body parameter
2707
+ if body is not None:
2708
+ _body_params = body
2709
+
2710
+
2711
+ # set the HTTP header `Accept`
2712
+ if 'Accept' not in _header_params:
2713
+ _header_params['Accept'] = self.api_client.select_header_accept(
2714
+ [
2715
+ 'application/json'
2716
+ ]
2717
+ )
2718
+
2719
+ # set the HTTP header `Content-Type`
2720
+ if _content_type:
2721
+ _header_params['Content-Type'] = _content_type
2722
+ else:
2723
+ _default_content_type = (
2724
+ self.api_client.select_header_content_type(
2725
+ [
2726
+ 'text/plain'
2727
+ ]
2728
+ )
2729
+ )
2730
+ if _default_content_type is not None:
2731
+ _header_params['Content-Type'] = _default_content_type
2732
+
2733
+ # authentication setting
2734
+ _auth_settings: List[str] = [
2735
+ 'bearerAuth'
2736
+ ]
2737
+
2738
+ return self.api_client.param_serialize(
2739
+ method='POST',
2740
+ resource_path='/ingest/plain_text/job',
2741
+ path_params=_path_params,
2742
+ query_params=_query_params,
2743
+ header_params=_header_params,
2744
+ body=_body_params,
2745
+ post_params=_form_params,
2746
+ files=_files,
2747
+ auth_settings=_auth_settings,
2748
+ collection_formats=_collection_formats,
2749
+ _host=_host,
2750
+ _request_auth=_request_auth
2751
+ )
2752
+
2753
+
2754
+
2755
+
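Each endpoint in this generated client comes in three variants: the plain coroutine returns the deserialized model (here JobDetails), the `..._with_http_info` variant returns an ApiResponse wrapper carrying the status code, headers, and parsed data, and the `..._without_preload_content` variant hands back the raw REST response without deserialization. A small hedged sketch of the wrapper variant, reusing the `api` object and placeholder ids from the earlier sketch:

resp = await api.create_ingest_from_plain_text_job_with_http_info(
    collection_id="9d3c...",            # placeholder collection id
    file_name="notes.txt",
    body="Plain text to be ingested.",
)
print(resp.status_code)  # expected 201 per the response map above
print(resp.data)         # the deserialized JobDetails model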
2756
+ @validate_call
2757
+ async def create_ingest_from_s3_job(
2758
+ self,
2759
+ collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
2760
+ ingest_from_s3_body: IngestFromS3Body,
2761
+ gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
2762
+ gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
2763
+ audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
2764
+ ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
2765
+ tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
2766
+ keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
2767
+ chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
2768
+ handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
2769
+ ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
2770
+ timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
2771
+ _request_timeout: Union[
2772
+ None,
2773
+ Annotated[StrictFloat, Field(gt=0)],
2774
+ Tuple[
2775
+ Annotated[StrictFloat, Field(gt=0)],
2776
+ Annotated[StrictFloat, Field(gt=0)]
2777
+ ]
2778
+ ] = None,
2779
+ _request_auth: Optional[Dict[StrictStr, Any]] = None,
2780
+ _content_type: Optional[StrictStr] = None,
2781
+ _headers: Optional[Dict[StrictStr, Any]] = None,
2782
+ _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
2783
+ ) -> JobDetails:
2784
+ """Creates a job to add files from the AWS S3 storage into a collection.
2785
+
2786
+ Creates a job to add files from the AWS S3 storage into a collection.
2787
+
2788
+ :param collection_id: String id of the collection to add the ingested documents into. (required)
2789
+ :type collection_id: str
2790
+ :param ingest_from_s3_body: (required)
2791
+ :type ingest_from_s3_body: IngestFromS3Body
2792
+ :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
2793
+ :type gen_doc_summaries: bool
2794
+ :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
2795
+ :type gen_doc_questions: bool
2796
+ :param audio_input_language: Language of audio files.
2797
+ :type audio_input_language: str
2798
+ :param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
2799
+ :type ocr_model: str
2800
+ :param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
2801
+ :type tesseract_lang: str
2802
+ :param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
2803
+ :type keep_tables_as_one_chunk: bool
2804
+ :param chunk_by_page: Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.
2805
+ :type chunk_by_page: bool
2806
+ :param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
2807
+ :type handwriting_check: bool
2808
+ :param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
2809
+ :type ingest_mode: str
2810
+ :param timeout: Timeout in seconds
2811
+ :type timeout: float
2812
+ :param _request_timeout: timeout setting for this request. If one
2813
+ number provided, it will be total request
2814
+ timeout. It can also be a pair (tuple) of
2815
+ (connection, read) timeouts.
2816
+ :type _request_timeout: int, tuple(int, int), optional
2817
+ :param _request_auth: set to override the auth_settings for a single
2818
+ request; this effectively ignores the
2819
+ authentication in the spec for a single request.
2820
+ :type _request_auth: dict, optional
2821
+ :param _content_type: force content-type for the request.
2822
+ :type _content_type: str, optional
2823
+ :param _headers: set to override the headers for a single
2824
+ request; this effectively ignores the headers
2825
+ in the spec for a single request.
2826
+ :type _headers: dict, optional
2827
+ :param _host_index: set to override the host_index for a single
2828
+ request; this effectively ignores the host_index
2829
+ in the spec for a single request.
2830
+ :type _host_index: int, optional
2831
+ :return: Returns the result object.
2832
+ """ # noqa: E501
2833
+
2834
+ _param = self._create_ingest_from_s3_job_serialize(
2835
+ collection_id=collection_id,
2836
+ ingest_from_s3_body=ingest_from_s3_body,
2837
+ gen_doc_summaries=gen_doc_summaries,
2838
+ gen_doc_questions=gen_doc_questions,
2839
+ audio_input_language=audio_input_language,
2840
+ ocr_model=ocr_model,
2841
+ tesseract_lang=tesseract_lang,
2842
+ keep_tables_as_one_chunk=keep_tables_as_one_chunk,
2843
+ chunk_by_page=chunk_by_page,
2844
+ handwriting_check=handwriting_check,
2845
+ ingest_mode=ingest_mode,
2846
+ timeout=timeout,
2847
+ _request_auth=_request_auth,
2848
+ _content_type=_content_type,
2849
+ _headers=_headers,
2850
+ _host_index=_host_index
2851
+ )
2852
+
2853
+ _response_types_map: Dict[str, Optional[str]] = {
2854
+ '201': "JobDetails",
2855
+ '401': "EndpointError",
2856
+ }
2857
+ response_data = await self.api_client.call_api(
2858
+ *_param,
2859
+ _request_timeout=_request_timeout
2860
+ )
2861
+ await response_data.read()
2862
+ return self.api_client.response_deserialize(
2863
+ response_data=response_data,
2864
+ response_types_map=_response_types_map,
2865
+ ).data
2866
+
2867
+
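A hedged sketch of the new S3 ingestion call. The field name passed to IngestFromS3Body (`url`) is an assumption inferred from the model name and is not shown in this diff; check the generated model for the real schema. Reuses the `api` object from the first sketch.

from h2ogpte.rest_async.models import IngestFromS3Body

# `url` is assumed to carry the S3 object URL(s); verify against the generated model.
s3_body = IngestFromS3Body(url=["s3://my-bucket/reports/q1.pdf"])

job = await api.create_ingest_from_s3_job(
    collection_id="9d3c...",        # placeholder collection id
    ingest_from_s3_body=s3_body,
    ocr_model="auto",               # let the server pick an OCR model per page
    gen_doc_summaries=False,
)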
2868
+ @validate_call
2869
+ async def create_ingest_from_s3_job_with_http_info(
2870
+ self,
2871
+ collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
2872
+ ingest_from_s3_body: IngestFromS3Body,
2873
+ gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
2874
+ gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
2875
+ audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
2876
+ ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
2877
+ tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
2878
+ keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
2879
+ chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
2880
+ handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
2881
+ ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
2882
+ timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
2883
+ _request_timeout: Union[
2884
+ None,
2885
+ Annotated[StrictFloat, Field(gt=0)],
2886
+ Tuple[
2887
+ Annotated[StrictFloat, Field(gt=0)],
2888
+ Annotated[StrictFloat, Field(gt=0)]
2889
+ ]
2890
+ ] = None,
2891
+ _request_auth: Optional[Dict[StrictStr, Any]] = None,
2892
+ _content_type: Optional[StrictStr] = None,
2893
+ _headers: Optional[Dict[StrictStr, Any]] = None,
2894
+ _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
2895
+ ) -> ApiResponse[JobDetails]:
2896
+ """Creates a job to add files from the AWS S3 storage into a collection.
2897
+
2898
+ Creates a job to add files from the AWS S3 storage into a collection.
2899
+
2900
+ :param collection_id: String id of the collection to add the ingested documents into. (required)
2901
+ :type collection_id: str
2902
+ :param ingest_from_s3_body: (required)
2903
+ :type ingest_from_s3_body: IngestFromS3Body
2904
+ :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
2905
+ :type gen_doc_summaries: bool
2906
+ :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
2907
+ :type gen_doc_questions: bool
2908
+ :param audio_input_language: Language of audio files.
2909
+ :type audio_input_language: str
2910
+ :param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
2911
+ :type ocr_model: str
2912
+ :param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
2913
+ :type tesseract_lang: str
2914
+ :param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
2915
+ :type keep_tables_as_one_chunk: bool
2916
+ :param chunk_by_page: Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.
2917
+ :type chunk_by_page: bool
2918
+ :param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
2919
+ :type handwriting_check: bool
2920
+ :param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
2921
+ :type ingest_mode: str
2922
+ :param timeout: Timeout in seconds
2923
+ :type timeout: float
2924
+ :param _request_timeout: timeout setting for this request. If one
2925
+ number provided, it will be total request
2926
+ timeout. It can also be a pair (tuple) of
2927
+ (connection, read) timeouts.
2928
+ :type _request_timeout: int, tuple(int, int), optional
2929
+ :param _request_auth: set to override the auth_settings for a single
2930
+ request; this effectively ignores the
2931
+ authentication in the spec for a single request.
2932
+ :type _request_auth: dict, optional
2933
+ :param _content_type: force content-type for the request.
2934
+ :type _content_type: str, optional
2935
+ :param _headers: set to override the headers for a single
2936
+ request; this effectively ignores the headers
2937
+ in the spec for a single request.
2938
+ :type _headers: dict, optional
2939
+ :param _host_index: set to override the host_index for a single
2940
+ request; this effectively ignores the host_index
2941
+ in the spec for a single request.
2942
+ :type _host_index: int, optional
2943
+ :return: Returns the result object.
2944
+ """ # noqa: E501
2945
+
2946
+ _param = self._create_ingest_from_s3_job_serialize(
2947
+ collection_id=collection_id,
2948
+ ingest_from_s3_body=ingest_from_s3_body,
2949
+ gen_doc_summaries=gen_doc_summaries,
2950
+ gen_doc_questions=gen_doc_questions,
2951
+ audio_input_language=audio_input_language,
2952
+ ocr_model=ocr_model,
2953
+ tesseract_lang=tesseract_lang,
2954
+ keep_tables_as_one_chunk=keep_tables_as_one_chunk,
2955
+ chunk_by_page=chunk_by_page,
2956
+ handwriting_check=handwriting_check,
2957
+ ingest_mode=ingest_mode,
2958
+ timeout=timeout,
2959
+ _request_auth=_request_auth,
2960
+ _content_type=_content_type,
2961
+ _headers=_headers,
2962
+ _host_index=_host_index
2963
+ )
2964
+
2965
+ _response_types_map: Dict[str, Optional[str]] = {
2966
+ '201': "JobDetails",
2967
+ '401': "EndpointError",
2968
+ }
2969
+ response_data = await self.api_client.call_api(
2970
+ *_param,
2971
+ _request_timeout=_request_timeout
2972
+ )
2973
+ await response_data.read()
2974
+ return self.api_client.response_deserialize(
2975
+ response_data=response_data,
2976
+ response_types_map=_response_types_map,
2977
+ )
2978
+
2979
+
2980
+ @validate_call
2981
+ async def create_ingest_from_s3_job_without_preload_content(
2982
+ self,
2983
+ collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
2984
+ ingest_from_s3_body: IngestFromS3Body,
2985
+ gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
2986
+ gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
2987
+ audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
2988
+ ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
2989
+ tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
2990
+ keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
2991
+ chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
2992
+ handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
2993
+ ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
2994
+ timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
2995
+ _request_timeout: Union[
2996
+ None,
2997
+ Annotated[StrictFloat, Field(gt=0)],
2998
+ Tuple[
2999
+ Annotated[StrictFloat, Field(gt=0)],
3000
+ Annotated[StrictFloat, Field(gt=0)]
3001
+ ]
3002
+ ] = None,
3003
+ _request_auth: Optional[Dict[StrictStr, Any]] = None,
3004
+ _content_type: Optional[StrictStr] = None,
3005
+ _headers: Optional[Dict[StrictStr, Any]] = None,
3006
+ _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
3007
+ ) -> RESTResponseType:
3008
+ """Creates a job to add files from the AWS S3 storage into a collection.
3009
+
3010
+ Creates a job to add files from the AWS S3 storage into a collection.
3011
+
3012
+ :param collection_id: String id of the collection to add the ingested documents into. (required)
3013
+ :type collection_id: str
3014
+ :param ingest_from_s3_body: (required)
3015
+ :type ingest_from_s3_body: IngestFromS3Body
3016
+ :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
3017
+ :type gen_doc_summaries: bool
3018
+ :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
3019
+ :type gen_doc_questions: bool
3020
+ :param audio_input_language: Language of audio files.
3021
+ :type audio_input_language: str
3022
+ :param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
3023
+ :type ocr_model: str
3024
+ :param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
3025
+ :type tesseract_lang: str
3026
+ :param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
3027
+ :type keep_tables_as_one_chunk: bool
3028
+ :param chunk_by_page: Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.
3029
+ :type chunk_by_page: bool
3030
+ :param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
3031
+ :type handwriting_check: bool
3032
+ :param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
3033
+ :type ingest_mode: str
3034
+ :param timeout: Timeout in seconds
3035
+ :type timeout: float
3036
+ :param _request_timeout: timeout setting for this request. If one
3037
+ number provided, it will be total request
3038
+ timeout. It can also be a pair (tuple) of
3039
+ (connection, read) timeouts.
3040
+ :type _request_timeout: int, tuple(int, int), optional
3041
+ :param _request_auth: set to override the auth_settings for a single
3042
+ request; this effectively ignores the
3043
+ authentication in the spec for a single request.
3044
+ :type _request_auth: dict, optional
3045
+ :param _content_type: force content-type for the request.
3046
+ :type _content_type: str, optional
3047
+ :param _headers: set to override the headers for a single
3048
+ request; this effectively ignores the headers
3049
+ in the spec for a single request.
3050
+ :type _headers: dict, optional
3051
+ :param _host_index: set to override the host_index for a single
3052
+ request; this effectively ignores the host_index
3053
+ in the spec for a single request.
3054
+ :type _host_index: int, optional
3055
+ :return: Returns the result object.
3056
+ """ # noqa: E501
3057
+
3058
+ _param = self._create_ingest_from_s3_job_serialize(
3059
+ collection_id=collection_id,
3060
+ ingest_from_s3_body=ingest_from_s3_body,
3061
+ gen_doc_summaries=gen_doc_summaries,
3062
+ gen_doc_questions=gen_doc_questions,
3063
+ audio_input_language=audio_input_language,
3064
+ ocr_model=ocr_model,
3065
+ tesseract_lang=tesseract_lang,
3066
+ keep_tables_as_one_chunk=keep_tables_as_one_chunk,
3067
+ chunk_by_page=chunk_by_page,
3068
+ handwriting_check=handwriting_check,
3069
+ ingest_mode=ingest_mode,
3070
+ timeout=timeout,
3071
+ _request_auth=_request_auth,
3072
+ _content_type=_content_type,
3073
+ _headers=_headers,
3074
+ _host_index=_host_index
3075
+ )
3076
+
3077
+ _response_types_map: Dict[str, Optional[str]] = {
3078
+ '201': "JobDetails",
3079
+ '401': "EndpointError",
3080
+ }
3081
+ response_data = await self.api_client.call_api(
3082
+ *_param,
3083
+ _request_timeout=_request_timeout
3084
+ )
3085
+ return response_data.response
3086
+
3087
+
3088
+ def _create_ingest_from_s3_job_serialize(
3089
+ self,
3090
+ collection_id,
3091
+ ingest_from_s3_body,
3092
+ gen_doc_summaries,
3093
+ gen_doc_questions,
3094
+ audio_input_language,
3095
+ ocr_model,
3096
+ tesseract_lang,
3097
+ keep_tables_as_one_chunk,
3098
+ chunk_by_page,
3099
+ handwriting_check,
3100
+ ingest_mode,
3101
+ timeout,
3102
+ _request_auth,
3103
+ _content_type,
3104
+ _headers,
3105
+ _host_index,
3106
+ ) -> RequestSerialized:
3107
+
3108
+ _host = None
3109
+
3110
+ _collection_formats: Dict[str, str] = {
3111
+ }
3112
+
3113
+ _path_params: Dict[str, str] = {}
3114
+ _query_params: List[Tuple[str, str]] = []
3115
+ _header_params: Dict[str, Optional[str]] = _headers or {}
3116
+ _form_params: List[Tuple[str, str]] = []
3117
+ _files: Dict[
3118
+ str, Union[str, bytes, List[str], List[bytes], List[Tuple[str, bytes]]]
3119
+ ] = {}
3120
+ _body_params: Optional[bytes] = None
3121
+
3122
+ # process the path parameters
3123
+ # process the query parameters
3124
+ if collection_id is not None:
3125
+
3126
+ _query_params.append(('collection_id', collection_id))
2222
3127
 
2223
3128
  if gen_doc_summaries is not None:
2224
3129
 
@@ -2228,9 +3133,33 @@ class DocumentIngestionApi:
2228
3133
 
2229
3134
  _query_params.append(('gen_doc_questions', gen_doc_questions))
2230
3135
 
2231
- if metadata is not None:
3136
+ if audio_input_language is not None:
2232
3137
 
2233
- _query_params.append(('metadata', metadata))
3138
+ _query_params.append(('audio_input_language', audio_input_language))
3139
+
3140
+ if ocr_model is not None:
3141
+
3142
+ _query_params.append(('ocr_model', ocr_model))
3143
+
3144
+ if tesseract_lang is not None:
3145
+
3146
+ _query_params.append(('tesseract_lang', tesseract_lang))
3147
+
3148
+ if keep_tables_as_one_chunk is not None:
3149
+
3150
+ _query_params.append(('keep_tables_as_one_chunk', keep_tables_as_one_chunk))
3151
+
3152
+ if chunk_by_page is not None:
3153
+
3154
+ _query_params.append(('chunk_by_page', chunk_by_page))
3155
+
3156
+ if handwriting_check is not None:
3157
+
3158
+ _query_params.append(('handwriting_check', handwriting_check))
3159
+
3160
+ if ingest_mode is not None:
3161
+
3162
+ _query_params.append(('ingest_mode', ingest_mode))
2234
3163
 
2235
3164
  if timeout is not None:
2236
3165
 
@@ -2239,8 +3168,8 @@ class DocumentIngestionApi:
2239
3168
  # process the header parameters
2240
3169
  # process the form parameters
2241
3170
  # process the body parameter
2242
- if body is not None:
2243
- _body_params = body
3171
+ if ingest_from_s3_body is not None:
3172
+ _body_params = ingest_from_s3_body
2244
3173
 
2245
3174
 
2246
3175
  # set the HTTP header `Accept`
@@ -2258,7 +3187,7 @@ class DocumentIngestionApi:
2258
3187
  _default_content_type = (
2259
3188
  self.api_client.select_header_content_type(
2260
3189
  [
2261
- 'text/plain'
3190
+ 'application/json'
2262
3191
  ]
2263
3192
  )
2264
3193
  )
@@ -2272,7 +3201,7 @@ class DocumentIngestionApi:
2272
3201
 
2273
3202
  return self.api_client.param_serialize(
2274
3203
  method='POST',
2275
- resource_path='/ingest/plain_text/job',
3204
+ resource_path='/ingest/s3/job',
2276
3205
  path_params=_path_params,
2277
3206
  query_params=_query_params,
2278
3207
  header_params=_header_params,
@@ -2289,10 +3218,13 @@ class DocumentIngestionApi:
2289
3218
 
2290
3219
 
2291
3220
  @validate_call
2292
- async def create_ingest_from_s3_job(
3221
+ async def create_ingest_from_website_job(
2293
3222
  self,
2294
3223
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
2295
- ingest_from_s3_body: IngestFromS3Body,
3224
+ ingest_from_website_body: IngestFromWebsiteBody,
3225
+ follow_links: Annotated[Optional[StrictBool], Field(description="Whether to import all web pages linked from this URL. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.")] = None,
3226
+ max_depth: Annotated[Optional[StrictInt], Field(description="Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).")] = None,
3227
+ max_documents: Annotated[Optional[StrictInt], Field(description="Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).")] = None,
2296
3228
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
2297
3229
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
2298
3230
  audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
@@ -2316,14 +3248,20 @@ class DocumentIngestionApi:
2316
3248
  _headers: Optional[Dict[StrictStr, Any]] = None,
2317
3249
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
2318
3250
  ) -> JobDetails:
2319
- """Creates a job to add files from the AWS S3 storage into a collection.
3251
+ """Creates a job to crawl and ingest a URL into a collection.
2320
3252
 
2321
- Creates a job to add files from the AWS S3 storage into a collection.
3253
+ Creates a job to crawl and ingest a URL into a collection. The web page or document linked from this URL will be imported.
2322
3254
 
2323
3255
  :param collection_id: String id of the collection to add the ingested documents into. (required)
2324
3256
  :type collection_id: str
2325
- :param ingest_from_s3_body: (required)
2326
- :type ingest_from_s3_body: IngestFromS3Body
3257
+ :param ingest_from_website_body: (required)
3258
+ :type ingest_from_website_body: IngestFromWebsiteBody
3259
+ :param follow_links: Whether to import all web pages linked from this URL. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.
3260
+ :type follow_links: bool
3261
+ :param max_depth: Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).
3262
+ :type max_depth: int
3263
+ :param max_documents: Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).
3264
+ :type max_documents: int
2327
3265
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
2328
3266
  :type gen_doc_summaries: bool
2329
3267
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
@@ -2366,9 +3304,12 @@ class DocumentIngestionApi:
2366
3304
  :return: Returns the result object.
2367
3305
  """ # noqa: E501
2368
3306
 
2369
- _param = self._create_ingest_from_s3_job_serialize(
3307
+ _param = self._create_ingest_from_website_job_serialize(
2370
3308
  collection_id=collection_id,
2371
- ingest_from_s3_body=ingest_from_s3_body,
3309
+ ingest_from_website_body=ingest_from_website_body,
3310
+ follow_links=follow_links,
3311
+ max_depth=max_depth,
3312
+ max_documents=max_documents,
2372
3313
  gen_doc_summaries=gen_doc_summaries,
2373
3314
  gen_doc_questions=gen_doc_questions,
2374
3315
  audio_input_language=audio_input_language,
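A hedged sketch of the renamed website-crawl endpoint shown in the hunks above. The IngestFromWebsiteBody field name (`url`) is an assumption based on the model name; the collection id and URL are placeholders. Reuses the `api` object from the first sketch.

from h2ogpte.rest_async.models import IngestFromWebsiteBody

# `url` is assumed to be the page to crawl; verify against the generated model.
site_body = IngestFromWebsiteBody(url="https://docs.example.com/guide")

job = await api.create_ingest_from_website_job(
    collection_id="9d3c...",             # placeholder collection id
    ingest_from_website_body=site_body,
    follow_links=True,                   # crawl same-domain links at or below this URL
    max_depth=1,                         # follow only top-level links
    max_documents=25,                    # stop after 25 documents
)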
@@ -2401,10 +3342,13 @@ class DocumentIngestionApi:
2401
3342
 
2402
3343
 
2403
3344
  @validate_call
2404
- async def create_ingest_from_s3_job_with_http_info(
3345
+ async def create_ingest_from_website_job_with_http_info(
2405
3346
  self,
2406
3347
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
2407
- ingest_from_s3_body: IngestFromS3Body,
3348
+ ingest_from_website_body: IngestFromWebsiteBody,
3349
+ follow_links: Annotated[Optional[StrictBool], Field(description="Whether to import all web pages linked from this URL. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.")] = None,
3350
+ max_depth: Annotated[Optional[StrictInt], Field(description="Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).")] = None,
3351
+ max_documents: Annotated[Optional[StrictInt], Field(description="Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).")] = None,
2408
3352
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
2409
3353
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
2410
3354
  audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
@@ -2428,14 +3372,20 @@ class DocumentIngestionApi:
2428
3372
  _headers: Optional[Dict[StrictStr, Any]] = None,
2429
3373
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
2430
3374
  ) -> ApiResponse[JobDetails]:
2431
- """Creates a job to add files from the AWS S3 storage into a collection.
3375
+ """Creates a job to crawl and ingest a URL into a collection.
2432
3376
 
2433
- Creates a job to add files from the AWS S3 storage into a collection.
3377
+ Creates a job to crawl and ingest a URL into a collection. The web page or document linked from this URL will be imported.
2434
3378
 
2435
3379
  :param collection_id: String id of the collection to add the ingested documents into. (required)
2436
3380
  :type collection_id: str
2437
- :param ingest_from_s3_body: (required)
2438
- :type ingest_from_s3_body: IngestFromS3Body
3381
+ :param ingest_from_website_body: (required)
3382
+ :type ingest_from_website_body: IngestFromWebsiteBody
3383
+ :param follow_links: Whether to import all web pages linked from this URL. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.
3384
+ :type follow_links: bool
3385
+ :param max_depth: Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).
3386
+ :type max_depth: int
3387
+ :param max_documents: Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).
3388
+ :type max_documents: int
2439
3389
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
2440
3390
  :type gen_doc_summaries: bool
2441
3391
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
@@ -2478,9 +3428,12 @@ class DocumentIngestionApi:
2478
3428
  :return: Returns the result object.
2479
3429
  """ # noqa: E501
2480
3430
 
2481
- _param = self._create_ingest_from_s3_job_serialize(
3431
+ _param = self._create_ingest_from_website_job_serialize(
2482
3432
  collection_id=collection_id,
2483
- ingest_from_s3_body=ingest_from_s3_body,
3433
+ ingest_from_website_body=ingest_from_website_body,
3434
+ follow_links=follow_links,
3435
+ max_depth=max_depth,
3436
+ max_documents=max_documents,
2484
3437
  gen_doc_summaries=gen_doc_summaries,
2485
3438
  gen_doc_questions=gen_doc_questions,
2486
3439
  audio_input_language=audio_input_language,
@@ -2513,10 +3466,13 @@ class DocumentIngestionApi:
2513
3466
 
2514
3467
 
2515
3468
  @validate_call
2516
- async def create_ingest_from_s3_job_without_preload_content(
3469
+ async def create_ingest_from_website_job_without_preload_content(
2517
3470
  self,
2518
3471
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
2519
- ingest_from_s3_body: IngestFromS3Body,
3472
+ ingest_from_website_body: IngestFromWebsiteBody,
3473
+ follow_links: Annotated[Optional[StrictBool], Field(description="Whether to import all web pages linked from this URL. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.")] = None,
3474
+ max_depth: Annotated[Optional[StrictInt], Field(description="Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).")] = None,
3475
+ max_documents: Annotated[Optional[StrictInt], Field(description="Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).")] = None,
2520
3476
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
2521
3477
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
2522
3478
  audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
@@ -2540,14 +3496,20 @@ class DocumentIngestionApi:
2540
3496
  _headers: Optional[Dict[StrictStr, Any]] = None,
2541
3497
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
2542
3498
  ) -> RESTResponseType:
2543
- """Creates a job to add files from the AWS S3 storage into a collection.
3499
+ """Creates a job to crawl and ingest a URL into a collection.
2544
3500
 
2545
- Creates a job to add files from the AWS S3 storage into a collection.
3501
+ Creates a job to crawl and ingest a URL into a collection. The web page or document linked from this URL will be imported.
2546
3502
 
2547
3503
  :param collection_id: String id of the collection to add the ingested documents into. (required)
2548
3504
  :type collection_id: str
2549
- :param ingest_from_s3_body: (required)
2550
- :type ingest_from_s3_body: IngestFromS3Body
3505
+ :param ingest_from_website_body: (required)
3506
+ :type ingest_from_website_body: IngestFromWebsiteBody
3507
+ :param follow_links: Whether to import all web pages linked from this URL. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.
3508
+ :type follow_links: bool
3509
+ :param max_depth: Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).
3510
+ :type max_depth: int
3511
+ :param max_documents: Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).
3512
+ :type max_documents: int
2551
3513
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
2552
3514
  :type gen_doc_summaries: bool
2553
3515
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
@@ -2590,9 +3552,12 @@ class DocumentIngestionApi:
2590
3552
  :return: Returns the result object.
2591
3553
  """ # noqa: E501
2592
3554
 
2593
- _param = self._create_ingest_from_s3_job_serialize(
3555
+ _param = self._create_ingest_from_website_job_serialize(
2594
3556
  collection_id=collection_id,
2595
- ingest_from_s3_body=ingest_from_s3_body,
3557
+ ingest_from_website_body=ingest_from_website_body,
3558
+ follow_links=follow_links,
3559
+ max_depth=max_depth,
3560
+ max_documents=max_documents,
2596
3561
  gen_doc_summaries=gen_doc_summaries,
2597
3562
  gen_doc_questions=gen_doc_questions,
2598
3563
  audio_input_language=audio_input_language,
@@ -2620,10 +3585,13 @@ class DocumentIngestionApi:
2620
3585
  return response_data.response
2621
3586
 
2622
3587
 
2623
- def _create_ingest_from_s3_job_serialize(
3588
+ def _create_ingest_from_website_job_serialize(
2624
3589
  self,
2625
3590
  collection_id,
2626
- ingest_from_s3_body,
3591
+ ingest_from_website_body,
3592
+ follow_links,
3593
+ max_depth,
3594
+ max_documents,
2627
3595
  gen_doc_summaries,
2628
3596
  gen_doc_questions,
2629
3597
  audio_input_language,
@@ -2658,7 +3626,19 @@ class DocumentIngestionApi:
2658
3626
  # process the query parameters
2659
3627
  if collection_id is not None:
2660
3628
 
2661
- _query_params.append(('collection_id', collection_id))
3629
+ _query_params.append(('collection_id', collection_id))
3630
+
3631
+ if follow_links is not None:
3632
+
3633
+ _query_params.append(('follow_links', follow_links))
3634
+
3635
+ if max_depth is not None:
3636
+
3637
+ _query_params.append(('max_depth', max_depth))
3638
+
3639
+ if max_documents is not None:
3640
+
3641
+ _query_params.append(('max_documents', max_documents))
2662
3642
 
2663
3643
  if gen_doc_summaries is not None:
2664
3644
 
@@ -2703,8 +3683,8 @@ class DocumentIngestionApi:
2703
3683
  # process the header parameters
2704
3684
  # process the form parameters
2705
3685
  # process the body parameter
2706
- if ingest_from_s3_body is not None:
2707
- _body_params = ingest_from_s3_body
3686
+ if ingest_from_website_body is not None:
3687
+ _body_params = ingest_from_website_body
2708
3688
 
2709
3689
 
2710
3690
  # set the HTTP header `Accept`
@@ -2736,7 +3716,7 @@ class DocumentIngestionApi:
2736
3716
 
2737
3717
  return self.api_client.param_serialize(
2738
3718
  method='POST',
2739
- resource_path='/ingest/s3/job',
3719
+ resource_path='/ingest/website/job',
2740
3720
  path_params=_path_params,
2741
3721
  query_params=_query_params,
2742
3722
  header_params=_header_params,
@@ -2753,13 +3733,10 @@ class DocumentIngestionApi:
2753
3733
 
2754
3734
 
2755
3735
  @validate_call
2756
- async def create_ingest_from_website_job(
3736
+ async def create_ingest_upload_job(
2757
3737
  self,
3738
+ upload_ids: Annotated[List[StrictStr], Field(description="Ids of the uploaded documents")],
2758
3739
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
2759
- ingest_from_website_body: IngestFromWebsiteBody,
2760
- follow_links: Annotated[Optional[StrictBool], Field(description="Whether to import all web pages linked from this URL will be imported. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.")] = None,
2761
- max_depth: Annotated[Optional[StrictInt], Field(description="Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).")] = None,
2762
- max_documents: Annotated[Optional[StrictInt], Field(description="Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).")] = None,
2763
3740
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
2764
3741
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
2765
3742
  audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
@@ -2769,7 +3746,10 @@ class DocumentIngestionApi:
2769
3746
  chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
2770
3747
  handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
2771
3748
  ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
3749
+ restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
3750
+ permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames with permission to access the document.")] = None,
2772
3751
  timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
3752
+ ingest_upload_body: Optional[IngestUploadBody] = None,
2773
3753
  _request_timeout: Union[
2774
3754
  None,
2775
3755
  Annotated[StrictFloat, Field(gt=0)],
@@ -2783,20 +3763,14 @@ class DocumentIngestionApi:
2783
3763
  _headers: Optional[Dict[StrictStr, Any]] = None,
2784
3764
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
2785
3765
  ) -> JobDetails:
2786
- """Creates a job to crawl and ingest a URL into a collection.
3766
+ """Creates a job to ingest uploaded document
2787
3767
 
2788
- Creates a job to crawl and ingest a URL into a collection. The web page or document linked from this URL will be imported.
3768
+ Creates a job to ingest previously uploaded documents, identified by their upload ids, into a given collection
2789
3769
 
3770
+ :param upload_ids: Ids of the uploaded documents (required)
3771
+ :type upload_ids: List[str]
2790
3772
  :param collection_id: String id of the collection to add the ingested documents into. (required)
2791
3773
  :type collection_id: str
2792
- :param ingest_from_website_body: (required)
2793
- :type ingest_from_website_body: IngestFromWebsiteBody
2794
- :param follow_links: Whether to import all web pages linked from this URL will be imported. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.
2795
- :type follow_links: bool
2796
- :param max_depth: Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).
2797
- :type max_depth: int
2798
- :param max_documents: Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).
2799
- :type max_documents: int
2800
3774
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
2801
3775
  :type gen_doc_summaries: bool
2802
3776
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
@@ -2815,8 +3789,14 @@ class DocumentIngestionApi:
2815
3789
  :type handwriting_check: bool
2816
3790
  :param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
2817
3791
  :type ingest_mode: str
3792
+ :param restricted: Whether the document should be restricted only to certain users.
3793
+ :type restricted: bool
3794
+ :param permissions: The list of usernames with permission to access the document.
3795
+ :type permissions: List[str]
2818
3796
  :param timeout: Timeout in seconds
2819
3797
  :type timeout: float
3798
+ :param ingest_upload_body:
3799
+ :type ingest_upload_body: IngestUploadBody
2820
3800
  :param _request_timeout: timeout setting for this request. If one
2821
3801
  number provided, it will be total request
2822
3802
  timeout. It can also be a pair (tuple) of
@@ -2839,12 +3819,9 @@ class DocumentIngestionApi:
2839
3819
  :return: Returns the result object.
2840
3820
  """ # noqa: E501
2841
3821
 
2842
- _param = self._create_ingest_from_website_job_serialize(
3822
+ _param = self._create_ingest_upload_job_serialize(
3823
+ upload_ids=upload_ids,
2843
3824
  collection_id=collection_id,
2844
- ingest_from_website_body=ingest_from_website_body,
2845
- follow_links=follow_links,
2846
- max_depth=max_depth,
2847
- max_documents=max_documents,
2848
3825
  gen_doc_summaries=gen_doc_summaries,
2849
3826
  gen_doc_questions=gen_doc_questions,
2850
3827
  audio_input_language=audio_input_language,
@@ -2854,7 +3831,10 @@ class DocumentIngestionApi:
2854
3831
  chunk_by_page=chunk_by_page,
2855
3832
  handwriting_check=handwriting_check,
2856
3833
  ingest_mode=ingest_mode,
3834
+ restricted=restricted,
3835
+ permissions=permissions,
2857
3836
  timeout=timeout,
3837
+ ingest_upload_body=ingest_upload_body,
2858
3838
  _request_auth=_request_auth,
2859
3839
  _content_type=_content_type,
2860
3840
  _headers=_headers,
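A sketch of the reworked upload-ingestion call described above, assuming the upload ids come from a prior upload endpoint that is not part of this hunk; the ids and usernames are placeholders. Reuses the `api` object from the first sketch.

job = await api.create_ingest_upload_job(
    upload_ids=["upl_123", "upl_456"],  # ids returned by the earlier upload step (placeholders)
    collection_id="9d3c...",            # placeholder collection id
    restricted=True,                    # limit document visibility
    permissions=["alice", "bob"],       # usernames allowed to access the documents
    gen_doc_summaries=False,
)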
@@ -2877,13 +3857,10 @@ class DocumentIngestionApi:
2877
3857
 
2878
3858
 
2879
3859
  @validate_call
2880
- async def create_ingest_from_website_job_with_http_info(
3860
+ async def create_ingest_upload_job_with_http_info(
2881
3861
  self,
3862
+ upload_ids: Annotated[List[StrictStr], Field(description="Ids of the uploaded documents")],
2882
3863
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
2883
- ingest_from_website_body: IngestFromWebsiteBody,
2884
- follow_links: Annotated[Optional[StrictBool], Field(description="Whether to import all web pages linked from this URL will be imported. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.")] = None,
2885
- max_depth: Annotated[Optional[StrictInt], Field(description="Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).")] = None,
2886
- max_documents: Annotated[Optional[StrictInt], Field(description="Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).")] = None,
2887
3864
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
2888
3865
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
2889
3866
  audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
@@ -2893,7 +3870,10 @@ class DocumentIngestionApi:
2893
3870
  chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
2894
3871
  handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
2895
3872
  ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
3873
+ restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
3874
+ permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
2896
3875
  timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
3876
+ ingest_upload_body: Optional[IngestUploadBody] = None,
2897
3877
  _request_timeout: Union[
2898
3878
  None,
2899
3879
  Annotated[StrictFloat, Field(gt=0)],
@@ -2907,20 +3887,14 @@ class DocumentIngestionApi:
2907
3887
  _headers: Optional[Dict[StrictStr, Any]] = None,
2908
3888
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
2909
3889
  ) -> ApiResponse[JobDetails]:
2910
- """Creates a job to crawl and ingest a URL into a collection.
3890
+ """Creates a job to ingest uploaded document
2911
3891
 
2912
- Creates a job to crawl and ingest a URL into a collection. The web page or document linked from this URL will be imported.
3892
+ Creates a job to ingest uploaded document identified to a given collection
2913
3893
 
3894
+ :param upload_ids: Id of uploaded document (required)
3895
+ :type upload_ids: List[str]
2914
3896
  :param collection_id: String id of the collection to add the ingested documents into. (required)
2915
3897
  :type collection_id: str
2916
- :param ingest_from_website_body: (required)
2917
- :type ingest_from_website_body: IngestFromWebsiteBody
2918
- :param follow_links: Whether to import all web pages linked from this URL will be imported. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.
2919
- :type follow_links: bool
2920
- :param max_depth: Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).
2921
- :type max_depth: int
2922
- :param max_documents: Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).
2923
- :type max_documents: int
2924
3898
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
2925
3899
  :type gen_doc_summaries: bool
2926
3900
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
@@ -2939,8 +3913,14 @@ class DocumentIngestionApi:
2939
3913
  :type handwriting_check: bool
2940
3914
  :param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
2941
3915
  :type ingest_mode: str
3916
+ :param restricted: Whether the document should be restricted only to certain users.
3917
+ :type restricted: bool
3918
+ :param permissions: The list of usernames having permissions to the document.
3919
+ :type permissions: List[str]
2942
3920
  :param timeout: Timeout in seconds
2943
3921
  :type timeout: float
3922
+ :param ingest_upload_body:
3923
+ :type ingest_upload_body: IngestUploadBody
2944
3924
  :param _request_timeout: timeout setting for this request. If one
2945
3925
  number provided, it will be total request
2946
3926
  timeout. It can also be a pair (tuple) of
@@ -2963,12 +3943,9 @@ class DocumentIngestionApi:
2963
3943
  :return: Returns the result object.
2964
3944
  """ # noqa: E501
2965
3945
 
2966
- _param = self._create_ingest_from_website_job_serialize(
3946
+ _param = self._create_ingest_upload_job_serialize(
3947
+ upload_ids=upload_ids,
2967
3948
  collection_id=collection_id,
2968
- ingest_from_website_body=ingest_from_website_body,
2969
- follow_links=follow_links,
2970
- max_depth=max_depth,
2971
- max_documents=max_documents,
2972
3949
  gen_doc_summaries=gen_doc_summaries,
2973
3950
  gen_doc_questions=gen_doc_questions,
2974
3951
  audio_input_language=audio_input_language,
@@ -2978,7 +3955,10 @@ class DocumentIngestionApi:
2978
3955
  chunk_by_page=chunk_by_page,
2979
3956
  handwriting_check=handwriting_check,
2980
3957
  ingest_mode=ingest_mode,
3958
+ restricted=restricted,
3959
+ permissions=permissions,
2981
3960
  timeout=timeout,
3961
+ ingest_upload_body=ingest_upload_body,
2982
3962
  _request_auth=_request_auth,
2983
3963
  _content_type=_content_type,
2984
3964
  _headers=_headers,
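As with the other endpoints in this generated client, the renamed upload-ingest operation is exposed in three variants. A minimal sketch of how they differ, assuming an already-constructed DocumentIngestionApi instance (client setup is not part of this hunk); ids are placeholders, not values from this diff:

from h2ogpte.rest_async.api.document_ingestion_api import DocumentIngestionApi

async def show_variants(api: DocumentIngestionApi) -> None:
    # 1) Plain variant: returns the deserialized JobDetails object.
    job = await api.create_ingest_upload_job(
        upload_ids=["<upload-id>"], collection_id="<collection-id>"
    )
    print(job)
    # 2) *_with_http_info: wraps the same result in an ApiResponse
    #    (status code, headers, deserialized data).
    resp = await api.create_ingest_upload_job_with_http_info(
        upload_ids=["<upload-id>"], collection_id="<collection-id>"
    )
    print(resp.status_code, resp.data)
    # 3) *_without_preload_content: returns the raw REST response; the body is
    #    left undeserialized for streaming or manual handling.
    raw = await api.create_ingest_upload_job_without_preload_content(
        upload_ids=["<upload-id>"], collection_id="<collection-id>"
    )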
@@ -3001,13 +3981,10 @@ class DocumentIngestionApi:
3001
3981
 
3002
3982
 
3003
3983
  @validate_call
3004
- async def create_ingest_from_website_job_without_preload_content(
3984
+ async def create_ingest_upload_job_without_preload_content(
3005
3985
  self,
3986
+ upload_ids: Annotated[List[StrictStr], Field(description="Id of uploaded document")],
3006
3987
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
3007
- ingest_from_website_body: IngestFromWebsiteBody,
3008
- follow_links: Annotated[Optional[StrictBool], Field(description="Whether to import all web pages linked from this URL will be imported. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.")] = None,
3009
- max_depth: Annotated[Optional[StrictInt], Field(description="Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).")] = None,
3010
- max_documents: Annotated[Optional[StrictInt], Field(description="Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).")] = None,
3011
3988
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
3012
3989
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
3013
3990
  audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
@@ -3017,7 +3994,10 @@ class DocumentIngestionApi:
3017
3994
  chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
3018
3995
  handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
3019
3996
  ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
3997
+ restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
3998
+ permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
3020
3999
  timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
4000
+ ingest_upload_body: Optional[IngestUploadBody] = None,
3021
4001
  _request_timeout: Union[
3022
4002
  None,
3023
4003
  Annotated[StrictFloat, Field(gt=0)],
@@ -3031,20 +4011,14 @@ class DocumentIngestionApi:
3031
4011
  _headers: Optional[Dict[StrictStr, Any]] = None,
3032
4012
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
3033
4013
  ) -> RESTResponseType:
3034
- """Creates a job to crawl and ingest a URL into a collection.
4014
+ """Creates a job to ingest uploaded document
3035
4015
 
3036
- Creates a job to crawl and ingest a URL into a collection. The web page or document linked from this URL will be imported.
4016
+ Creates a job to ingest uploaded document identified to a given collection
3037
4017
 
4018
+ :param upload_ids: Id of uploaded document (required)
4019
+ :type upload_ids: List[str]
3038
4020
  :param collection_id: String id of the collection to add the ingested documents into. (required)
3039
4021
  :type collection_id: str
3040
- :param ingest_from_website_body: (required)
3041
- :type ingest_from_website_body: IngestFromWebsiteBody
3042
- :param follow_links: Whether to import all web pages linked from this URL will be imported. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.
3043
- :type follow_links: bool
3044
- :param max_depth: Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).
3045
- :type max_depth: int
3046
- :param max_documents: Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).
3047
- :type max_documents: int
3048
4022
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
3049
4023
  :type gen_doc_summaries: bool
3050
4024
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
@@ -3063,8 +4037,14 @@ class DocumentIngestionApi:
3063
4037
  :type handwriting_check: bool
3064
4038
  :param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
3065
4039
  :type ingest_mode: str
4040
+ :param restricted: Whether the document should be restricted only to certain users.
4041
+ :type restricted: bool
4042
+ :param permissions: The list of usernames having permissions to the document.
4043
+ :type permissions: List[str]
3066
4044
  :param timeout: Timeout in seconds
3067
4045
  :type timeout: float
4046
+ :param ingest_upload_body:
4047
+ :type ingest_upload_body: IngestUploadBody
3068
4048
  :param _request_timeout: timeout setting for this request. If one
3069
4049
  number provided, it will be total request
3070
4050
  timeout. It can also be a pair (tuple) of
@@ -3087,12 +4067,9 @@ class DocumentIngestionApi:
3087
4067
  :return: Returns the result object.
3088
4068
  """ # noqa: E501
3089
4069
 
3090
- _param = self._create_ingest_from_website_job_serialize(
4070
+ _param = self._create_ingest_upload_job_serialize(
4071
+ upload_ids=upload_ids,
3091
4072
  collection_id=collection_id,
3092
- ingest_from_website_body=ingest_from_website_body,
3093
- follow_links=follow_links,
3094
- max_depth=max_depth,
3095
- max_documents=max_documents,
3096
4073
  gen_doc_summaries=gen_doc_summaries,
3097
4074
  gen_doc_questions=gen_doc_questions,
3098
4075
  audio_input_language=audio_input_language,
@@ -3102,7 +4079,10 @@ class DocumentIngestionApi:
3102
4079
  chunk_by_page=chunk_by_page,
3103
4080
  handwriting_check=handwriting_check,
3104
4081
  ingest_mode=ingest_mode,
4082
+ restricted=restricted,
4083
+ permissions=permissions,
3105
4084
  timeout=timeout,
4085
+ ingest_upload_body=ingest_upload_body,
3106
4086
  _request_auth=_request_auth,
3107
4087
  _content_type=_content_type,
3108
4088
  _headers=_headers,
@@ -3120,13 +4100,10 @@ class DocumentIngestionApi:
3120
4100
  return response_data.response
3121
4101
 
3122
4102
 
3123
- def _create_ingest_from_website_job_serialize(
4103
+ def _create_ingest_upload_job_serialize(
3124
4104
  self,
4105
+ upload_ids,
3125
4106
  collection_id,
3126
- ingest_from_website_body,
3127
- follow_links,
3128
- max_depth,
3129
- max_documents,
3130
4107
  gen_doc_summaries,
3131
4108
  gen_doc_questions,
3132
4109
  audio_input_language,
@@ -3136,7 +4113,10 @@ class DocumentIngestionApi:
3136
4113
  chunk_by_page,
3137
4114
  handwriting_check,
3138
4115
  ingest_mode,
4116
+ restricted,
4117
+ permissions,
3139
4118
  timeout,
4119
+ ingest_upload_body,
3140
4120
  _request_auth,
3141
4121
  _content_type,
3142
4122
  _headers,
@@ -3146,6 +4126,8 @@ class DocumentIngestionApi:
3146
4126
  _host = None
3147
4127
 
3148
4128
  _collection_formats: Dict[str, str] = {
4129
+ 'upload_ids': 'csv',
4130
+ 'permissions': 'multi',
3149
4131
  }
3150
4132
 
3151
4133
  _path_params: Dict[str, str] = {}
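For context, the two collection formats registered above control how the generated serializer flattens list parameters: 'csv' joins the values with commas (here, inside the /uploads/{upload_ids}/ingest/job path segment shown later in this serializer), while 'multi' repeats the query key once per value. A rough, self-contained illustration with made-up values:

# Illustrative only: expected flattening of the two list parameters.
upload_ids = ["u1", "u2"]        # 'csv' collection format (path parameter)
permissions = ["alice", "bob"]   # 'multi' collection format (query parameter)

path = "/uploads/{ids}/ingest/job".format(ids=",".join(upload_ids))
query = "&".join(f"permissions={p}" for p in permissions)
print(f"{path}?{query}")
# -> /uploads/u1,u2/ingest/job?permissions=alice&permissions=bob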
@@ -3158,23 +4140,13 @@ class DocumentIngestionApi:
3158
4140
  _body_params: Optional[bytes] = None
3159
4141
 
3160
4142
  # process the path parameters
4143
+ if upload_ids is not None:
4144
+ _path_params['upload_ids'] = upload_ids
3161
4145
  # process the query parameters
3162
4146
  if collection_id is not None:
3163
4147
 
3164
4148
  _query_params.append(('collection_id', collection_id))
3165
4149
 
3166
- if follow_links is not None:
3167
-
3168
- _query_params.append(('follow_links', follow_links))
3169
-
3170
- if max_depth is not None:
3171
-
3172
- _query_params.append(('max_depth', max_depth))
3173
-
3174
- if max_documents is not None:
3175
-
3176
- _query_params.append(('max_documents', max_documents))
3177
-
3178
4150
  if gen_doc_summaries is not None:
3179
4151
 
3180
4152
  _query_params.append(('gen_doc_summaries', gen_doc_summaries))
@@ -3211,6 +4183,14 @@ class DocumentIngestionApi:
3211
4183
 
3212
4184
  _query_params.append(('ingest_mode', ingest_mode))
3213
4185
 
4186
+ if restricted is not None:
4187
+
4188
+ _query_params.append(('restricted', restricted))
4189
+
4190
+ if permissions is not None:
4191
+
4192
+ _query_params.append(('permissions', permissions))
4193
+
3214
4194
  if timeout is not None:
3215
4195
 
3216
4196
  _query_params.append(('timeout', timeout))
@@ -3218,8 +4198,8 @@ class DocumentIngestionApi:
3218
4198
  # process the header parameters
3219
4199
  # process the form parameters
3220
4200
  # process the body parameter
3221
- if ingest_from_website_body is not None:
3222
- _body_params = ingest_from_website_body
4201
+ if ingest_upload_body is not None:
4202
+ _body_params = ingest_upload_body
3223
4203
 
3224
4204
 
3225
4205
  # set the HTTP header `Accept`
@@ -3251,7 +4231,7 @@ class DocumentIngestionApi:
3251
4231
 
3252
4232
  return self.api_client.param_serialize(
3253
4233
  method='POST',
3254
- resource_path='/ingest/website/job',
4234
+ resource_path='/uploads/{upload_ids}/ingest/job',
3255
4235
  path_params=_path_params,
3256
4236
  query_params=_query_params,
3257
4237
  header_params=_header_params,
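Putting the pieces above together, a minimal end-to-end sketch of the upload-ingest call, including the restricted/permissions parameters added in this release. It assumes the usual generated-client setup (Configuration and ApiClient re-exported from h2ogpte.rest_async); the host, credentials wiring, and ids are placeholders, not values from this diff:

import asyncio

from h2ogpte.rest_async import ApiClient, Configuration
from h2ogpte.rest_async.api.document_ingestion_api import DocumentIngestionApi

async def main() -> None:
    config = Configuration(host="https://h2ogpte.example.com/api/v1")  # placeholder host
    async with ApiClient(config) as client:
        api = DocumentIngestionApi(client)
        job = await api.create_ingest_upload_job(
            upload_ids=["<upload-id>"],          # placeholder upload id
            collection_id="<collection-id>",     # placeholder collection id
            ingest_mode="standard",
            restricted=True,                     # new in this release
            permissions=["alice@example.com"],   # new in this release; hypothetical username
        )
        print(job)  # JobDetails describing the queued ingest job

asyncio.run(main())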
@@ -3268,23 +4248,21 @@ class DocumentIngestionApi:
3268
4248
 
3269
4249
 
3270
4250
  @validate_call
3271
- async def create_ingest_upload_job(
4251
+ async def ingest_agent_only_to_standard(
3272
4252
  self,
3273
- upload_ids: Annotated[List[StrictStr], Field(description="Id of uploaded document")],
3274
4253
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
4254
+ document_id: Annotated[StrictStr, Field(description="String id of the document to be parsed.")],
3275
4255
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
3276
4256
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
3277
4257
  audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
3278
4258
  ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
4259
+ restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
4260
+ permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
3279
4261
  tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
3280
4262
  keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
3281
4263
  chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
3282
4264
  handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
3283
- ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
3284
- restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
3285
- permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
3286
4265
  timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
3287
- ingest_upload_body: Optional[IngestUploadBody] = None,
3288
4266
  _request_timeout: Union[
3289
4267
  None,
3290
4268
  Annotated[StrictFloat, Field(gt=0)],
@@ -3297,15 +4275,15 @@ class DocumentIngestionApi:
3297
4275
  _content_type: Optional[StrictStr] = None,
3298
4276
  _headers: Optional[Dict[StrictStr, Any]] = None,
3299
4277
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
3300
- ) -> JobDetails:
3301
- """Creates a job to ingest uploaded document
4278
+ ) -> None:
4279
+ """Converts files uploaded in \"agent_only\" ingest mode to PDF and parses them.
3302
4280
 
3303
- Creates a job to ingest uploaded document identified to a given collection
4281
+ Converts files uploaded in \"agent_only\" ingest mode to PDF and parses them.
3304
4282
 
3305
- :param upload_ids: Id of uploaded document (required)
3306
- :type upload_ids: List[str]
3307
4283
  :param collection_id: String id of the collection to add the ingested documents into. (required)
3308
4284
  :type collection_id: str
4285
+ :param document_id: String id of the document to be parsed. (required)
4286
+ :type document_id: str
3309
4287
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
3310
4288
  :type gen_doc_summaries: bool
3311
4289
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
@@ -3314,6 +4292,10 @@ class DocumentIngestionApi:
3314
4292
  :type audio_input_language: str
3315
4293
  :param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
3316
4294
  :type ocr_model: str
4295
+ :param restricted: Whether the document should be restricted only to certain users.
4296
+ :type restricted: bool
4297
+ :param permissions: The list of usernames having permissions to the document.
4298
+ :type permissions: List[str]
3317
4299
  :param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
3318
4300
  :type tesseract_lang: str
3319
4301
  :param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
@@ -3322,16 +4304,8 @@ class DocumentIngestionApi:
3322
4304
  :type chunk_by_page: bool
3323
4305
  :param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
3324
4306
  :type handwriting_check: bool
3325
- :param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
3326
- :type ingest_mode: str
3327
- :param restricted: Whether the document should be restricted only to certain users.
3328
- :type restricted: bool
3329
- :param permissions: The list of usernames having permissions to the document.
3330
- :type permissions: List[str]
3331
4307
  :param timeout: Timeout in seconds
3332
4308
  :type timeout: float
3333
- :param ingest_upload_body:
3334
- :type ingest_upload_body: IngestUploadBody
3335
4309
  :param _request_timeout: timeout setting for this request. If one
3336
4310
  number provided, it will be total request
3337
4311
  timeout. It can also be a pair (tuple) of
@@ -3354,22 +4328,20 @@ class DocumentIngestionApi:
3354
4328
  :return: Returns the result object.
3355
4329
  """ # noqa: E501
3356
4330
 
3357
- _param = self._create_ingest_upload_job_serialize(
3358
- upload_ids=upload_ids,
4331
+ _param = self._ingest_agent_only_to_standard_serialize(
3359
4332
  collection_id=collection_id,
4333
+ document_id=document_id,
3360
4334
  gen_doc_summaries=gen_doc_summaries,
3361
4335
  gen_doc_questions=gen_doc_questions,
3362
4336
  audio_input_language=audio_input_language,
3363
4337
  ocr_model=ocr_model,
4338
+ restricted=restricted,
4339
+ permissions=permissions,
3364
4340
  tesseract_lang=tesseract_lang,
3365
4341
  keep_tables_as_one_chunk=keep_tables_as_one_chunk,
3366
4342
  chunk_by_page=chunk_by_page,
3367
4343
  handwriting_check=handwriting_check,
3368
- ingest_mode=ingest_mode,
3369
- restricted=restricted,
3370
- permissions=permissions,
3371
4344
  timeout=timeout,
3372
- ingest_upload_body=ingest_upload_body,
3373
4345
  _request_auth=_request_auth,
3374
4346
  _content_type=_content_type,
3375
4347
  _headers=_headers,
@@ -3377,7 +4349,7 @@ class DocumentIngestionApi:
3377
4349
  )
3378
4350
 
3379
4351
  _response_types_map: Dict[str, Optional[str]] = {
3380
- '201': "JobDetails",
4352
+ '204': None,
3381
4353
  '401': "EndpointError",
3382
4354
  }
3383
4355
  response_data = await self.api_client.call_api(
@@ -3392,23 +4364,21 @@ class DocumentIngestionApi:
3392
4364
 
3393
4365
 
3394
4366
  @validate_call
3395
- async def create_ingest_upload_job_with_http_info(
4367
+ async def ingest_agent_only_to_standard_with_http_info(
3396
4368
  self,
3397
- upload_ids: Annotated[List[StrictStr], Field(description="Id of uploaded document")],
3398
4369
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
4370
+ document_id: Annotated[StrictStr, Field(description="String id of the document to be parsed.")],
3399
4371
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
3400
4372
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
3401
4373
  audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
3402
4374
  ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
4375
+ restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
4376
+ permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
3403
4377
  tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
3404
4378
  keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
3405
4379
  chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
3406
4380
  handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
3407
- ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
3408
- restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
3409
- permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
3410
4381
  timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
3411
- ingest_upload_body: Optional[IngestUploadBody] = None,
3412
4382
  _request_timeout: Union[
3413
4383
  None,
3414
4384
  Annotated[StrictFloat, Field(gt=0)],
@@ -3421,15 +4391,15 @@ class DocumentIngestionApi:
3421
4391
  _content_type: Optional[StrictStr] = None,
3422
4392
  _headers: Optional[Dict[StrictStr, Any]] = None,
3423
4393
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
3424
- ) -> ApiResponse[JobDetails]:
3425
- """Creates a job to ingest uploaded document
4394
+ ) -> ApiResponse[None]:
4395
+ """Converts files uploaded in \"agent_only\" ingest mode to PDF and parses them.
3426
4396
 
3427
- Creates a job to ingest uploaded document identified to a given collection
4397
+ Converts files uploaded in \"agent_only\" ingest mode to PDF and parses them.
3428
4398
 
3429
- :param upload_ids: Id of uploaded document (required)
3430
- :type upload_ids: List[str]
3431
4399
  :param collection_id: String id of the collection to add the ingested documents into. (required)
3432
4400
  :type collection_id: str
4401
+ :param document_id: String id of the document to be parsed. (required)
4402
+ :type document_id: str
3433
4403
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
3434
4404
  :type gen_doc_summaries: bool
3435
4405
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
@@ -3438,6 +4408,10 @@ class DocumentIngestionApi:
3438
4408
  :type audio_input_language: str
3439
4409
  :param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
3440
4410
  :type ocr_model: str
4411
+ :param restricted: Whether the document should be restricted only to certain users.
4412
+ :type restricted: bool
4413
+ :param permissions: The list of usernames having permissions to the document.
4414
+ :type permissions: List[str]
3441
4415
  :param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
3442
4416
  :type tesseract_lang: str
3443
4417
  :param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
@@ -3446,16 +4420,8 @@ class DocumentIngestionApi:
3446
4420
  :type chunk_by_page: bool
3447
4421
  :param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
3448
4422
  :type handwriting_check: bool
3449
- :param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
3450
- :type ingest_mode: str
3451
- :param restricted: Whether the document should be restricted only to certain users.
3452
- :type restricted: bool
3453
- :param permissions: The list of usernames having permissions to the document.
3454
- :type permissions: List[str]
3455
4423
  :param timeout: Timeout in seconds
3456
4424
  :type timeout: float
3457
- :param ingest_upload_body:
3458
- :type ingest_upload_body: IngestUploadBody
3459
4425
  :param _request_timeout: timeout setting for this request. If one
3460
4426
  number provided, it will be total request
3461
4427
  timeout. It can also be a pair (tuple) of
@@ -3478,22 +4444,20 @@ class DocumentIngestionApi:
3478
4444
  :return: Returns the result object.
3479
4445
  """ # noqa: E501
3480
4446
 
3481
- _param = self._create_ingest_upload_job_serialize(
3482
- upload_ids=upload_ids,
4447
+ _param = self._ingest_agent_only_to_standard_serialize(
3483
4448
  collection_id=collection_id,
4449
+ document_id=document_id,
3484
4450
  gen_doc_summaries=gen_doc_summaries,
3485
4451
  gen_doc_questions=gen_doc_questions,
3486
4452
  audio_input_language=audio_input_language,
3487
4453
  ocr_model=ocr_model,
4454
+ restricted=restricted,
4455
+ permissions=permissions,
3488
4456
  tesseract_lang=tesseract_lang,
3489
4457
  keep_tables_as_one_chunk=keep_tables_as_one_chunk,
3490
4458
  chunk_by_page=chunk_by_page,
3491
4459
  handwriting_check=handwriting_check,
3492
- ingest_mode=ingest_mode,
3493
- restricted=restricted,
3494
- permissions=permissions,
3495
4460
  timeout=timeout,
3496
- ingest_upload_body=ingest_upload_body,
3497
4461
  _request_auth=_request_auth,
3498
4462
  _content_type=_content_type,
3499
4463
  _headers=_headers,
@@ -3501,7 +4465,7 @@ class DocumentIngestionApi:
3501
4465
  )
3502
4466
 
3503
4467
  _response_types_map: Dict[str, Optional[str]] = {
3504
- '201': "JobDetails",
4468
+ '204': None,
3505
4469
  '401': "EndpointError",
3506
4470
  }
3507
4471
  response_data = await self.api_client.call_api(
@@ -3516,23 +4480,21 @@ class DocumentIngestionApi:
3516
4480
 
3517
4481
 
3518
4482
  @validate_call
3519
- async def create_ingest_upload_job_without_preload_content(
4483
+ async def ingest_agent_only_to_standard_without_preload_content(
3520
4484
  self,
3521
- upload_ids: Annotated[List[StrictStr], Field(description="Id of uploaded document")],
3522
4485
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
4486
+ document_id: Annotated[StrictStr, Field(description="String id of the document to be parsed.")],
3523
4487
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
3524
4488
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
3525
4489
  audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
3526
4490
  ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
4491
+ restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
4492
+ permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
3527
4493
  tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
3528
4494
  keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
3529
4495
  chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
3530
4496
  handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
3531
- ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
3532
- restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
3533
- permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
3534
4497
  timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
3535
- ingest_upload_body: Optional[IngestUploadBody] = None,
3536
4498
  _request_timeout: Union[
3537
4499
  None,
3538
4500
  Annotated[StrictFloat, Field(gt=0)],
@@ -3546,14 +4508,14 @@ class DocumentIngestionApi:
3546
4508
  _headers: Optional[Dict[StrictStr, Any]] = None,
3547
4509
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
3548
4510
  ) -> RESTResponseType:
3549
- """Creates a job to ingest uploaded document
4511
+ """Converts files uploaded in \"agent_only\" ingest mode to PDF and parses them.
3550
4512
 
3551
- Creates a job to ingest uploaded document identified to a given collection
4513
+ Converts files uploaded in \"agent_only\" ingest mode to PDF and parses them.
3552
4514
 
3553
- :param upload_ids: Id of uploaded document (required)
3554
- :type upload_ids: List[str]
3555
4515
  :param collection_id: String id of the collection to add the ingested documents into. (required)
3556
4516
  :type collection_id: str
4517
+ :param document_id: String id of the document to be parsed. (required)
4518
+ :type document_id: str
3557
4519
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
3558
4520
  :type gen_doc_summaries: bool
3559
4521
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
@@ -3562,6 +4524,10 @@ class DocumentIngestionApi:
3562
4524
  :type audio_input_language: str
3563
4525
  :param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
3564
4526
  :type ocr_model: str
4527
+ :param restricted: Whether the document should be restricted only to certain users.
4528
+ :type restricted: bool
4529
+ :param permissions: The list of usernames having permissions to the document.
4530
+ :type permissions: List[str]
3565
4531
  :param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
3566
4532
  :type tesseract_lang: str
3567
4533
  :param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
@@ -3570,16 +4536,8 @@ class DocumentIngestionApi:
3570
4536
  :type chunk_by_page: bool
3571
4537
  :param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
3572
4538
  :type handwriting_check: bool
3573
- :param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
3574
- :type ingest_mode: str
3575
- :param restricted: Whether the document should be restricted only to certain users.
3576
- :type restricted: bool
3577
- :param permissions: The list of usernames having permissions to the document.
3578
- :type permissions: List[str]
3579
4539
  :param timeout: Timeout in seconds
3580
4540
  :type timeout: float
3581
- :param ingest_upload_body:
3582
- :type ingest_upload_body: IngestUploadBody
3583
4541
  :param _request_timeout: timeout setting for this request. If one
3584
4542
  number provided, it will be total request
3585
4543
  timeout. It can also be a pair (tuple) of
@@ -3602,22 +4560,20 @@ class DocumentIngestionApi:
3602
4560
  :return: Returns the result object.
3603
4561
  """ # noqa: E501
3604
4562
 
3605
- _param = self._create_ingest_upload_job_serialize(
3606
- upload_ids=upload_ids,
4563
+ _param = self._ingest_agent_only_to_standard_serialize(
3607
4564
  collection_id=collection_id,
4565
+ document_id=document_id,
3608
4566
  gen_doc_summaries=gen_doc_summaries,
3609
4567
  gen_doc_questions=gen_doc_questions,
3610
4568
  audio_input_language=audio_input_language,
3611
4569
  ocr_model=ocr_model,
4570
+ restricted=restricted,
4571
+ permissions=permissions,
3612
4572
  tesseract_lang=tesseract_lang,
3613
4573
  keep_tables_as_one_chunk=keep_tables_as_one_chunk,
3614
4574
  chunk_by_page=chunk_by_page,
3615
4575
  handwriting_check=handwriting_check,
3616
- ingest_mode=ingest_mode,
3617
- restricted=restricted,
3618
- permissions=permissions,
3619
4576
  timeout=timeout,
3620
- ingest_upload_body=ingest_upload_body,
3621
4577
  _request_auth=_request_auth,
3622
4578
  _content_type=_content_type,
3623
4579
  _headers=_headers,
@@ -3625,7 +4581,7 @@ class DocumentIngestionApi:
3625
4581
  )
3626
4582
 
3627
4583
  _response_types_map: Dict[str, Optional[str]] = {
3628
- '201': "JobDetails",
4584
+ '204': None,
3629
4585
  '401': "EndpointError",
3630
4586
  }
3631
4587
  response_data = await self.api_client.call_api(
@@ -3635,23 +4591,21 @@ class DocumentIngestionApi:
3635
4591
  return response_data.response
3636
4592
 
3637
4593
 
3638
- def _create_ingest_upload_job_serialize(
4594
+ def _ingest_agent_only_to_standard_serialize(
3639
4595
  self,
3640
- upload_ids,
3641
4596
  collection_id,
4597
+ document_id,
3642
4598
  gen_doc_summaries,
3643
4599
  gen_doc_questions,
3644
4600
  audio_input_language,
3645
4601
  ocr_model,
4602
+ restricted,
4603
+ permissions,
3646
4604
  tesseract_lang,
3647
4605
  keep_tables_as_one_chunk,
3648
4606
  chunk_by_page,
3649
4607
  handwriting_check,
3650
- ingest_mode,
3651
- restricted,
3652
- permissions,
3653
4608
  timeout,
3654
- ingest_upload_body,
3655
4609
  _request_auth,
3656
4610
  _content_type,
3657
4611
  _headers,
@@ -3661,7 +4615,6 @@ class DocumentIngestionApi:
3661
4615
  _host = None
3662
4616
 
3663
4617
  _collection_formats: Dict[str, str] = {
3664
- 'upload_ids': 'csv',
3665
4618
  'permissions': 'multi',
3666
4619
  }
3667
4620
 
@@ -3675,13 +4628,15 @@ class DocumentIngestionApi:
3675
4628
  _body_params: Optional[bytes] = None
3676
4629
 
3677
4630
  # process the path parameters
3678
- if upload_ids is not None:
3679
- _path_params['upload_ids'] = upload_ids
3680
4631
  # process the query parameters
3681
4632
  if collection_id is not None:
3682
4633
 
3683
4634
  _query_params.append(('collection_id', collection_id))
3684
4635
 
4636
+ if document_id is not None:
4637
+
4638
+ _query_params.append(('document_id', document_id))
4639
+
3685
4640
  if gen_doc_summaries is not None:
3686
4641
 
3687
4642
  _query_params.append(('gen_doc_summaries', gen_doc_summaries))
@@ -3698,6 +4653,14 @@ class DocumentIngestionApi:
3698
4653
 
3699
4654
  _query_params.append(('ocr_model', ocr_model))
3700
4655
 
4656
+ if restricted is not None:
4657
+
4658
+ _query_params.append(('restricted', restricted))
4659
+
4660
+ if permissions is not None:
4661
+
4662
+ _query_params.append(('permissions', permissions))
4663
+
3701
4664
  if tesseract_lang is not None:
3702
4665
 
3703
4666
  _query_params.append(('tesseract_lang', tesseract_lang))
@@ -3714,18 +4677,6 @@ class DocumentIngestionApi:
3714
4677
 
3715
4678
  _query_params.append(('handwriting_check', handwriting_check))
3716
4679
 
3717
- if ingest_mode is not None:
3718
-
3719
- _query_params.append(('ingest_mode', ingest_mode))
3720
-
3721
- if restricted is not None:
3722
-
3723
- _query_params.append(('restricted', restricted))
3724
-
3725
- if permissions is not None:
3726
-
3727
- _query_params.append(('permissions', permissions))
3728
-
3729
4680
  if timeout is not None:
3730
4681
 
3731
4682
  _query_params.append(('timeout', timeout))
@@ -3733,8 +4684,6 @@ class DocumentIngestionApi:
3733
4684
  # process the header parameters
3734
4685
  # process the form parameters
3735
4686
  # process the body parameter
3736
- if ingest_upload_body is not None:
3737
- _body_params = ingest_upload_body
3738
4687
 
3739
4688
 
3740
4689
  # set the HTTP header `Accept`
@@ -3745,19 +4694,6 @@ class DocumentIngestionApi:
3745
4694
  ]
3746
4695
  )
3747
4696
 
3748
- # set the HTTP header `Content-Type`
3749
- if _content_type:
3750
- _header_params['Content-Type'] = _content_type
3751
- else:
3752
- _default_content_type = (
3753
- self.api_client.select_header_content_type(
3754
- [
3755
- 'application/json'
3756
- ]
3757
- )
3758
- )
3759
- if _default_content_type is not None:
3760
- _header_params['Content-Type'] = _default_content_type
3761
4697
 
3762
4698
  # authentication setting
3763
4699
  _auth_settings: List[str] = [
@@ -3766,7 +4702,7 @@ class DocumentIngestionApi:
3766
4702
 
3767
4703
  return self.api_client.param_serialize(
3768
4704
  method='POST',
3769
- resource_path='/uploads/{upload_ids}/ingest/job',
4705
+ resource_path='/ingest/agent_only_to_standard',
3770
4706
  path_params=_path_params,
3771
4707
  query_params=_query_params,
3772
4708
  header_params=_header_params,
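A short sketch of the new ingest_agent_only_to_standard call that this serializer backs. It assumes an already-constructed DocumentIngestionApi instance; the ids and username are placeholders:

from h2ogpte.rest_async.api.document_ingestion_api import DocumentIngestionApi

async def reparse_agent_only_document(api: DocumentIngestionApi) -> None:
    # Re-parse a document originally added with ingest_mode="agent_only" so it can
    # also be used for standard RAG retrieval.
    await api.ingest_agent_only_to_standard(
        collection_id="<collection-id>",     # placeholder
        document_id="<document-id>",         # placeholder
        restricted=True,
        permissions=["alice@example.com"],   # hypothetical username
    )
    # Per the response map above, success is 204 No Content, so nothing is returned.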
@@ -3783,20 +4719,19 @@ class DocumentIngestionApi:
3783
4719
 
3784
4720
 
3785
4721
  @validate_call
3786
- async def ingest_agent_only_to_standard(
4722
+ async def ingest_from_azure_blob_storage(
3787
4723
  self,
3788
4724
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
3789
- document_id: Annotated[StrictStr, Field(description="String id of the document to be parsed.")],
4725
+ ingest_from_azure_blob_storage_body: IngestFromAzureBlobStorageBody,
3790
4726
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
3791
4727
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
3792
4728
  audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
3793
4729
  ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
3794
- restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
3795
- permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
3796
4730
  tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
3797
4731
  keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
3798
4732
  chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
3799
4733
  handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
4734
+ ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
3800
4735
  timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
3801
4736
  _request_timeout: Union[
3802
4737
  None,
@@ -3811,14 +4746,14 @@ class DocumentIngestionApi:
3811
4746
  _headers: Optional[Dict[StrictStr, Any]] = None,
3812
4747
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
3813
4748
  ) -> None:
3814
- """Converts files uploaded in \"agent_only\" ingest mode to PDF and parses them.
4749
+ """Adds files from the Azure Blob Storage into a collection.
3815
4750
 
3816
- Converts files uploaded in \"agent_only\" ingest mode to PDF and parses them.
4751
+ Adds files from the Azure Blob Storage into a collection.
3817
4752
 
3818
4753
  :param collection_id: String id of the collection to add the ingested documents into. (required)
3819
4754
  :type collection_id: str
3820
- :param document_id: String id of the document to be parsed. (required)
3821
- :type document_id: str
4755
+ :param ingest_from_azure_blob_storage_body: (required)
4756
+ :type ingest_from_azure_blob_storage_body: IngestFromAzureBlobStorageBody
3822
4757
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
3823
4758
  :type gen_doc_summaries: bool
3824
4759
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
@@ -3827,10 +4762,6 @@ class DocumentIngestionApi:
3827
4762
  :type audio_input_language: str
3828
4763
  :param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
3829
4764
  :type ocr_model: str
3830
- :param restricted: Whether the document should be restricted only to certain users.
3831
- :type restricted: bool
3832
- :param permissions: The list of usernames having permissions to the document.
3833
- :type permissions: List[str]
3834
4765
  :param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
3835
4766
  :type tesseract_lang: str
3836
4767
  :param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
@@ -3839,6 +4770,8 @@ class DocumentIngestionApi:
3839
4770
  :type chunk_by_page: bool
3840
4771
  :param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
3841
4772
  :type handwriting_check: bool
4773
+ :param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
4774
+ :type ingest_mode: str
3842
4775
  :param timeout: Timeout in seconds
3843
4776
  :type timeout: float
3844
4777
  :param _request_timeout: timeout setting for this request. If one
@@ -3863,19 +4796,18 @@ class DocumentIngestionApi:
3863
4796
  :return: Returns the result object.
3864
4797
  """ # noqa: E501
3865
4798
 
3866
- _param = self._ingest_agent_only_to_standard_serialize(
4799
+ _param = self._ingest_from_azure_blob_storage_serialize(
3867
4800
  collection_id=collection_id,
3868
- document_id=document_id,
4801
+ ingest_from_azure_blob_storage_body=ingest_from_azure_blob_storage_body,
3869
4802
  gen_doc_summaries=gen_doc_summaries,
3870
4803
  gen_doc_questions=gen_doc_questions,
3871
4804
  audio_input_language=audio_input_language,
3872
4805
  ocr_model=ocr_model,
3873
- restricted=restricted,
3874
- permissions=permissions,
3875
4806
  tesseract_lang=tesseract_lang,
3876
4807
  keep_tables_as_one_chunk=keep_tables_as_one_chunk,
3877
4808
  chunk_by_page=chunk_by_page,
3878
4809
  handwriting_check=handwriting_check,
4810
+ ingest_mode=ingest_mode,
3879
4811
  timeout=timeout,
3880
4812
  _request_auth=_request_auth,
3881
4813
  _content_type=_content_type,
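And a comparable sketch for the Azure Blob Storage ingestion call, whose signature gains the optional ingest_mode parameter in this release. The request body model's fields are not shown in this hunk, so the body is taken as a pre-built argument; the import path assumes the models package re-exports the class, as is usual for this generated client:

from h2ogpte.rest_async.api.document_ingestion_api import DocumentIngestionApi
from h2ogpte.rest_async.models import IngestFromAzureBlobStorageBody

async def ingest_from_azure(
    api: DocumentIngestionApi, body: IngestFromAzureBlobStorageBody
) -> None:
    # `body` carries the Azure Blob Storage source details (fields defined by the
    # IngestFromAzureBlobStorageBody model, not shown in this diff).
    await api.ingest_from_azure_blob_storage(
        collection_id="<collection-id>",           # placeholder
        ingest_from_azure_blob_storage_body=body,
        ingest_mode="standard",                    # optional parameter added here
    )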
@@ -3899,20 +4831,19 @@ class DocumentIngestionApi:
3899
4831
 
3900
4832
 
3901
4833
  @validate_call
3902
- async def ingest_agent_only_to_standard_with_http_info(
4834
+ async def ingest_from_azure_blob_storage_with_http_info(
3903
4835
  self,
3904
4836
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
3905
- document_id: Annotated[StrictStr, Field(description="String id of the document to be parsed.")],
4837
+ ingest_from_azure_blob_storage_body: IngestFromAzureBlobStorageBody,
3906
4838
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
3907
4839
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
3908
4840
  audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
3909
4841
  ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
3910
- restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
3911
- permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
3912
4842
  tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
3913
4843
  keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
3914
4844
  chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
3915
4845
  handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
+ ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
  timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
  _request_timeout: Union[
  None,
@@ -3927,14 +4858,14 @@ class DocumentIngestionApi:
  _headers: Optional[Dict[StrictStr, Any]] = None,
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
  ) -> ApiResponse[None]:
- """Converts files uploaded in \"agent_only\" ingest mode to PDF and parses them.
+ """Adds files from the Azure Blob Storage into a collection.

- Converts files uploaded in \"agent_only\" ingest mode to PDF and parses them.
+ Adds files from the Azure Blob Storage into a collection.

  :param collection_id: String id of the collection to add the ingested documents into. (required)
  :type collection_id: str
- :param document_id: String id of the document to be parsed. (required)
- :type document_id: str
+ :param ingest_from_azure_blob_storage_body: (required)
+ :type ingest_from_azure_blob_storage_body: IngestFromAzureBlobStorageBody
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
  :type gen_doc_summaries: bool
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
@@ -3943,10 +4874,6 @@ class DocumentIngestionApi:
  :type audio_input_language: str
  :param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
  :type ocr_model: str
- :param restricted: Whether the document should be restricted only to certain users.
- :type restricted: bool
- :param permissions: The list of usernames having permissions to the document.
- :type permissions: List[str]
  :param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
  :type tesseract_lang: str
  :param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
@@ -3955,6 +4882,8 @@ class DocumentIngestionApi:
  :type chunk_by_page: bool
  :param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
  :type handwriting_check: bool
+ :param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
+ :type ingest_mode: str
  :param timeout: Timeout in seconds
  :type timeout: float
  :param _request_timeout: timeout setting for this request. If one
@@ -3979,19 +4908,18 @@ class DocumentIngestionApi:
  :return: Returns the result object.
  """ # noqa: E501

- _param = self._ingest_agent_only_to_standard_serialize(
+ _param = self._ingest_from_azure_blob_storage_serialize(
  collection_id=collection_id,
- document_id=document_id,
+ ingest_from_azure_blob_storage_body=ingest_from_azure_blob_storage_body,
  gen_doc_summaries=gen_doc_summaries,
  gen_doc_questions=gen_doc_questions,
  audio_input_language=audio_input_language,
  ocr_model=ocr_model,
- restricted=restricted,
- permissions=permissions,
  tesseract_lang=tesseract_lang,
  keep_tables_as_one_chunk=keep_tables_as_one_chunk,
  chunk_by_page=chunk_by_page,
  handwriting_check=handwriting_check,
+ ingest_mode=ingest_mode,
  timeout=timeout,
  _request_auth=_request_auth,
  _content_type=_content_type,
@@ -4015,20 +4943,19 @@ class DocumentIngestionApi:


  @validate_call
- async def ingest_agent_only_to_standard_without_preload_content(
+ async def ingest_from_azure_blob_storage_without_preload_content(
  self,
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
- document_id: Annotated[StrictStr, Field(description="String id of the document to be parsed.")],
+ ingest_from_azure_blob_storage_body: IngestFromAzureBlobStorageBody,
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
  audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
  ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
- restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
- permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
  tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
  keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
  chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
  handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
+ ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
  timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
  _request_timeout: Union[
  None,
@@ -4043,14 +4970,14 @@ class DocumentIngestionApi:
  _headers: Optional[Dict[StrictStr, Any]] = None,
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
  ) -> RESTResponseType:
- """Converts files uploaded in \"agent_only\" ingest mode to PDF and parses them.
+ """Adds files from the Azure Blob Storage into a collection.

- Converts files uploaded in \"agent_only\" ingest mode to PDF and parses them.
+ Adds files from the Azure Blob Storage into a collection.

  :param collection_id: String id of the collection to add the ingested documents into. (required)
  :type collection_id: str
- :param document_id: String id of the document to be parsed. (required)
- :type document_id: str
+ :param ingest_from_azure_blob_storage_body: (required)
+ :type ingest_from_azure_blob_storage_body: IngestFromAzureBlobStorageBody
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
  :type gen_doc_summaries: bool
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
@@ -4059,10 +4986,6 @@ class DocumentIngestionApi:
  :type audio_input_language: str
  :param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
  :type ocr_model: str
- :param restricted: Whether the document should be restricted only to certain users.
- :type restricted: bool
- :param permissions: The list of usernames having permissions to the document.
- :type permissions: List[str]
  :param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
  :type tesseract_lang: str
  :param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
@@ -4071,6 +4994,8 @@ class DocumentIngestionApi:
  :type chunk_by_page: bool
  :param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
  :type handwriting_check: bool
+ :param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
+ :type ingest_mode: str
  :param timeout: Timeout in seconds
  :type timeout: float
  :param _request_timeout: timeout setting for this request. If one
@@ -4095,19 +5020,18 @@ class DocumentIngestionApi:
  :return: Returns the result object.
  """ # noqa: E501

- _param = self._ingest_agent_only_to_standard_serialize(
+ _param = self._ingest_from_azure_blob_storage_serialize(
  collection_id=collection_id,
- document_id=document_id,
+ ingest_from_azure_blob_storage_body=ingest_from_azure_blob_storage_body,
  gen_doc_summaries=gen_doc_summaries,
  gen_doc_questions=gen_doc_questions,
  audio_input_language=audio_input_language,
  ocr_model=ocr_model,
- restricted=restricted,
- permissions=permissions,
  tesseract_lang=tesseract_lang,
  keep_tables_as_one_chunk=keep_tables_as_one_chunk,
  chunk_by_page=chunk_by_page,
  handwriting_check=handwriting_check,
+ ingest_mode=ingest_mode,
  timeout=timeout,
  _request_auth=_request_auth,
  _content_type=_content_type,
@@ -4126,20 +5050,19 @@ class DocumentIngestionApi:
  return response_data.response


- def _ingest_agent_only_to_standard_serialize(
+ def _ingest_from_azure_blob_storage_serialize(
  self,
  collection_id,
- document_id,
+ ingest_from_azure_blob_storage_body,
  gen_doc_summaries,
  gen_doc_questions,
  audio_input_language,
  ocr_model,
- restricted,
- permissions,
  tesseract_lang,
  keep_tables_as_one_chunk,
  chunk_by_page,
  handwriting_check,
+ ingest_mode,
  timeout,
  _request_auth,
  _content_type,
@@ -4150,7 +5073,6 @@ class DocumentIngestionApi:
  _host = None

  _collection_formats: Dict[str, str] = {
- 'permissions': 'multi',
  }

  _path_params: Dict[str, str] = {}
@@ -4168,10 +5090,6 @@ class DocumentIngestionApi:

  _query_params.append(('collection_id', collection_id))

- if document_id is not None:
-
- _query_params.append(('document_id', document_id))
-
  if gen_doc_summaries is not None:

  _query_params.append(('gen_doc_summaries', gen_doc_summaries))
@@ -4188,14 +5106,6 @@ class DocumentIngestionApi:

  _query_params.append(('ocr_model', ocr_model))

- if restricted is not None:
-
- _query_params.append(('restricted', restricted))
-
- if permissions is not None:
-
- _query_params.append(('permissions', permissions))
-
  if tesseract_lang is not None:

  _query_params.append(('tesseract_lang', tesseract_lang))
@@ -4212,6 +5122,10 @@ class DocumentIngestionApi:

  _query_params.append(('handwriting_check', handwriting_check))

+ if ingest_mode is not None:
+
+ _query_params.append(('ingest_mode', ingest_mode))
+
  if timeout is not None:

  _query_params.append(('timeout', timeout))
@@ -4219,6 +5133,8 @@ class DocumentIngestionApi:
  # process the header parameters
  # process the form parameters
  # process the body parameter
+ if ingest_from_azure_blob_storage_body is not None:
+ _body_params = ingest_from_azure_blob_storage_body


  # set the HTTP header `Accept`
@@ -4229,6 +5145,19 @@ class DocumentIngestionApi:
  ]
  )

+ # set the HTTP header `Content-Type`
+ if _content_type:
+ _header_params['Content-Type'] = _content_type
+ else:
+ _default_content_type = (
+ self.api_client.select_header_content_type(
+ [
+ 'application/json'
+ ]
+ )
+ )
+ if _default_content_type is not None:
+ _header_params['Content-Type'] = _default_content_type

  # authentication setting
  _auth_settings: List[str] = [
@@ -4237,7 +5166,7 @@ class DocumentIngestionApi:

  return self.api_client.param_serialize(
  method='POST',
- resource_path='/ingest/agent_only_to_standard',
+ resource_path='/ingest/azure_blob_storage',
  path_params=_path_params,
  query_params=_query_params,
  header_params=_header_params,
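In the new version, `ingest_from_azure_blob_storage` takes a required `IngestFromAzureBlobStorageBody` request body plus the shared ingestion options, including the new `ingest_mode` query parameter, and posts to `/ingest/azure_blob_storage`. Below is a minimal usage sketch of the async method; it assumes the usual OpenAPI-generated `Configuration`/`ApiClient` setup is exported from `h2ogpte.rest_async`, that the model classes are re-exported from `h2ogpte.rest_async.models`, and that the host URL, credential, and field names passed to `from_dict()` are illustrative placeholders rather than confirmed parts of the package.

```python
# A minimal sketch, not a verified end-to-end example. Assumptions (not shown
# in this diff): Configuration/ApiClient follow the standard OpenAPI-generated
# asyncio client pattern, bearer auth is set via `access_token`, and the dict
# given to from_dict() uses placeholder field names -- consult
# IngestFromAzureBlobStorageBody for the real attributes.
import asyncio

from h2ogpte.rest_async import ApiClient, Configuration
from h2ogpte.rest_async.api.document_ingestion_api import DocumentIngestionApi
from h2ogpte.rest_async.models import IngestFromAzureBlobStorageBody


async def main() -> None:
    config = Configuration(host="https://h2ogpte.example.com/api/v1")  # hypothetical URL
    config.access_token = "<API key>"  # hypothetical credential

    async with ApiClient(config) as api_client:
        api = DocumentIngestionApi(api_client)

        # Placeholder body fields; replace with the actual schema of
        # IngestFromAzureBlobStorageBody (container, blob paths, credentials, ...).
        body = IngestFromAzureBlobStorageBody.from_dict(
            {"container": "reports", "keys": ["2024/q1.pdf"]}
        )

        # ingest_mode is the query parameter added in this release:
        # "standard" ingests for RAG, "agent_only" bypasses standard ingestion.
        await api.ingest_from_azure_blob_storage(
            collection_id="<collection id>",
            ingest_from_azure_blob_storage_body=body,
            ingest_mode="standard",
            gen_doc_summaries=False,
            timeout=600,
        )


asyncio.run(main())
```

Passing `ingest_mode="agent_only"` would instead skip standard ingestion, so the files are only usable with agents, as described in the parameter docstring above.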
@@ -4254,10 +5183,10 @@ class DocumentIngestionApi:


  @validate_call
- async def ingest_from_azure_blob_storage(
+ async def ingest_from_confluence(
  self,
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
- ingest_from_azure_blob_storage_body: IngestFromAzureBlobStorageBody,
+ ingest_from_confluence_body: IngestFromConfluenceBody,
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
  audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
@@ -4281,14 +5210,14 @@ class DocumentIngestionApi:
  _headers: Optional[Dict[StrictStr, Any]] = None,
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
  ) -> None:
- """Adds files from the Azure Blob Storage into a collection.
+ """Ingests confluence pages into collection.

- Adds files from the Azure Blob Storage into a collection.
+ Ingests confluence pages into collection. If an ingested page has sub-pages, the subpages are also ingested.

  :param collection_id: String id of the collection to add the ingested documents into. (required)
  :type collection_id: str
- :param ingest_from_azure_blob_storage_body: (required)
- :type ingest_from_azure_blob_storage_body: IngestFromAzureBlobStorageBody
+ :param ingest_from_confluence_body: (required)
+ :type ingest_from_confluence_body: IngestFromConfluenceBody
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
  :type gen_doc_summaries: bool
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
@@ -4331,9 +5260,9 @@ class DocumentIngestionApi:
  :return: Returns the result object.
  """ # noqa: E501

- _param = self._ingest_from_azure_blob_storage_serialize(
+ _param = self._ingest_from_confluence_serialize(
  collection_id=collection_id,
- ingest_from_azure_blob_storage_body=ingest_from_azure_blob_storage_body,
+ ingest_from_confluence_body=ingest_from_confluence_body,
  gen_doc_summaries=gen_doc_summaries,
  gen_doc_questions=gen_doc_questions,
  audio_input_language=audio_input_language,
@@ -4366,10 +5295,10 @@ class DocumentIngestionApi:


  @validate_call
- async def ingest_from_azure_blob_storage_with_http_info(
+ async def ingest_from_confluence_with_http_info(
  self,
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
- ingest_from_azure_blob_storage_body: IngestFromAzureBlobStorageBody,
+ ingest_from_confluence_body: IngestFromConfluenceBody,
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
  audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
@@ -4393,14 +5322,14 @@ class DocumentIngestionApi:
  _headers: Optional[Dict[StrictStr, Any]] = None,
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
  ) -> ApiResponse[None]:
- """Adds files from the Azure Blob Storage into a collection.
+ """Ingests confluence pages into collection.

- Adds files from the Azure Blob Storage into a collection.
+ Ingests confluence pages into collection. If an ingested page has sub-pages, the subpages are also ingested.

  :param collection_id: String id of the collection to add the ingested documents into. (required)
  :type collection_id: str
- :param ingest_from_azure_blob_storage_body: (required)
- :type ingest_from_azure_blob_storage_body: IngestFromAzureBlobStorageBody
+ :param ingest_from_confluence_body: (required)
+ :type ingest_from_confluence_body: IngestFromConfluenceBody
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
  :type gen_doc_summaries: bool
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
@@ -4443,9 +5372,9 @@ class DocumentIngestionApi:
  :return: Returns the result object.
  """ # noqa: E501

- _param = self._ingest_from_azure_blob_storage_serialize(
+ _param = self._ingest_from_confluence_serialize(
  collection_id=collection_id,
- ingest_from_azure_blob_storage_body=ingest_from_azure_blob_storage_body,
+ ingest_from_confluence_body=ingest_from_confluence_body,
  gen_doc_summaries=gen_doc_summaries,
  gen_doc_questions=gen_doc_questions,
  audio_input_language=audio_input_language,
@@ -4478,10 +5407,10 @@ class DocumentIngestionApi:


  @validate_call
- async def ingest_from_azure_blob_storage_without_preload_content(
+ async def ingest_from_confluence_without_preload_content(
  self,
  collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
- ingest_from_azure_blob_storage_body: IngestFromAzureBlobStorageBody,
+ ingest_from_confluence_body: IngestFromConfluenceBody,
  gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
  gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
  audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
@@ -4505,14 +5434,14 @@ class DocumentIngestionApi:
  _headers: Optional[Dict[StrictStr, Any]] = None,
  _host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
  ) -> RESTResponseType:
- """Adds files from the Azure Blob Storage into a collection.
+ """Ingests confluence pages into collection.

- Adds files from the Azure Blob Storage into a collection.
+ Ingests confluence pages into collection. If an ingested page has sub-pages, the subpages are also ingested.

  :param collection_id: String id of the collection to add the ingested documents into. (required)
  :type collection_id: str
- :param ingest_from_azure_blob_storage_body: (required)
- :type ingest_from_azure_blob_storage_body: IngestFromAzureBlobStorageBody
+ :param ingest_from_confluence_body: (required)
+ :type ingest_from_confluence_body: IngestFromConfluenceBody
  :param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
  :type gen_doc_summaries: bool
  :param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
@@ -4555,9 +5484,9 @@ class DocumentIngestionApi:
  :return: Returns the result object.
  """ # noqa: E501

- _param = self._ingest_from_azure_blob_storage_serialize(
+ _param = self._ingest_from_confluence_serialize(
  collection_id=collection_id,
- ingest_from_azure_blob_storage_body=ingest_from_azure_blob_storage_body,
+ ingest_from_confluence_body=ingest_from_confluence_body,
  gen_doc_summaries=gen_doc_summaries,
  gen_doc_questions=gen_doc_questions,
  audio_input_language=audio_input_language,
@@ -4585,10 +5514,10 @@ class DocumentIngestionApi:
  return response_data.response


- def _ingest_from_azure_blob_storage_serialize(
+ def _ingest_from_confluence_serialize(
  self,
  collection_id,
- ingest_from_azure_blob_storage_body,
+ ingest_from_confluence_body,
  gen_doc_summaries,
  gen_doc_questions,
  audio_input_language,
@@ -4668,8 +5597,8 @@ class DocumentIngestionApi:
  # process the header parameters
  # process the form parameters
  # process the body parameter
- if ingest_from_azure_blob_storage_body is not None:
- _body_params = ingest_from_azure_blob_storage_body
+ if ingest_from_confluence_body is not None:
+ _body_params = ingest_from_confluence_body


  # set the HTTP header `Accept`
@@ -4701,7 +5630,7 @@ class DocumentIngestionApi:

  return self.api_client.param_serialize(
  method='POST',
- resource_path='/ingest/azure_blob_storage',
+ resource_path='/ingest/confluence',
  path_params=_path_params,
  query_params=_query_params,
  header_params=_header_params,
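The newly added `ingest_from_confluence` variants mirror the Azure Blob Storage methods: a required `IngestFromConfluenceBody`, the same optional ingestion settings as query parameters, and a POST to `/ingest/confluence`; per the docstring, sub-pages of each ingested page are ingested as well. A sketch under the same assumptions as the Azure example above (generated-client setup; the `from_dict()` field names here are guesses, not the documented schema):

```python
# Sketch only, with the same caveats as the Azure example: client construction
# follows the typical generated asyncio pattern, and the field names passed to
# from_dict() (url, page_ids) are illustrative guesses, not the documented
# schema of IngestFromConfluenceBody.
import asyncio

from h2ogpte.rest_async import ApiClient, Configuration
from h2ogpte.rest_async.api.document_ingestion_api import DocumentIngestionApi
from h2ogpte.rest_async.models import IngestFromConfluenceBody


async def ingest_confluence_pages(collection_id: str) -> None:
    config = Configuration(host="https://h2ogpte.example.com/api/v1")  # hypothetical URL
    config.access_token = "<API key>"  # hypothetical credential

    async with ApiClient(config) as api_client:
        api = DocumentIngestionApi(api_client)

        body = IngestFromConfluenceBody.from_dict(
            {
                "url": "https://example.atlassian.net/wiki",  # placeholder
                "page_ids": ["123456"],  # placeholder; sub-pages are ingested too
            }
        )

        await api.ingest_from_confluence(
            collection_id=collection_id,
            ingest_from_confluence_body=body,
            gen_doc_summaries=True,
            timeout=900,
        )


asyncio.run(ingest_confluence_pages("<collection id>"))
```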