h2ogpte 1.6.43rc2__py3-none-any.whl → 1.6.43rc5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- h2ogpte/__init__.py +1 -1
- h2ogpte/connectors.py +11 -0
- h2ogpte/h2ogpte.py +86 -0
- h2ogpte/h2ogpte_async.py +87 -0
- h2ogpte/rest_async/__init__.py +3 -1
- h2ogpte/rest_async/api/document_ingestion_api.py +1365 -436
- h2ogpte/rest_async/api_client.py +1 -1
- h2ogpte/rest_async/configuration.py +1 -1
- h2ogpte/rest_async/models/__init__.py +2 -0
- h2ogpte/rest_async/models/confluence_credentials.py +89 -0
- h2ogpte/rest_async/models/ingest_from_confluence_body.py +97 -0
- h2ogpte/rest_sync/__init__.py +3 -1
- h2ogpte/rest_sync/api/document_ingestion_api.py +1365 -436
- h2ogpte/rest_sync/api_client.py +1 -1
- h2ogpte/rest_sync/configuration.py +1 -1
- h2ogpte/rest_sync/models/__init__.py +2 -0
- h2ogpte/rest_sync/models/confluence_credentials.py +89 -0
- h2ogpte/rest_sync/models/ingest_from_confluence_body.py +97 -0
- h2ogpte/session.py +8 -0
- h2ogpte/session_async.py +8 -0
- {h2ogpte-1.6.43rc2.dist-info → h2ogpte-1.6.43rc5.dist-info}/METADATA +1 -1
- {h2ogpte-1.6.43rc2.dist-info → h2ogpte-1.6.43rc5.dist-info}/RECORD +25 -21
- {h2ogpte-1.6.43rc2.dist-info → h2ogpte-1.6.43rc5.dist-info}/WHEEL +0 -0
- {h2ogpte-1.6.43rc2.dist-info → h2ogpte-1.6.43rc5.dist-info}/entry_points.txt +0 -0
- {h2ogpte-1.6.43rc2.dist-info → h2ogpte-1.6.43rc5.dist-info}/top_level.txt +0 -0
|
@@ -20,6 +20,7 @@ from pydantic import Field, StrictBool, StrictBytes, StrictFloat, StrictInt, Str
|
|
|
20
20
|
from typing import List, Optional, Tuple, Union
|
|
21
21
|
from typing_extensions import Annotated
|
|
22
22
|
from h2ogpte.rest_sync.models.ingest_from_azure_blob_storage_body import IngestFromAzureBlobStorageBody
|
|
23
|
+
from h2ogpte.rest_sync.models.ingest_from_confluence_body import IngestFromConfluenceBody
|
|
23
24
|
from h2ogpte.rest_sync.models.ingest_from_file_system_body import IngestFromFileSystemBody
|
|
24
25
|
from h2ogpte.rest_sync.models.ingest_from_gcs_body import IngestFromGcsBody
|
|
25
26
|
from h2ogpte.rest_sync.models.ingest_from_s3_body import IngestFromS3Body
|
|
@@ -982,10 +983,10 @@ class DocumentIngestionApi:
|
|
|
982
983
|
|
|
983
984
|
|
|
984
985
|
@validate_call
|
|
985
|
-
def
|
|
986
|
+
def create_ingest_from_confluence_job(
|
|
986
987
|
self,
|
|
987
988
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
988
|
-
|
|
989
|
+
ingest_from_confluence_body: IngestFromConfluenceBody,
|
|
989
990
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
990
991
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
991
992
|
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
@@ -1009,14 +1010,14 @@ class DocumentIngestionApi:
|
|
|
1009
1010
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
1010
1011
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
1011
1012
|
) -> JobDetails:
|
|
1012
|
-
"""Creates a job to
|
|
1013
|
+
"""Creates a job to ingest confluence pages into collection.
|
|
1013
1014
|
|
|
1014
|
-
Creates a job to
|
|
1015
|
+
Creates a job to ingest confluence pages into collection. If an ingested page has sub-pages, the sub-pages are also ingested.
|
|
1015
1016
|
|
|
1016
1017
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
1017
1018
|
:type collection_id: str
|
|
1018
|
-
:param
|
|
1019
|
-
:type
|
|
1019
|
+
:param ingest_from_confluence_body: (required)
|
|
1020
|
+
:type ingest_from_confluence_body: IngestFromConfluenceBody
|
|
1020
1021
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
1021
1022
|
:type gen_doc_summaries: bool
|
|
1022
1023
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
@@ -1059,9 +1060,9 @@ class DocumentIngestionApi:
|
|
|
1059
1060
|
:return: Returns the result object.
|
|
1060
1061
|
""" # noqa: E501
|
|
1061
1062
|
|
|
1062
|
-
_param = self.
|
|
1063
|
+
_param = self._create_ingest_from_confluence_job_serialize(
|
|
1063
1064
|
collection_id=collection_id,
|
|
1064
|
-
|
|
1065
|
+
ingest_from_confluence_body=ingest_from_confluence_body,
|
|
1065
1066
|
gen_doc_summaries=gen_doc_summaries,
|
|
1066
1067
|
gen_doc_questions=gen_doc_questions,
|
|
1067
1068
|
audio_input_language=audio_input_language,
|
|
@@ -1094,10 +1095,10 @@ class DocumentIngestionApi:
|
|
|
1094
1095
|
|
|
1095
1096
|
|
|
1096
1097
|
@validate_call
|
|
1097
|
-
def
|
|
1098
|
+
def create_ingest_from_confluence_job_with_http_info(
|
|
1098
1099
|
self,
|
|
1099
1100
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
1100
|
-
|
|
1101
|
+
ingest_from_confluence_body: IngestFromConfluenceBody,
|
|
1101
1102
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
1102
1103
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
1103
1104
|
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
@@ -1121,14 +1122,14 @@ class DocumentIngestionApi:
|
|
|
1121
1122
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
1122
1123
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
1123
1124
|
) -> ApiResponse[JobDetails]:
|
|
1124
|
-
"""Creates a job to
|
|
1125
|
+
"""Creates a job to ingest confluence pages into collection.
|
|
1125
1126
|
|
|
1126
|
-
Creates a job to
|
|
1127
|
+
Creates a job to ingest confluence pages into collection. If an ingested page has sub-pages, the sub-pages are also ingested.
|
|
1127
1128
|
|
|
1128
1129
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
1129
1130
|
:type collection_id: str
|
|
1130
|
-
:param
|
|
1131
|
-
:type
|
|
1131
|
+
:param ingest_from_confluence_body: (required)
|
|
1132
|
+
:type ingest_from_confluence_body: IngestFromConfluenceBody
|
|
1132
1133
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
1133
1134
|
:type gen_doc_summaries: bool
|
|
1134
1135
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
@@ -1171,9 +1172,9 @@ class DocumentIngestionApi:
|
|
|
1171
1172
|
:return: Returns the result object.
|
|
1172
1173
|
""" # noqa: E501
|
|
1173
1174
|
|
|
1174
|
-
_param = self.
|
|
1175
|
+
_param = self._create_ingest_from_confluence_job_serialize(
|
|
1175
1176
|
collection_id=collection_id,
|
|
1176
|
-
|
|
1177
|
+
ingest_from_confluence_body=ingest_from_confluence_body,
|
|
1177
1178
|
gen_doc_summaries=gen_doc_summaries,
|
|
1178
1179
|
gen_doc_questions=gen_doc_questions,
|
|
1179
1180
|
audio_input_language=audio_input_language,
|
|
@@ -1206,10 +1207,10 @@ class DocumentIngestionApi:
|
|
|
1206
1207
|
|
|
1207
1208
|
|
|
1208
1209
|
@validate_call
|
|
1209
|
-
def
|
|
1210
|
+
def create_ingest_from_confluence_job_without_preload_content(
|
|
1210
1211
|
self,
|
|
1211
1212
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
1212
|
-
|
|
1213
|
+
ingest_from_confluence_body: IngestFromConfluenceBody,
|
|
1213
1214
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
1214
1215
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
1215
1216
|
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
@@ -1233,14 +1234,14 @@ class DocumentIngestionApi:
|
|
|
1233
1234
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
1234
1235
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
1235
1236
|
) -> RESTResponseType:
|
|
1236
|
-
"""Creates a job to
|
|
1237
|
+
"""Creates a job to ingest confluence pages into collection.
|
|
1237
1238
|
|
|
1238
|
-
Creates a job to
|
|
1239
|
+
Creates a job to ingest confluence pages into collection. If an ingested page has sub-pages, the sub-pages are also ingested.
|
|
1239
1240
|
|
|
1240
1241
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
1241
1242
|
:type collection_id: str
|
|
1242
|
-
:param
|
|
1243
|
-
:type
|
|
1243
|
+
:param ingest_from_confluence_body: (required)
|
|
1244
|
+
:type ingest_from_confluence_body: IngestFromConfluenceBody
|
|
1244
1245
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
1245
1246
|
:type gen_doc_summaries: bool
|
|
1246
1247
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
@@ -1283,9 +1284,9 @@ class DocumentIngestionApi:
|
|
|
1283
1284
|
:return: Returns the result object.
|
|
1284
1285
|
""" # noqa: E501
|
|
1285
1286
|
|
|
1286
|
-
_param = self.
|
|
1287
|
+
_param = self._create_ingest_from_confluence_job_serialize(
|
|
1287
1288
|
collection_id=collection_id,
|
|
1288
|
-
|
|
1289
|
+
ingest_from_confluence_body=ingest_from_confluence_body,
|
|
1289
1290
|
gen_doc_summaries=gen_doc_summaries,
|
|
1290
1291
|
gen_doc_questions=gen_doc_questions,
|
|
1291
1292
|
audio_input_language=audio_input_language,
|
|
@@ -1313,10 +1314,10 @@ class DocumentIngestionApi:
|
|
|
1313
1314
|
return response_data.response
|
|
1314
1315
|
|
|
1315
1316
|
|
|
1316
|
-
def
|
|
1317
|
+
def _create_ingest_from_confluence_job_serialize(
|
|
1317
1318
|
self,
|
|
1318
1319
|
collection_id,
|
|
1319
|
-
|
|
1320
|
+
ingest_from_confluence_body,
|
|
1320
1321
|
gen_doc_summaries,
|
|
1321
1322
|
gen_doc_questions,
|
|
1322
1323
|
audio_input_language,
|
|
@@ -1396,8 +1397,8 @@ class DocumentIngestionApi:
|
|
|
1396
1397
|
# process the header parameters
|
|
1397
1398
|
# process the form parameters
|
|
1398
1399
|
# process the body parameter
|
|
1399
|
-
if
|
|
1400
|
-
_body_params =
|
|
1400
|
+
if ingest_from_confluence_body is not None:
|
|
1401
|
+
_body_params = ingest_from_confluence_body
|
|
1401
1402
|
|
|
1402
1403
|
|
|
1403
1404
|
# set the HTTP header `Accept`
|
|
@@ -1429,7 +1430,7 @@ class DocumentIngestionApi:
|
|
|
1429
1430
|
|
|
1430
1431
|
return self.api_client.param_serialize(
|
|
1431
1432
|
method='POST',
|
|
1432
|
-
resource_path='/ingest/
|
|
1433
|
+
resource_path='/ingest/confluence/job',
|
|
1433
1434
|
path_params=_path_params,
|
|
1434
1435
|
query_params=_query_params,
|
|
1435
1436
|
header_params=_header_params,
|
|
@@ -1446,10 +1447,10 @@ class DocumentIngestionApi:
|
|
|
1446
1447
|
|
|
1447
1448
|
|
|
1448
1449
|
@validate_call
|
|
1449
|
-
def
|
|
1450
|
+
def create_ingest_from_file_system_job(
|
|
1450
1451
|
self,
|
|
1451
1452
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
1452
|
-
|
|
1453
|
+
ingest_from_file_system_body: IngestFromFileSystemBody,
|
|
1453
1454
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
1454
1455
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
1455
1456
|
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
@@ -1473,14 +1474,14 @@ class DocumentIngestionApi:
|
|
|
1473
1474
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
1474
1475
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
1475
1476
|
) -> JobDetails:
|
|
1476
|
-
"""Creates a job to add files from the
|
|
1477
|
+
"""Creates a job to add files from the local system into a collection.
|
|
1477
1478
|
|
|
1478
|
-
Creates a job to add files from the
|
|
1479
|
+
Creates a job to add files from the local system into a collection.
|
|
1479
1480
|
|
|
1480
1481
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
1481
1482
|
:type collection_id: str
|
|
1482
|
-
:param
|
|
1483
|
-
:type
|
|
1483
|
+
:param ingest_from_file_system_body: (required)
|
|
1484
|
+
:type ingest_from_file_system_body: IngestFromFileSystemBody
|
|
1484
1485
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
1485
1486
|
:type gen_doc_summaries: bool
|
|
1486
1487
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
@@ -1523,9 +1524,9 @@ class DocumentIngestionApi:
|
|
|
1523
1524
|
:return: Returns the result object.
|
|
1524
1525
|
""" # noqa: E501
|
|
1525
1526
|
|
|
1526
|
-
_param = self.
|
|
1527
|
+
_param = self._create_ingest_from_file_system_job_serialize(
|
|
1527
1528
|
collection_id=collection_id,
|
|
1528
|
-
|
|
1529
|
+
ingest_from_file_system_body=ingest_from_file_system_body,
|
|
1529
1530
|
gen_doc_summaries=gen_doc_summaries,
|
|
1530
1531
|
gen_doc_questions=gen_doc_questions,
|
|
1531
1532
|
audio_input_language=audio_input_language,
|
|
@@ -1558,10 +1559,10 @@ class DocumentIngestionApi:
|
|
|
1558
1559
|
|
|
1559
1560
|
|
|
1560
1561
|
@validate_call
|
|
1561
|
-
def
|
|
1562
|
+
def create_ingest_from_file_system_job_with_http_info(
|
|
1562
1563
|
self,
|
|
1563
1564
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
1564
|
-
|
|
1565
|
+
ingest_from_file_system_body: IngestFromFileSystemBody,
|
|
1565
1566
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
1566
1567
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
1567
1568
|
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
@@ -1585,14 +1586,14 @@ class DocumentIngestionApi:
|
|
|
1585
1586
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
1586
1587
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
1587
1588
|
) -> ApiResponse[JobDetails]:
|
|
1588
|
-
"""Creates a job to add files from the
|
|
1589
|
+
"""Creates a job to add files from the local system into a collection.
|
|
1589
1590
|
|
|
1590
|
-
Creates a job to add files from the
|
|
1591
|
+
Creates a job to add files from the local system into a collection.
|
|
1591
1592
|
|
|
1592
1593
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
1593
1594
|
:type collection_id: str
|
|
1594
|
-
:param
|
|
1595
|
-
:type
|
|
1595
|
+
:param ingest_from_file_system_body: (required)
|
|
1596
|
+
:type ingest_from_file_system_body: IngestFromFileSystemBody
|
|
1596
1597
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
1597
1598
|
:type gen_doc_summaries: bool
|
|
1598
1599
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
@@ -1635,9 +1636,9 @@ class DocumentIngestionApi:
|
|
|
1635
1636
|
:return: Returns the result object.
|
|
1636
1637
|
""" # noqa: E501
|
|
1637
1638
|
|
|
1638
|
-
_param = self.
|
|
1639
|
+
_param = self._create_ingest_from_file_system_job_serialize(
|
|
1639
1640
|
collection_id=collection_id,
|
|
1640
|
-
|
|
1641
|
+
ingest_from_file_system_body=ingest_from_file_system_body,
|
|
1641
1642
|
gen_doc_summaries=gen_doc_summaries,
|
|
1642
1643
|
gen_doc_questions=gen_doc_questions,
|
|
1643
1644
|
audio_input_language=audio_input_language,
|
|
@@ -1670,10 +1671,10 @@ class DocumentIngestionApi:
|
|
|
1670
1671
|
|
|
1671
1672
|
|
|
1672
1673
|
@validate_call
|
|
1673
|
-
def
|
|
1674
|
+
def create_ingest_from_file_system_job_without_preload_content(
|
|
1674
1675
|
self,
|
|
1675
1676
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
1676
|
-
|
|
1677
|
+
ingest_from_file_system_body: IngestFromFileSystemBody,
|
|
1677
1678
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
1678
1679
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
1679
1680
|
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
@@ -1697,14 +1698,14 @@ class DocumentIngestionApi:
|
|
|
1697
1698
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
1698
1699
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
1699
1700
|
) -> RESTResponseType:
|
|
1700
|
-
"""Creates a job to add files from the
|
|
1701
|
+
"""Creates a job to add files from the local system into a collection.
|
|
1701
1702
|
|
|
1702
|
-
Creates a job to add files from the
|
|
1703
|
+
Creates a job to add files from the local system into a collection.
|
|
1703
1704
|
|
|
1704
1705
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
1705
1706
|
:type collection_id: str
|
|
1706
|
-
:param
|
|
1707
|
-
:type
|
|
1707
|
+
:param ingest_from_file_system_body: (required)
|
|
1708
|
+
:type ingest_from_file_system_body: IngestFromFileSystemBody
|
|
1708
1709
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
1709
1710
|
:type gen_doc_summaries: bool
|
|
1710
1711
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
@@ -1747,9 +1748,9 @@ class DocumentIngestionApi:
|
|
|
1747
1748
|
:return: Returns the result object.
|
|
1748
1749
|
""" # noqa: E501
|
|
1749
1750
|
|
|
1750
|
-
_param = self.
|
|
1751
|
+
_param = self._create_ingest_from_file_system_job_serialize(
|
|
1751
1752
|
collection_id=collection_id,
|
|
1752
|
-
|
|
1753
|
+
ingest_from_file_system_body=ingest_from_file_system_body,
|
|
1753
1754
|
gen_doc_summaries=gen_doc_summaries,
|
|
1754
1755
|
gen_doc_questions=gen_doc_questions,
|
|
1755
1756
|
audio_input_language=audio_input_language,
|
|
@@ -1777,10 +1778,10 @@ class DocumentIngestionApi:
|
|
|
1777
1778
|
return response_data.response
|
|
1778
1779
|
|
|
1779
1780
|
|
|
1780
|
-
def
|
|
1781
|
+
def _create_ingest_from_file_system_job_serialize(
|
|
1781
1782
|
self,
|
|
1782
1783
|
collection_id,
|
|
1783
|
-
|
|
1784
|
+
ingest_from_file_system_body,
|
|
1784
1785
|
gen_doc_summaries,
|
|
1785
1786
|
gen_doc_questions,
|
|
1786
1787
|
audio_input_language,
|
|
@@ -1860,8 +1861,8 @@ class DocumentIngestionApi:
|
|
|
1860
1861
|
# process the header parameters
|
|
1861
1862
|
# process the form parameters
|
|
1862
1863
|
# process the body parameter
|
|
1863
|
-
if
|
|
1864
|
-
_body_params =
|
|
1864
|
+
if ingest_from_file_system_body is not None:
|
|
1865
|
+
_body_params = ingest_from_file_system_body
|
|
1865
1866
|
|
|
1866
1867
|
|
|
1867
1868
|
# set the HTTP header `Accept`
|
|
@@ -1893,7 +1894,7 @@ class DocumentIngestionApi:
|
|
|
1893
1894
|
|
|
1894
1895
|
return self.api_client.param_serialize(
|
|
1895
1896
|
method='POST',
|
|
1896
|
-
resource_path='/ingest/
|
|
1897
|
+
resource_path='/ingest/file_system/job',
|
|
1897
1898
|
path_params=_path_params,
|
|
1898
1899
|
query_params=_query_params,
|
|
1899
1900
|
header_params=_header_params,
|
|
@@ -1910,14 +1911,19 @@ class DocumentIngestionApi:
|
|
|
1910
1911
|
|
|
1911
1912
|
|
|
1912
1913
|
@validate_call
|
|
1913
|
-
def
|
|
1914
|
+
def create_ingest_from_gcs_job(
|
|
1914
1915
|
self,
|
|
1915
1916
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
1916
|
-
|
|
1917
|
-
body: Annotated[StrictStr, Field(description="The text that will ingested into a collection.")],
|
|
1917
|
+
ingest_from_gcs_body: IngestFromGcsBody,
|
|
1918
1918
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
1919
1919
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
1920
|
-
|
|
1920
|
+
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
1921
|
+
ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
|
|
1922
|
+
tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
|
|
1923
|
+
keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
|
|
1924
|
+
chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
|
|
1925
|
+
handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
|
|
1926
|
+
ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
|
|
1921
1927
|
timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
|
|
1922
1928
|
_request_timeout: Union[
|
|
1923
1929
|
None,
|
|
@@ -1932,22 +1938,32 @@ class DocumentIngestionApi:
|
|
|
1932
1938
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
1933
1939
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
1934
1940
|
) -> JobDetails:
|
|
1935
|
-
"""Creates a job to add
|
|
1941
|
+
"""Creates a job to add files from the Google Cloud Storage into a collection.
|
|
1936
1942
|
|
|
1937
|
-
Creates a job to add
|
|
1943
|
+
Creates a job to add files from the Google Cloud Storage into a collection.
|
|
1938
1944
|
|
|
1939
1945
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
1940
1946
|
:type collection_id: str
|
|
1941
|
-
:param
|
|
1942
|
-
:type
|
|
1943
|
-
:param body: The text that will ingested into a collection. (required)
|
|
1944
|
-
:type body: str
|
|
1947
|
+
:param ingest_from_gcs_body: (required)
|
|
1948
|
+
:type ingest_from_gcs_body: IngestFromGcsBody
|
|
1945
1949
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
1946
1950
|
:type gen_doc_summaries: bool
|
|
1947
1951
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
1948
1952
|
:type gen_doc_questions: bool
|
|
1949
|
-
:param
|
|
1950
|
-
:type
|
|
1953
|
+
:param audio_input_language: Language of audio files.
|
|
1954
|
+
:type audio_input_language: str
|
|
1955
|
+
:param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
|
|
1956
|
+
:type ocr_model: str
|
|
1957
|
+
:param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
|
|
1958
|
+
:type tesseract_lang: str
|
|
1959
|
+
:param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
|
|
1960
|
+
:type keep_tables_as_one_chunk: bool
|
|
1961
|
+
:param chunk_by_page: Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.
|
|
1962
|
+
:type chunk_by_page: bool
|
|
1963
|
+
:param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
|
|
1964
|
+
:type handwriting_check: bool
|
|
1965
|
+
:param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
|
|
1966
|
+
:type ingest_mode: str
|
|
1951
1967
|
:param timeout: Timeout in seconds
|
|
1952
1968
|
:type timeout: float
|
|
1953
1969
|
:param _request_timeout: timeout setting for this request. If one
|
|
@@ -1972,13 +1988,18 @@ class DocumentIngestionApi:
|
|
|
1972
1988
|
:return: Returns the result object.
|
|
1973
1989
|
""" # noqa: E501
|
|
1974
1990
|
|
|
1975
|
-
_param = self.
|
|
1991
|
+
_param = self._create_ingest_from_gcs_job_serialize(
|
|
1976
1992
|
collection_id=collection_id,
|
|
1977
|
-
|
|
1978
|
-
body=body,
|
|
1993
|
+
ingest_from_gcs_body=ingest_from_gcs_body,
|
|
1979
1994
|
gen_doc_summaries=gen_doc_summaries,
|
|
1980
1995
|
gen_doc_questions=gen_doc_questions,
|
|
1981
|
-
|
|
1996
|
+
audio_input_language=audio_input_language,
|
|
1997
|
+
ocr_model=ocr_model,
|
|
1998
|
+
tesseract_lang=tesseract_lang,
|
|
1999
|
+
keep_tables_as_one_chunk=keep_tables_as_one_chunk,
|
|
2000
|
+
chunk_by_page=chunk_by_page,
|
|
2001
|
+
handwriting_check=handwriting_check,
|
|
2002
|
+
ingest_mode=ingest_mode,
|
|
1982
2003
|
timeout=timeout,
|
|
1983
2004
|
_request_auth=_request_auth,
|
|
1984
2005
|
_content_type=_content_type,
|
|
@@ -2002,14 +2023,19 @@ class DocumentIngestionApi:
|
|
|
2002
2023
|
|
|
2003
2024
|
|
|
2004
2025
|
@validate_call
|
|
2005
|
-
def
|
|
2026
|
+
def create_ingest_from_gcs_job_with_http_info(
|
|
2006
2027
|
self,
|
|
2007
2028
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
2008
|
-
|
|
2009
|
-
body: Annotated[StrictStr, Field(description="The text that will ingested into a collection.")],
|
|
2029
|
+
ingest_from_gcs_body: IngestFromGcsBody,
|
|
2010
2030
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
2011
2031
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
2012
|
-
|
|
2032
|
+
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
2033
|
+
ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
|
|
2034
|
+
tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
|
|
2035
|
+
keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
|
|
2036
|
+
chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
|
|
2037
|
+
handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
|
|
2038
|
+
ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
|
|
2013
2039
|
timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
|
|
2014
2040
|
_request_timeout: Union[
|
|
2015
2041
|
None,
|
|
@@ -2024,22 +2050,32 @@ class DocumentIngestionApi:
|
|
|
2024
2050
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
2025
2051
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
2026
2052
|
) -> ApiResponse[JobDetails]:
|
|
2027
|
-
"""Creates a job to add
|
|
2053
|
+
"""Creates a job to add files from the Google Cloud Storage into a collection.
|
|
2028
2054
|
|
|
2029
|
-
Creates a job to add
|
|
2055
|
+
Creates a job to add files from the Google Cloud Storage into a collection.
|
|
2030
2056
|
|
|
2031
2057
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
2032
2058
|
:type collection_id: str
|
|
2033
|
-
:param
|
|
2034
|
-
:type
|
|
2035
|
-
:param body: The text that will ingested into a collection. (required)
|
|
2036
|
-
:type body: str
|
|
2059
|
+
:param ingest_from_gcs_body: (required)
|
|
2060
|
+
:type ingest_from_gcs_body: IngestFromGcsBody
|
|
2037
2061
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
2038
2062
|
:type gen_doc_summaries: bool
|
|
2039
2063
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
2040
2064
|
:type gen_doc_questions: bool
|
|
2041
|
-
:param
|
|
2042
|
-
:type
|
|
2065
|
+
:param audio_input_language: Language of audio files.
|
|
2066
|
+
:type audio_input_language: str
|
|
2067
|
+
:param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
|
|
2068
|
+
:type ocr_model: str
|
|
2069
|
+
:param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
|
|
2070
|
+
:type tesseract_lang: str
|
|
2071
|
+
:param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
|
|
2072
|
+
:type keep_tables_as_one_chunk: bool
|
|
2073
|
+
:param chunk_by_page: Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.
|
|
2074
|
+
:type chunk_by_page: bool
|
|
2075
|
+
:param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
|
|
2076
|
+
:type handwriting_check: bool
|
|
2077
|
+
:param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
|
|
2078
|
+
:type ingest_mode: str
|
|
2043
2079
|
:param timeout: Timeout in seconds
|
|
2044
2080
|
:type timeout: float
|
|
2045
2081
|
:param _request_timeout: timeout setting for this request. If one
|
|
@@ -2064,13 +2100,18 @@ class DocumentIngestionApi:
|
|
|
2064
2100
|
:return: Returns the result object.
|
|
2065
2101
|
""" # noqa: E501
|
|
2066
2102
|
|
|
2067
|
-
_param = self.
|
|
2103
|
+
_param = self._create_ingest_from_gcs_job_serialize(
|
|
2068
2104
|
collection_id=collection_id,
|
|
2069
|
-
|
|
2070
|
-
body=body,
|
|
2105
|
+
ingest_from_gcs_body=ingest_from_gcs_body,
|
|
2071
2106
|
gen_doc_summaries=gen_doc_summaries,
|
|
2072
2107
|
gen_doc_questions=gen_doc_questions,
|
|
2073
|
-
|
|
2108
|
+
audio_input_language=audio_input_language,
|
|
2109
|
+
ocr_model=ocr_model,
|
|
2110
|
+
tesseract_lang=tesseract_lang,
|
|
2111
|
+
keep_tables_as_one_chunk=keep_tables_as_one_chunk,
|
|
2112
|
+
chunk_by_page=chunk_by_page,
|
|
2113
|
+
handwriting_check=handwriting_check,
|
|
2114
|
+
ingest_mode=ingest_mode,
|
|
2074
2115
|
timeout=timeout,
|
|
2075
2116
|
_request_auth=_request_auth,
|
|
2076
2117
|
_content_type=_content_type,
|
|
@@ -2094,14 +2135,19 @@ class DocumentIngestionApi:
|
|
|
2094
2135
|
|
|
2095
2136
|
|
|
2096
2137
|
@validate_call
|
|
2097
|
-
def
|
|
2138
|
+
def create_ingest_from_gcs_job_without_preload_content(
|
|
2098
2139
|
self,
|
|
2099
2140
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
2100
|
-
|
|
2101
|
-
body: Annotated[StrictStr, Field(description="The text that will ingested into a collection.")],
|
|
2141
|
+
ingest_from_gcs_body: IngestFromGcsBody,
|
|
2102
2142
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
2103
2143
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
2104
|
-
|
|
2144
|
+
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
2145
|
+
ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
|
|
2146
|
+
tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
|
|
2147
|
+
keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
|
|
2148
|
+
chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
|
|
2149
|
+
handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
|
|
2150
|
+
ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
|
|
2105
2151
|
timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
|
|
2106
2152
|
_request_timeout: Union[
|
|
2107
2153
|
None,
|
|
@@ -2116,22 +2162,32 @@ class DocumentIngestionApi:
|
|
|
2116
2162
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
2117
2163
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
2118
2164
|
) -> RESTResponseType:
|
|
2119
|
-
"""Creates a job to add
|
|
2165
|
+
"""Creates a job to add files from the Google Cloud Storage into a collection.
|
|
2120
2166
|
|
|
2121
|
-
Creates a job to add
|
|
2167
|
+
Creates a job to add files from the Google Cloud Storage into a collection.
|
|
2122
2168
|
|
|
2123
2169
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
2124
2170
|
:type collection_id: str
|
|
2125
|
-
:param
|
|
2126
|
-
:type
|
|
2127
|
-
:param body: The text that will ingested into a collection. (required)
|
|
2128
|
-
:type body: str
|
|
2171
|
+
:param ingest_from_gcs_body: (required)
|
|
2172
|
+
:type ingest_from_gcs_body: IngestFromGcsBody
|
|
2129
2173
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
2130
2174
|
:type gen_doc_summaries: bool
|
|
2131
2175
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
2132
2176
|
:type gen_doc_questions: bool
|
|
2133
|
-
:param
|
|
2134
|
-
:type
|
|
2177
|
+
:param audio_input_language: Language of audio files.
|
|
2178
|
+
:type audio_input_language: str
|
|
2179
|
+
:param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
|
|
2180
|
+
:type ocr_model: str
|
|
2181
|
+
:param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
|
|
2182
|
+
:type tesseract_lang: str
|
|
2183
|
+
:param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
|
|
2184
|
+
:type keep_tables_as_one_chunk: bool
|
|
2185
|
+
:param chunk_by_page: Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.
|
|
2186
|
+
:type chunk_by_page: bool
|
|
2187
|
+
:param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
|
|
2188
|
+
:type handwriting_check: bool
|
|
2189
|
+
:param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
|
|
2190
|
+
:type ingest_mode: str
|
|
2135
2191
|
:param timeout: Timeout in seconds
|
|
2136
2192
|
:type timeout: float
|
|
2137
2193
|
:param _request_timeout: timeout setting for this request. If one
|
|
@@ -2156,13 +2212,18 @@ class DocumentIngestionApi:
|
|
|
2156
2212
|
:return: Returns the result object.
|
|
2157
2213
|
""" # noqa: E501
|
|
2158
2214
|
|
|
2159
|
-
_param = self.
|
|
2215
|
+
_param = self._create_ingest_from_gcs_job_serialize(
|
|
2160
2216
|
collection_id=collection_id,
|
|
2161
|
-
|
|
2162
|
-
body=body,
|
|
2217
|
+
ingest_from_gcs_body=ingest_from_gcs_body,
|
|
2163
2218
|
gen_doc_summaries=gen_doc_summaries,
|
|
2164
2219
|
gen_doc_questions=gen_doc_questions,
|
|
2165
|
-
|
|
2220
|
+
audio_input_language=audio_input_language,
|
|
2221
|
+
ocr_model=ocr_model,
|
|
2222
|
+
tesseract_lang=tesseract_lang,
|
|
2223
|
+
keep_tables_as_one_chunk=keep_tables_as_one_chunk,
|
|
2224
|
+
chunk_by_page=chunk_by_page,
|
|
2225
|
+
handwriting_check=handwriting_check,
|
|
2226
|
+
ingest_mode=ingest_mode,
|
|
2166
2227
|
timeout=timeout,
|
|
2167
2228
|
_request_auth=_request_auth,
|
|
2168
2229
|
_content_type=_content_type,
|
|
@@ -2181,14 +2242,418 @@ class DocumentIngestionApi:
|
|
|
2181
2242
|
return response_data.response
|
|
2182
2243
|
|
|
2183
2244
|
|
|
2184
|
-
def
|
|
2245
|
+
def _create_ingest_from_gcs_job_serialize(
|
|
2185
2246
|
self,
|
|
2186
2247
|
collection_id,
|
|
2187
|
-
|
|
2188
|
-
body,
|
|
2248
|
+
ingest_from_gcs_body,
|
|
2189
2249
|
gen_doc_summaries,
|
|
2190
2250
|
gen_doc_questions,
|
|
2191
|
-
|
|
2251
|
+
audio_input_language,
|
|
2252
|
+
ocr_model,
|
|
2253
|
+
tesseract_lang,
|
|
2254
|
+
keep_tables_as_one_chunk,
|
|
2255
|
+
chunk_by_page,
|
|
2256
|
+
handwriting_check,
|
|
2257
|
+
ingest_mode,
|
|
2258
|
+
timeout,
|
|
2259
|
+
_request_auth,
|
|
2260
|
+
_content_type,
|
|
2261
|
+
_headers,
|
|
2262
|
+
_host_index,
|
|
2263
|
+
) -> RequestSerialized:
|
|
2264
|
+
|
|
2265
|
+
_host = None
|
|
2266
|
+
|
|
2267
|
+
_collection_formats: Dict[str, str] = {
|
|
2268
|
+
}
|
|
2269
|
+
|
|
2270
|
+
_path_params: Dict[str, str] = {}
|
|
2271
|
+
_query_params: List[Tuple[str, str]] = []
|
|
2272
|
+
_header_params: Dict[str, Optional[str]] = _headers or {}
|
|
2273
|
+
_form_params: List[Tuple[str, str]] = []
|
|
2274
|
+
_files: Dict[
|
|
2275
|
+
str, Union[str, bytes, List[str], List[bytes], List[Tuple[str, bytes]]]
|
|
2276
|
+
] = {}
|
|
2277
|
+
_body_params: Optional[bytes] = None
|
|
2278
|
+
|
|
2279
|
+
# process the path parameters
|
|
2280
|
+
# process the query parameters
|
|
2281
|
+
if collection_id is not None:
|
|
2282
|
+
|
|
2283
|
+
_query_params.append(('collection_id', collection_id))
|
|
2284
|
+
|
|
2285
|
+
if gen_doc_summaries is not None:
|
|
2286
|
+
|
|
2287
|
+
_query_params.append(('gen_doc_summaries', gen_doc_summaries))
|
|
2288
|
+
|
|
2289
|
+
if gen_doc_questions is not None:
|
|
2290
|
+
|
|
2291
|
+
_query_params.append(('gen_doc_questions', gen_doc_questions))
|
|
2292
|
+
|
|
2293
|
+
if audio_input_language is not None:
|
|
2294
|
+
|
|
2295
|
+
_query_params.append(('audio_input_language', audio_input_language))
|
|
2296
|
+
|
|
2297
|
+
if ocr_model is not None:
|
|
2298
|
+
|
|
2299
|
+
_query_params.append(('ocr_model', ocr_model))
|
|
2300
|
+
|
|
2301
|
+
if tesseract_lang is not None:
|
|
2302
|
+
|
|
2303
|
+
_query_params.append(('tesseract_lang', tesseract_lang))
|
|
2304
|
+
|
|
2305
|
+
if keep_tables_as_one_chunk is not None:
|
|
2306
|
+
|
|
2307
|
+
_query_params.append(('keep_tables_as_one_chunk', keep_tables_as_one_chunk))
|
|
2308
|
+
|
|
2309
|
+
if chunk_by_page is not None:
|
|
2310
|
+
|
|
2311
|
+
_query_params.append(('chunk_by_page', chunk_by_page))
|
|
2312
|
+
|
|
2313
|
+
if handwriting_check is not None:
|
|
2314
|
+
|
|
2315
|
+
_query_params.append(('handwriting_check', handwriting_check))
|
|
2316
|
+
|
|
2317
|
+
if ingest_mode is not None:
|
|
2318
|
+
|
|
2319
|
+
_query_params.append(('ingest_mode', ingest_mode))
|
|
2320
|
+
|
|
2321
|
+
if timeout is not None:
|
|
2322
|
+
|
|
2323
|
+
_query_params.append(('timeout', timeout))
|
|
2324
|
+
|
|
2325
|
+
# process the header parameters
|
|
2326
|
+
# process the form parameters
|
|
2327
|
+
# process the body parameter
|
|
2328
|
+
if ingest_from_gcs_body is not None:
|
|
2329
|
+
_body_params = ingest_from_gcs_body
|
|
2330
|
+
|
|
2331
|
+
|
|
2332
|
+
# set the HTTP header `Accept`
|
|
2333
|
+
if 'Accept' not in _header_params:
|
|
2334
|
+
_header_params['Accept'] = self.api_client.select_header_accept(
|
|
2335
|
+
[
|
|
2336
|
+
'application/json'
|
|
2337
|
+
]
|
|
2338
|
+
)
|
|
2339
|
+
|
|
2340
|
+
# set the HTTP header `Content-Type`
|
|
2341
|
+
if _content_type:
|
|
2342
|
+
_header_params['Content-Type'] = _content_type
|
|
2343
|
+
else:
|
|
2344
|
+
_default_content_type = (
|
|
2345
|
+
self.api_client.select_header_content_type(
|
|
2346
|
+
[
|
|
2347
|
+
'application/json'
|
|
2348
|
+
]
|
|
2349
|
+
)
|
|
2350
|
+
)
|
|
2351
|
+
if _default_content_type is not None:
|
|
2352
|
+
_header_params['Content-Type'] = _default_content_type
|
|
2353
|
+
|
|
2354
|
+
# authentication setting
|
|
2355
|
+
_auth_settings: List[str] = [
|
|
2356
|
+
'bearerAuth'
|
|
2357
|
+
]
|
|
2358
|
+
|
|
2359
|
+
return self.api_client.param_serialize(
|
|
2360
|
+
method='POST',
|
|
2361
|
+
resource_path='/ingest/gcs/job',
|
|
2362
|
+
path_params=_path_params,
|
|
2363
|
+
query_params=_query_params,
|
|
2364
|
+
header_params=_header_params,
|
|
2365
|
+
body=_body_params,
|
|
2366
|
+
post_params=_form_params,
|
|
2367
|
+
files=_files,
|
|
2368
|
+
auth_settings=_auth_settings,
|
|
2369
|
+
collection_formats=_collection_formats,
|
|
2370
|
+
_host=_host,
|
|
2371
|
+
_request_auth=_request_auth
|
|
2372
|
+
)
|
|
2373
|
+
|
|
2374
|
+
|
|
2375
|
+
|
|
2376
|
+
|
|
2377
|
+
@validate_call
|
|
2378
|
+
def create_ingest_from_plain_text_job(
|
|
2379
|
+
self,
|
|
2380
|
+
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
2381
|
+
file_name: Annotated[StrictStr, Field(description="String of the file name to use for the document.")],
|
|
2382
|
+
body: Annotated[StrictStr, Field(description="The text that will ingested into a collection.")],
|
|
2383
|
+
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
2384
|
+
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
2385
|
+
metadata: Annotated[Optional[StrictStr], Field(description="String with json-encoded metadata for the document.")] = None,
|
|
2386
|
+
timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
|
|
2387
|
+
_request_timeout: Union[
|
|
2388
|
+
None,
|
|
2389
|
+
Annotated[StrictFloat, Field(gt=0)],
|
|
2390
|
+
Tuple[
|
|
2391
|
+
Annotated[StrictFloat, Field(gt=0)],
|
|
2392
|
+
Annotated[StrictFloat, Field(gt=0)]
|
|
2393
|
+
]
|
|
2394
|
+
] = None,
|
|
2395
|
+
_request_auth: Optional[Dict[StrictStr, Any]] = None,
|
|
2396
|
+
_content_type: Optional[StrictStr] = None,
|
|
2397
|
+
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
2398
|
+
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
2399
|
+
) -> JobDetails:
|
|
2400
|
+
"""Creates a job to add plain text to a collection.
|
|
2401
|
+
|
|
2402
|
+
Creates a job to add plain text to a collection.
|
|
2403
|
+
|
|
2404
|
+
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
2405
|
+
:type collection_id: str
|
|
2406
|
+
:param file_name: String of the file name to use for the document. (required)
|
|
2407
|
+
:type file_name: str
|
|
2408
|
+
:param body: The text that will ingested into a collection. (required)
|
|
2409
|
+
:type body: str
|
|
2410
|
+
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
2411
|
+
:type gen_doc_summaries: bool
|
|
2412
|
+
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
2413
|
+
:type gen_doc_questions: bool
|
|
2414
|
+
:param metadata: String with json-encoded metadata for the document.
|
|
2415
|
+
:type metadata: str
|
|
2416
|
+
:param timeout: Timeout in seconds
|
|
2417
|
+
:type timeout: float
|
|
2418
|
+
:param _request_timeout: timeout setting for this request. If one
|
|
2419
|
+
number provided, it will be total request
|
|
2420
|
+
timeout. It can also be a pair (tuple) of
|
|
2421
|
+
(connection, read) timeouts.
|
|
2422
|
+
:type _request_timeout: int, tuple(int, int), optional
|
|
2423
|
+
:param _request_auth: set to override the auth_settings for an a single
|
|
2424
|
+
request; this effectively ignores the
|
|
2425
|
+
authentication in the spec for a single request.
|
|
2426
|
+
:type _request_auth: dict, optional
|
|
2427
|
+
:param _content_type: force content-type for the request.
|
|
2428
|
+
:type _content_type: str, Optional
|
|
2429
|
+
:param _headers: set to override the headers for a single
|
|
2430
|
+
request; this effectively ignores the headers
|
|
2431
|
+
in the spec for a single request.
|
|
2432
|
+
:type _headers: dict, optional
|
|
2433
|
+
:param _host_index: set to override the host_index for a single
|
|
2434
|
+
request; this effectively ignores the host_index
|
|
2435
|
+
in the spec for a single request.
|
|
2436
|
+
:type _host_index: int, optional
|
|
2437
|
+
:return: Returns the result object.
|
|
2438
|
+
""" # noqa: E501
|
|
2439
|
+
|
|
2440
|
+
_param = self._create_ingest_from_plain_text_job_serialize(
|
|
2441
|
+
collection_id=collection_id,
|
|
2442
|
+
file_name=file_name,
|
|
2443
|
+
body=body,
|
|
2444
|
+
gen_doc_summaries=gen_doc_summaries,
|
|
2445
|
+
gen_doc_questions=gen_doc_questions,
|
|
2446
|
+
metadata=metadata,
|
|
2447
|
+
timeout=timeout,
|
|
2448
|
+
_request_auth=_request_auth,
|
|
2449
|
+
_content_type=_content_type,
|
|
2450
|
+
_headers=_headers,
|
|
2451
|
+
_host_index=_host_index
|
|
2452
|
+
)
|
|
2453
|
+
|
|
2454
|
+
_response_types_map: Dict[str, Optional[str]] = {
|
|
2455
|
+
'201': "JobDetails",
|
|
2456
|
+
'401': "EndpointError",
|
|
2457
|
+
}
|
|
2458
|
+
response_data = self.api_client.call_api(
|
|
2459
|
+
*_param,
|
|
2460
|
+
_request_timeout=_request_timeout
|
|
2461
|
+
)
|
|
2462
|
+
response_data.read()
|
|
2463
|
+
return self.api_client.response_deserialize(
|
|
2464
|
+
response_data=response_data,
|
|
2465
|
+
response_types_map=_response_types_map,
|
|
2466
|
+
).data
|
|
2467
|
+
|
|
2468
|
+
|
|
2469
|
+
@validate_call
|
|
2470
|
+
def create_ingest_from_plain_text_job_with_http_info(
|
|
2471
|
+
self,
|
|
2472
|
+
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
2473
|
+
file_name: Annotated[StrictStr, Field(description="String of the file name to use for the document.")],
|
|
2474
|
+
body: Annotated[StrictStr, Field(description="The text that will ingested into a collection.")],
|
|
2475
|
+
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
2476
|
+
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
2477
|
+
metadata: Annotated[Optional[StrictStr], Field(description="String with json-encoded metadata for the document.")] = None,
|
|
2478
|
+
timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
|
|
2479
|
+
_request_timeout: Union[
|
|
2480
|
+
None,
|
|
2481
|
+
Annotated[StrictFloat, Field(gt=0)],
|
|
2482
|
+
Tuple[
|
|
2483
|
+
Annotated[StrictFloat, Field(gt=0)],
|
|
2484
|
+
Annotated[StrictFloat, Field(gt=0)]
|
|
2485
|
+
]
|
|
2486
|
+
] = None,
|
|
2487
|
+
_request_auth: Optional[Dict[StrictStr, Any]] = None,
|
|
2488
|
+
_content_type: Optional[StrictStr] = None,
|
|
2489
|
+
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
2490
|
+
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
2491
|
+
) -> ApiResponse[JobDetails]:
|
|
2492
|
+
"""Creates a job to add plain text to a collection.
|
|
2493
|
+
|
|
2494
|
+
Creates a job to add plain text to a collection.
|
|
2495
|
+
|
|
2496
|
+
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
2497
|
+
:type collection_id: str
|
|
2498
|
+
:param file_name: String of the file name to use for the document. (required)
|
|
2499
|
+
:type file_name: str
|
|
2500
|
+
:param body: The text that will ingested into a collection. (required)
|
|
2501
|
+
:type body: str
|
|
2502
|
+
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
2503
|
+
:type gen_doc_summaries: bool
|
|
2504
|
+
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
2505
|
+
:type gen_doc_questions: bool
|
|
2506
|
+
:param metadata: String with json-encoded metadata for the document.
|
|
2507
|
+
:type metadata: str
|
|
2508
|
+
:param timeout: Timeout in seconds
|
|
2509
|
+
:type timeout: float
|
|
2510
|
+
:param _request_timeout: timeout setting for this request. If one
|
|
2511
|
+
number provided, it will be total request
|
|
2512
|
+
timeout. It can also be a pair (tuple) of
|
|
2513
|
+
(connection, read) timeouts.
|
|
2514
|
+
:type _request_timeout: int, tuple(int, int), optional
|
|
2515
|
+
:param _request_auth: set to override the auth_settings for an a single
|
|
2516
|
+
request; this effectively ignores the
|
|
2517
|
+
authentication in the spec for a single request.
|
|
2518
|
+
:type _request_auth: dict, optional
|
|
2519
|
+
:param _content_type: force content-type for the request.
|
|
2520
|
+
:type _content_type: str, Optional
|
|
2521
|
+
:param _headers: set to override the headers for a single
|
|
2522
|
+
request; this effectively ignores the headers
|
|
2523
|
+
in the spec for a single request.
|
|
2524
|
+
:type _headers: dict, optional
|
|
2525
|
+
:param _host_index: set to override the host_index for a single
|
|
2526
|
+
request; this effectively ignores the host_index
|
|
2527
|
+
in the spec for a single request.
|
|
2528
|
+
:type _host_index: int, optional
|
|
2529
|
+
:return: Returns the result object.
|
|
2530
|
+
""" # noqa: E501
|
|
2531
|
+
|
|
2532
|
+
_param = self._create_ingest_from_plain_text_job_serialize(
|
|
2533
|
+
collection_id=collection_id,
|
|
2534
|
+
file_name=file_name,
|
|
2535
|
+
body=body,
|
|
2536
|
+
gen_doc_summaries=gen_doc_summaries,
|
|
2537
|
+
gen_doc_questions=gen_doc_questions,
|
|
2538
|
+
metadata=metadata,
|
|
2539
|
+
timeout=timeout,
|
|
2540
|
+
_request_auth=_request_auth,
|
|
2541
|
+
_content_type=_content_type,
|
|
2542
|
+
_headers=_headers,
|
|
2543
|
+
_host_index=_host_index
|
|
2544
|
+
)
|
|
2545
|
+
|
|
2546
|
+
_response_types_map: Dict[str, Optional[str]] = {
|
|
2547
|
+
'201': "JobDetails",
|
|
2548
|
+
'401': "EndpointError",
|
|
2549
|
+
}
|
|
2550
|
+
response_data = self.api_client.call_api(
|
|
2551
|
+
*_param,
|
|
2552
|
+
_request_timeout=_request_timeout
|
|
2553
|
+
)
|
|
2554
|
+
response_data.read()
|
|
2555
|
+
return self.api_client.response_deserialize(
|
|
2556
|
+
response_data=response_data,
|
|
2557
|
+
response_types_map=_response_types_map,
|
|
2558
|
+
)
|
|
2559
|
+
|
|
2560
|
+
|
|
2561
|
+
@validate_call
|
|
2562
|
+
def create_ingest_from_plain_text_job_without_preload_content(
|
|
2563
|
+
self,
|
|
2564
|
+
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
2565
|
+
file_name: Annotated[StrictStr, Field(description="String of the file name to use for the document.")],
|
|
2566
|
+
body: Annotated[StrictStr, Field(description="The text that will ingested into a collection.")],
|
|
2567
|
+
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
2568
|
+
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
2569
|
+
metadata: Annotated[Optional[StrictStr], Field(description="String with json-encoded metadata for the document.")] = None,
|
|
2570
|
+
timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
|
|
2571
|
+
_request_timeout: Union[
|
|
2572
|
+
None,
|
|
2573
|
+
Annotated[StrictFloat, Field(gt=0)],
|
|
2574
|
+
Tuple[
|
|
2575
|
+
Annotated[StrictFloat, Field(gt=0)],
|
|
2576
|
+
Annotated[StrictFloat, Field(gt=0)]
|
|
2577
|
+
]
|
|
2578
|
+
] = None,
|
|
2579
|
+
_request_auth: Optional[Dict[StrictStr, Any]] = None,
|
|
2580
|
+
_content_type: Optional[StrictStr] = None,
|
|
2581
|
+
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
2582
|
+
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
2583
|
+
) -> RESTResponseType:
|
|
2584
|
+
"""Creates a job to add plain text to a collection.
|
|
2585
|
+
|
|
2586
|
+
Creates a job to add plain text to a collection.
|
|
2587
|
+
|
|
2588
|
+
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
2589
|
+
:type collection_id: str
|
|
2590
|
+
:param file_name: String of the file name to use for the document. (required)
|
|
2591
|
+
:type file_name: str
|
|
2592
|
+
:param body: The text that will ingested into a collection. (required)
|
|
2593
|
+
:type body: str
|
|
2594
|
+
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
2595
|
+
:type gen_doc_summaries: bool
|
|
2596
|
+
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
2597
|
+
:type gen_doc_questions: bool
|
|
2598
|
+
:param metadata: String with json-encoded metadata for the document.
|
|
2599
|
+
:type metadata: str
|
|
2600
|
+
:param timeout: Timeout in seconds
|
|
2601
|
+
:type timeout: float
|
|
2602
|
+
:param _request_timeout: timeout setting for this request. If one
|
|
2603
|
+
number provided, it will be total request
|
|
2604
|
+
timeout. It can also be a pair (tuple) of
|
|
2605
|
+
(connection, read) timeouts.
|
|
2606
|
+
:type _request_timeout: int, tuple(int, int), optional
|
|
2607
|
+
:param _request_auth: set to override the auth_settings for an a single
|
|
2608
|
+
request; this effectively ignores the
|
|
2609
|
+
authentication in the spec for a single request.
|
|
2610
|
+
:type _request_auth: dict, optional
|
|
2611
|
+
:param _content_type: force content-type for the request.
|
|
2612
|
+
:type _content_type: str, Optional
|
|
2613
|
+
:param _headers: set to override the headers for a single
|
|
2614
|
+
request; this effectively ignores the headers
|
|
2615
|
+
in the spec for a single request.
|
|
2616
|
+
:type _headers: dict, optional
|
|
2617
|
+
:param _host_index: set to override the host_index for a single
|
|
2618
|
+
request; this effectively ignores the host_index
|
|
2619
|
+
in the spec for a single request.
|
|
2620
|
+
:type _host_index: int, optional
|
|
2621
|
+
:return: Returns the result object.
|
|
2622
|
+
""" # noqa: E501
|
|
2623
|
+
|
|
2624
|
+
_param = self._create_ingest_from_plain_text_job_serialize(
|
|
2625
|
+
collection_id=collection_id,
|
|
2626
|
+
file_name=file_name,
|
|
2627
|
+
body=body,
|
|
2628
|
+
gen_doc_summaries=gen_doc_summaries,
|
|
2629
|
+
gen_doc_questions=gen_doc_questions,
|
|
2630
|
+
metadata=metadata,
|
|
2631
|
+
timeout=timeout,
|
|
2632
|
+
_request_auth=_request_auth,
|
|
2633
|
+
_content_type=_content_type,
|
|
2634
|
+
_headers=_headers,
|
|
2635
|
+
_host_index=_host_index
|
|
2636
|
+
)
|
|
2637
|
+
|
|
2638
|
+
_response_types_map: Dict[str, Optional[str]] = {
|
|
2639
|
+
'201': "JobDetails",
|
|
2640
|
+
'401': "EndpointError",
|
|
2641
|
+
}
|
|
2642
|
+
response_data = self.api_client.call_api(
|
|
2643
|
+
*_param,
|
|
2644
|
+
_request_timeout=_request_timeout
|
|
2645
|
+
)
|
|
2646
|
+
return response_data.response
|
|
2647
|
+
|
|
2648
|
+
|
|
2649
|
+
def _create_ingest_from_plain_text_job_serialize(
|
|
2650
|
+
self,
|
|
2651
|
+
collection_id,
|
|
2652
|
+
file_name,
|
|
2653
|
+
body,
|
|
2654
|
+
gen_doc_summaries,
|
|
2655
|
+
gen_doc_questions,
|
|
2656
|
+
metadata,
|
|
2192
2657
|
timeout,
|
|
2193
2658
|
_request_auth,
|
|
2194
2659
|
_content_type,
|
|
@@ -2218,7 +2683,447 @@ class DocumentIngestionApi:
|
|
|
2218
2683
|
|
|
2219
2684
|
if file_name is not None:
|
|
2220
2685
|
|
|
2221
|
-
_query_params.append(('file_name', file_name))
|
|
2686
|
+
_query_params.append(('file_name', file_name))
|
|
2687
|
+
|
|
2688
|
+
if gen_doc_summaries is not None:
|
|
2689
|
+
|
|
2690
|
+
_query_params.append(('gen_doc_summaries', gen_doc_summaries))
|
|
2691
|
+
|
|
2692
|
+
if gen_doc_questions is not None:
|
|
2693
|
+
|
|
2694
|
+
_query_params.append(('gen_doc_questions', gen_doc_questions))
|
|
2695
|
+
|
|
2696
|
+
if metadata is not None:
|
|
2697
|
+
|
|
2698
|
+
_query_params.append(('metadata', metadata))
|
|
2699
|
+
|
|
2700
|
+
if timeout is not None:
|
|
2701
|
+
|
|
2702
|
+
_query_params.append(('timeout', timeout))
|
|
2703
|
+
|
|
2704
|
+
# process the header parameters
|
|
2705
|
+
# process the form parameters
|
|
2706
|
+
# process the body parameter
|
|
2707
|
+
if body is not None:
|
|
2708
|
+
_body_params = body
|
|
2709
|
+
|
|
2710
|
+
|
|
2711
|
+
# set the HTTP header `Accept`
|
|
2712
|
+
if 'Accept' not in _header_params:
|
|
2713
|
+
_header_params['Accept'] = self.api_client.select_header_accept(
|
|
2714
|
+
[
|
|
2715
|
+
'application/json'
|
|
2716
|
+
]
|
|
2717
|
+
)
|
|
2718
|
+
|
|
2719
|
+
# set the HTTP header `Content-Type`
|
|
2720
|
+
if _content_type:
|
|
2721
|
+
_header_params['Content-Type'] = _content_type
|
|
2722
|
+
else:
|
|
2723
|
+
_default_content_type = (
|
|
2724
|
+
self.api_client.select_header_content_type(
|
|
2725
|
+
[
|
|
2726
|
+
'text/plain'
|
|
2727
|
+
]
|
|
2728
|
+
)
|
|
2729
|
+
)
|
|
2730
|
+
if _default_content_type is not None:
|
|
2731
|
+
_header_params['Content-Type'] = _default_content_type
|
|
2732
|
+
|
|
2733
|
+
# authentication setting
|
|
2734
|
+
_auth_settings: List[str] = [
|
|
2735
|
+
'bearerAuth'
|
|
2736
|
+
]
|
|
2737
|
+
|
|
2738
|
+
return self.api_client.param_serialize(
|
|
2739
|
+
method='POST',
|
|
2740
|
+
resource_path='/ingest/plain_text/job',
|
|
2741
|
+
path_params=_path_params,
|
|
2742
|
+
query_params=_query_params,
|
|
2743
|
+
header_params=_header_params,
|
|
2744
|
+
body=_body_params,
|
|
2745
|
+
post_params=_form_params,
|
|
2746
|
+
files=_files,
|
|
2747
|
+
auth_settings=_auth_settings,
|
|
2748
|
+
collection_formats=_collection_formats,
|
|
2749
|
+
_host=_host,
|
|
2750
|
+
_request_auth=_request_auth
|
|
2751
|
+
)
|
|
2752
|
+
|
|
2753
|
+
|
|
2754
|
+
|
|
2755
|
+
|
|
2756
|
+
@validate_call
|
|
2757
|
+
def create_ingest_from_s3_job(
|
|
2758
|
+
self,
|
|
2759
|
+
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
2760
|
+
ingest_from_s3_body: IngestFromS3Body,
|
|
2761
|
+
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
2762
|
+
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
2763
|
+
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
2764
|
+
ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
|
|
2765
|
+
tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
|
|
2766
|
+
keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
|
|
2767
|
+
chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
|
|
2768
|
+
handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
|
|
2769
|
+
ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
|
|
2770
|
+
timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
|
|
2771
|
+
_request_timeout: Union[
|
|
2772
|
+
None,
|
|
2773
|
+
Annotated[StrictFloat, Field(gt=0)],
|
|
2774
|
+
Tuple[
|
|
2775
|
+
Annotated[StrictFloat, Field(gt=0)],
|
|
2776
|
+
Annotated[StrictFloat, Field(gt=0)]
|
|
2777
|
+
]
|
|
2778
|
+
] = None,
|
|
2779
|
+
_request_auth: Optional[Dict[StrictStr, Any]] = None,
|
|
2780
|
+
_content_type: Optional[StrictStr] = None,
|
|
2781
|
+
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
2782
|
+
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
2783
|
+
) -> JobDetails:
|
|
2784
|
+
"""Creates a job to add files from the AWS S3 storage into a collection.
|
|
2785
|
+
|
|
2786
|
+
Creates a job to add files from the AWS S3 storage into a collection.
|
|
2787
|
+
|
|
2788
|
+
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
2789
|
+
:type collection_id: str
|
|
2790
|
+
:param ingest_from_s3_body: (required)
|
|
2791
|
+
:type ingest_from_s3_body: IngestFromS3Body
|
|
2792
|
+
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
2793
|
+
:type gen_doc_summaries: bool
|
|
2794
|
+
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
2795
|
+
:type gen_doc_questions: bool
|
|
2796
|
+
:param audio_input_language: Language of audio files.
|
|
2797
|
+
:type audio_input_language: str
|
|
2798
|
+
:param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
|
|
2799
|
+
:type ocr_model: str
|
|
2800
|
+
:param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
|
|
2801
|
+
:type tesseract_lang: str
|
|
2802
|
+
:param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
|
|
2803
|
+
:type keep_tables_as_one_chunk: bool
|
|
2804
|
+
:param chunk_by_page: Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.
|
|
2805
|
+
:type chunk_by_page: bool
|
|
2806
|
+
:param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
|
|
2807
|
+
:type handwriting_check: bool
|
|
2808
|
+
:param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
|
|
2809
|
+
:type ingest_mode: str
|
|
2810
|
+
:param timeout: Timeout in seconds
|
|
2811
|
+
:type timeout: float
|
|
2812
|
+
:param _request_timeout: timeout setting for this request. If one
|
|
2813
|
+
number provided, it will be total request
|
|
2814
|
+
timeout. It can also be a pair (tuple) of
|
|
2815
|
+
(connection, read) timeouts.
|
|
2816
|
+
:type _request_timeout: int, tuple(int, int), optional
|
|
2817
|
+
:param _request_auth: set to override the auth_settings for an a single
|
|
2818
|
+
request; this effectively ignores the
|
|
2819
|
+
authentication in the spec for a single request.
|
|
2820
|
+
:type _request_auth: dict, optional
|
|
2821
|
+
:param _content_type: force content-type for the request.
|
|
2822
|
+
:type _content_type: str, Optional
|
|
2823
|
+
:param _headers: set to override the headers for a single
|
|
2824
|
+
request; this effectively ignores the headers
|
|
2825
|
+
in the spec for a single request.
|
|
2826
|
+
:type _headers: dict, optional
|
|
2827
|
+
:param _host_index: set to override the host_index for a single
|
|
2828
|
+
request; this effectively ignores the host_index
|
|
2829
|
+
in the spec for a single request.
|
|
2830
|
+
:type _host_index: int, optional
|
|
2831
|
+
:return: Returns the result object.
|
|
2832
|
+
""" # noqa: E501
|
|
2833
|
+
|
|
2834
|
+
_param = self._create_ingest_from_s3_job_serialize(
|
|
2835
|
+
collection_id=collection_id,
|
|
2836
|
+
ingest_from_s3_body=ingest_from_s3_body,
|
|
2837
|
+
gen_doc_summaries=gen_doc_summaries,
|
|
2838
|
+
gen_doc_questions=gen_doc_questions,
|
|
2839
|
+
audio_input_language=audio_input_language,
|
|
2840
|
+
ocr_model=ocr_model,
|
|
2841
|
+
tesseract_lang=tesseract_lang,
|
|
2842
|
+
keep_tables_as_one_chunk=keep_tables_as_one_chunk,
|
|
2843
|
+
chunk_by_page=chunk_by_page,
|
|
2844
|
+
handwriting_check=handwriting_check,
|
|
2845
|
+
ingest_mode=ingest_mode,
|
|
2846
|
+
timeout=timeout,
|
|
2847
|
+
_request_auth=_request_auth,
|
|
2848
|
+
_content_type=_content_type,
|
|
2849
|
+
_headers=_headers,
|
|
2850
|
+
_host_index=_host_index
|
|
2851
|
+
)
|
|
2852
|
+
|
|
2853
|
+
_response_types_map: Dict[str, Optional[str]] = {
|
|
2854
|
+
'201': "JobDetails",
|
|
2855
|
+
'401': "EndpointError",
|
|
2856
|
+
}
|
|
2857
|
+
response_data = self.api_client.call_api(
|
|
2858
|
+
*_param,
|
|
2859
|
+
_request_timeout=_request_timeout
|
|
2860
|
+
)
|
|
2861
|
+
response_data.read()
|
|
2862
|
+
return self.api_client.response_deserialize(
|
|
2863
|
+
response_data=response_data,
|
|
2864
|
+
response_types_map=_response_types_map,
|
|
2865
|
+
).data
|
|
2866
|
+
|
|
2867
|
+
|
|
2868
|
+
@validate_call
|
|
2869
|
+
def create_ingest_from_s3_job_with_http_info(
|
|
2870
|
+
self,
|
|
2871
|
+
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
2872
|
+
ingest_from_s3_body: IngestFromS3Body,
|
|
2873
|
+
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
2874
|
+
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
2875
|
+
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
2876
|
+
ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
|
|
2877
|
+
tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
|
|
2878
|
+
keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
|
|
2879
|
+
chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
|
|
2880
|
+
handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
|
|
2881
|
+
ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
|
|
2882
|
+
timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
|
|
2883
|
+
_request_timeout: Union[
|
|
2884
|
+
None,
|
|
2885
|
+
Annotated[StrictFloat, Field(gt=0)],
|
|
2886
|
+
Tuple[
|
|
2887
|
+
Annotated[StrictFloat, Field(gt=0)],
|
|
2888
|
+
Annotated[StrictFloat, Field(gt=0)]
|
|
2889
|
+
]
|
|
2890
|
+
] = None,
|
|
2891
|
+
_request_auth: Optional[Dict[StrictStr, Any]] = None,
|
|
2892
|
+
_content_type: Optional[StrictStr] = None,
|
|
2893
|
+
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
2894
|
+
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
2895
|
+
) -> ApiResponse[JobDetails]:
|
|
2896
|
+
"""Creates a job to add files from the AWS S3 storage into a collection.
|
|
2897
|
+
|
|
2898
|
+
Creates a job to add files from the AWS S3 storage into a collection.
|
|
2899
|
+
|
|
2900
|
+
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
2901
|
+
:type collection_id: str
|
|
2902
|
+
:param ingest_from_s3_body: (required)
|
|
2903
|
+
:type ingest_from_s3_body: IngestFromS3Body
|
|
2904
|
+
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
2905
|
+
:type gen_doc_summaries: bool
|
|
2906
|
+
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
2907
|
+
:type gen_doc_questions: bool
|
|
2908
|
+
:param audio_input_language: Language of audio files.
|
|
2909
|
+
:type audio_input_language: str
|
|
2910
|
+
:param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
|
|
2911
|
+
:type ocr_model: str
|
|
2912
|
+
:param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
|
|
2913
|
+
:type tesseract_lang: str
|
|
2914
|
+
:param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
|
|
2915
|
+
:type keep_tables_as_one_chunk: bool
|
|
2916
|
+
:param chunk_by_page: Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.
|
|
2917
|
+
:type chunk_by_page: bool
|
|
2918
|
+
:param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
|
|
2919
|
+
:type handwriting_check: bool
|
|
2920
|
+
:param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
|
|
2921
|
+
:type ingest_mode: str
|
|
2922
|
+
:param timeout: Timeout in seconds
|
|
2923
|
+
:type timeout: float
|
|
2924
|
+
:param _request_timeout: timeout setting for this request. If one
|
|
2925
|
+
number provided, it will be total request
|
|
2926
|
+
timeout. It can also be a pair (tuple) of
|
|
2927
|
+
(connection, read) timeouts.
|
|
2928
|
+
:type _request_timeout: int, tuple(int, int), optional
|
|
2929
|
+
:param _request_auth: set to override the auth_settings for an a single
|
|
2930
|
+
request; this effectively ignores the
|
|
2931
|
+
authentication in the spec for a single request.
|
|
2932
|
+
:type _request_auth: dict, optional
|
|
2933
|
+
:param _content_type: force content-type for the request.
|
|
2934
|
+
:type _content_type: str, Optional
|
|
2935
|
+
:param _headers: set to override the headers for a single
|
|
2936
|
+
request; this effectively ignores the headers
|
|
2937
|
+
in the spec for a single request.
|
|
2938
|
+
:type _headers: dict, optional
|
|
2939
|
+
:param _host_index: set to override the host_index for a single
|
|
2940
|
+
request; this effectively ignores the host_index
|
|
2941
|
+
in the spec for a single request.
|
|
2942
|
+
:type _host_index: int, optional
|
|
2943
|
+
:return: Returns the result object.
|
|
2944
|
+
""" # noqa: E501
|
|
2945
|
+
|
|
2946
|
+
_param = self._create_ingest_from_s3_job_serialize(
|
|
2947
|
+
collection_id=collection_id,
|
|
2948
|
+
ingest_from_s3_body=ingest_from_s3_body,
|
|
2949
|
+
gen_doc_summaries=gen_doc_summaries,
|
|
2950
|
+
gen_doc_questions=gen_doc_questions,
|
|
2951
|
+
audio_input_language=audio_input_language,
|
|
2952
|
+
ocr_model=ocr_model,
|
|
2953
|
+
tesseract_lang=tesseract_lang,
|
|
2954
|
+
keep_tables_as_one_chunk=keep_tables_as_one_chunk,
|
|
2955
|
+
chunk_by_page=chunk_by_page,
|
|
2956
|
+
handwriting_check=handwriting_check,
|
|
2957
|
+
ingest_mode=ingest_mode,
|
|
2958
|
+
timeout=timeout,
|
|
2959
|
+
_request_auth=_request_auth,
|
|
2960
|
+
_content_type=_content_type,
|
|
2961
|
+
_headers=_headers,
|
|
2962
|
+
_host_index=_host_index
|
|
2963
|
+
)
|
|
2964
|
+
|
|
2965
|
+
_response_types_map: Dict[str, Optional[str]] = {
|
|
2966
|
+
'201': "JobDetails",
|
|
2967
|
+
'401': "EndpointError",
|
|
2968
|
+
}
|
|
2969
|
+
response_data = self.api_client.call_api(
|
|
2970
|
+
*_param,
|
|
2971
|
+
_request_timeout=_request_timeout
|
|
2972
|
+
)
|
|
2973
|
+
response_data.read()
|
|
2974
|
+
return self.api_client.response_deserialize(
|
|
2975
|
+
response_data=response_data,
|
|
2976
|
+
response_types_map=_response_types_map,
|
|
2977
|
+
)
|
|
2978
|
+
|
|
2979
|
+
|
|
2980
|
+
@validate_call
|
|
2981
|
+
def create_ingest_from_s3_job_without_preload_content(
|
|
2982
|
+
self,
|
|
2983
|
+
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
2984
|
+
ingest_from_s3_body: IngestFromS3Body,
|
|
2985
|
+
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
2986
|
+
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
2987
|
+
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
2988
|
+
ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
|
|
2989
|
+
tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
|
|
2990
|
+
keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
|
|
2991
|
+
chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
|
|
2992
|
+
handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
|
|
2993
|
+
ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
|
|
2994
|
+
timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
|
|
2995
|
+
_request_timeout: Union[
|
|
2996
|
+
None,
|
|
2997
|
+
Annotated[StrictFloat, Field(gt=0)],
|
|
2998
|
+
Tuple[
|
|
2999
|
+
Annotated[StrictFloat, Field(gt=0)],
|
|
3000
|
+
Annotated[StrictFloat, Field(gt=0)]
|
|
3001
|
+
]
|
|
3002
|
+
] = None,
|
|
3003
|
+
_request_auth: Optional[Dict[StrictStr, Any]] = None,
|
|
3004
|
+
_content_type: Optional[StrictStr] = None,
|
|
3005
|
+
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
3006
|
+
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
3007
|
+
) -> RESTResponseType:
|
|
3008
|
+
"""Creates a job to add files from the AWS S3 storage into a collection.
|
|
3009
|
+
|
|
3010
|
+
Creates a job to add files from the AWS S3 storage into a collection.
|
|
3011
|
+
|
|
3012
|
+
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
3013
|
+
:type collection_id: str
|
|
3014
|
+
:param ingest_from_s3_body: (required)
|
|
3015
|
+
:type ingest_from_s3_body: IngestFromS3Body
|
|
3016
|
+
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
3017
|
+
:type gen_doc_summaries: bool
|
|
3018
|
+
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
3019
|
+
:type gen_doc_questions: bool
|
|
3020
|
+
:param audio_input_language: Language of audio files.
|
|
3021
|
+
:type audio_input_language: str
|
|
3022
|
+
:param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
|
|
3023
|
+
:type ocr_model: str
|
|
3024
|
+
:param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
|
|
3025
|
+
:type tesseract_lang: str
|
|
3026
|
+
:param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
|
|
3027
|
+
:type keep_tables_as_one_chunk: bool
|
|
3028
|
+
:param chunk_by_page: Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.
|
|
3029
|
+
:type chunk_by_page: bool
|
|
3030
|
+
:param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
|
|
3031
|
+
:type handwriting_check: bool
|
|
3032
|
+
:param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
|
|
3033
|
+
:type ingest_mode: str
|
|
3034
|
+
:param timeout: Timeout in seconds
|
|
3035
|
+
:type timeout: float
|
|
3036
|
+
:param _request_timeout: timeout setting for this request. If one
|
|
3037
|
+
number provided, it will be total request
|
|
3038
|
+
timeout. It can also be a pair (tuple) of
|
|
3039
|
+
(connection, read) timeouts.
|
|
3040
|
+
:type _request_timeout: int, tuple(int, int), optional
|
|
3041
|
+
:param _request_auth: set to override the auth_settings for an a single
|
|
3042
|
+
request; this effectively ignores the
|
|
3043
|
+
authentication in the spec for a single request.
|
|
3044
|
+
:type _request_auth: dict, optional
|
|
3045
|
+
:param _content_type: force content-type for the request.
|
|
3046
|
+
:type _content_type: str, Optional
|
|
3047
|
+
:param _headers: set to override the headers for a single
|
|
3048
|
+
request; this effectively ignores the headers
|
|
3049
|
+
in the spec for a single request.
|
|
3050
|
+
:type _headers: dict, optional
|
|
3051
|
+
:param _host_index: set to override the host_index for a single
|
|
3052
|
+
request; this effectively ignores the host_index
|
|
3053
|
+
in the spec for a single request.
|
|
3054
|
+
:type _host_index: int, optional
|
|
3055
|
+
:return: Returns the result object.
|
|
3056
|
+
""" # noqa: E501
|
|
3057
|
+
|
|
3058
|
+
_param = self._create_ingest_from_s3_job_serialize(
|
|
3059
|
+
collection_id=collection_id,
|
|
3060
|
+
ingest_from_s3_body=ingest_from_s3_body,
|
|
3061
|
+
gen_doc_summaries=gen_doc_summaries,
|
|
3062
|
+
gen_doc_questions=gen_doc_questions,
|
|
3063
|
+
audio_input_language=audio_input_language,
|
|
3064
|
+
ocr_model=ocr_model,
|
|
3065
|
+
tesseract_lang=tesseract_lang,
|
|
3066
|
+
keep_tables_as_one_chunk=keep_tables_as_one_chunk,
|
|
3067
|
+
chunk_by_page=chunk_by_page,
|
|
3068
|
+
handwriting_check=handwriting_check,
|
|
3069
|
+
ingest_mode=ingest_mode,
|
|
3070
|
+
timeout=timeout,
|
|
3071
|
+
_request_auth=_request_auth,
|
|
3072
|
+
_content_type=_content_type,
|
|
3073
|
+
_headers=_headers,
|
|
3074
|
+
_host_index=_host_index
|
|
3075
|
+
)
|
|
3076
|
+
|
|
3077
|
+
_response_types_map: Dict[str, Optional[str]] = {
|
|
3078
|
+
'201': "JobDetails",
|
|
3079
|
+
'401': "EndpointError",
|
|
3080
|
+
}
|
|
3081
|
+
response_data = self.api_client.call_api(
|
|
3082
|
+
*_param,
|
|
3083
|
+
_request_timeout=_request_timeout
|
|
3084
|
+
)
|
|
3085
|
+
return response_data.response
|
|
3086
|
+
|
|
3087
|
+
|
|
3088
|
+
def _create_ingest_from_s3_job_serialize(
|
|
3089
|
+
self,
|
|
3090
|
+
collection_id,
|
|
3091
|
+
ingest_from_s3_body,
|
|
3092
|
+
gen_doc_summaries,
|
|
3093
|
+
gen_doc_questions,
|
|
3094
|
+
audio_input_language,
|
|
3095
|
+
ocr_model,
|
|
3096
|
+
tesseract_lang,
|
|
3097
|
+
keep_tables_as_one_chunk,
|
|
3098
|
+
chunk_by_page,
|
|
3099
|
+
handwriting_check,
|
|
3100
|
+
ingest_mode,
|
|
3101
|
+
timeout,
|
|
3102
|
+
_request_auth,
|
|
3103
|
+
_content_type,
|
|
3104
|
+
_headers,
|
|
3105
|
+
_host_index,
|
|
3106
|
+
) -> RequestSerialized:
|
|
3107
|
+
|
|
3108
|
+
_host = None
|
|
3109
|
+
|
|
3110
|
+
_collection_formats: Dict[str, str] = {
|
|
3111
|
+
}
|
|
3112
|
+
|
|
3113
|
+
_path_params: Dict[str, str] = {}
|
|
3114
|
+
_query_params: List[Tuple[str, str]] = []
|
|
3115
|
+
_header_params: Dict[str, Optional[str]] = _headers or {}
|
|
3116
|
+
_form_params: List[Tuple[str, str]] = []
|
|
3117
|
+
_files: Dict[
|
|
3118
|
+
str, Union[str, bytes, List[str], List[bytes], List[Tuple[str, bytes]]]
|
|
3119
|
+
] = {}
|
|
3120
|
+
_body_params: Optional[bytes] = None
|
|
3121
|
+
|
|
3122
|
+
# process the path parameters
|
|
3123
|
+
# process the query parameters
|
|
3124
|
+
if collection_id is not None:
|
|
3125
|
+
|
|
3126
|
+
_query_params.append(('collection_id', collection_id))
|
|
2222
3127
|
|
|
2223
3128
|
if gen_doc_summaries is not None:
|
|
2224
3129
|
|
|
@@ -2228,9 +3133,33 @@ class DocumentIngestionApi:
|
|
|
2228
3133
|
|
|
2229
3134
|
_query_params.append(('gen_doc_questions', gen_doc_questions))
|
|
2230
3135
|
|
|
2231
|
-
if
|
|
3136
|
+
if audio_input_language is not None:
|
|
2232
3137
|
|
|
2233
|
-
_query_params.append(('
|
|
3138
|
+
_query_params.append(('audio_input_language', audio_input_language))
|
|
3139
|
+
|
|
3140
|
+
if ocr_model is not None:
|
|
3141
|
+
|
|
3142
|
+
_query_params.append(('ocr_model', ocr_model))
|
|
3143
|
+
|
|
3144
|
+
if tesseract_lang is not None:
|
|
3145
|
+
|
|
3146
|
+
_query_params.append(('tesseract_lang', tesseract_lang))
|
|
3147
|
+
|
|
3148
|
+
if keep_tables_as_one_chunk is not None:
|
|
3149
|
+
|
|
3150
|
+
_query_params.append(('keep_tables_as_one_chunk', keep_tables_as_one_chunk))
|
|
3151
|
+
|
|
3152
|
+
if chunk_by_page is not None:
|
|
3153
|
+
|
|
3154
|
+
_query_params.append(('chunk_by_page', chunk_by_page))
|
|
3155
|
+
|
|
3156
|
+
if handwriting_check is not None:
|
|
3157
|
+
|
|
3158
|
+
_query_params.append(('handwriting_check', handwriting_check))
|
|
3159
|
+
|
|
3160
|
+
if ingest_mode is not None:
|
|
3161
|
+
|
|
3162
|
+
_query_params.append(('ingest_mode', ingest_mode))
|
|
2234
3163
|
|
|
2235
3164
|
if timeout is not None:
|
|
2236
3165
|
|
|
@@ -2239,8 +3168,8 @@ class DocumentIngestionApi:
|
|
|
2239
3168
|
# process the header parameters
|
|
2240
3169
|
# process the form parameters
|
|
2241
3170
|
# process the body parameter
|
|
2242
|
-
if
|
|
2243
|
-
_body_params =
|
|
3171
|
+
if ingest_from_s3_body is not None:
|
|
3172
|
+
_body_params = ingest_from_s3_body
|
|
2244
3173
|
|
|
2245
3174
|
|
|
2246
3175
|
# set the HTTP header `Accept`
|
|
@@ -2258,7 +3187,7 @@ class DocumentIngestionApi:
|
|
|
2258
3187
|
_default_content_type = (
|
|
2259
3188
|
self.api_client.select_header_content_type(
|
|
2260
3189
|
[
|
|
2261
|
-
'
|
|
3190
|
+
'application/json'
|
|
2262
3191
|
]
|
|
2263
3192
|
)
|
|
2264
3193
|
)
|
|
@@ -2272,7 +3201,7 @@ class DocumentIngestionApi:
|
|
|
2272
3201
|
|
|
2273
3202
|
return self.api_client.param_serialize(
|
|
2274
3203
|
method='POST',
|
|
2275
|
-
resource_path='/ingest/
|
|
3204
|
+
resource_path='/ingest/s3/job',
|
|
2276
3205
|
path_params=_path_params,
|
|
2277
3206
|
query_params=_query_params,
|
|
2278
3207
|
header_params=_header_params,
|
|
@@ -2289,10 +3218,13 @@ class DocumentIngestionApi:
|
|
|
2289
3218
|
|
|
2290
3219
|
|
|
2291
3220
|
@validate_call
|
|
2292
|
-
def
|
|
3221
|
+
def create_ingest_from_website_job(
|
|
2293
3222
|
self,
|
|
2294
3223
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
2295
|
-
|
|
3224
|
+
ingest_from_website_body: IngestFromWebsiteBody,
|
|
3225
|
+
follow_links: Annotated[Optional[StrictBool], Field(description="Whether to import all web pages linked from this URL will be imported. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.")] = None,
|
|
3226
|
+
max_depth: Annotated[Optional[StrictInt], Field(description="Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).")] = None,
|
|
3227
|
+
max_documents: Annotated[Optional[StrictInt], Field(description="Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).")] = None,
|
|
2296
3228
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
2297
3229
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
2298
3230
|
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
@@ -2316,14 +3248,20 @@ class DocumentIngestionApi:
|
|
|
2316
3248
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
2317
3249
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
2318
3250
|
) -> JobDetails:
|
|
2319
|
-
"""Creates a job to
|
|
3251
|
+
"""Creates a job to crawl and ingest a URL into a collection.
|
|
2320
3252
|
|
|
2321
|
-
Creates a job to
|
|
3253
|
+
Creates a job to crawl and ingest a URL into a collection. The web page or document linked from this URL will be imported.
|
|
2322
3254
|
|
|
2323
3255
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
2324
3256
|
:type collection_id: str
|
|
2325
|
-
:param
|
|
2326
|
-
:type
|
|
3257
|
+
:param ingest_from_website_body: (required)
|
|
3258
|
+
:type ingest_from_website_body: IngestFromWebsiteBody
|
|
3259
|
+
:param follow_links: Whether to import all web pages linked from this URL will be imported. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.
|
|
3260
|
+
:type follow_links: bool
|
|
3261
|
+
:param max_depth: Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).
|
|
3262
|
+
:type max_depth: int
|
|
3263
|
+
:param max_documents: Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).
|
|
3264
|
+
:type max_documents: int
|
|
2327
3265
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
2328
3266
|
:type gen_doc_summaries: bool
|
|
2329
3267
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
@@ -2366,9 +3304,12 @@ class DocumentIngestionApi:
|
|
|
2366
3304
|
:return: Returns the result object.
|
|
2367
3305
|
""" # noqa: E501
|
|
2368
3306
|
|
|
2369
|
-
_param = self.
|
|
3307
|
+
_param = self._create_ingest_from_website_job_serialize(
|
|
2370
3308
|
collection_id=collection_id,
|
|
2371
|
-
|
|
3309
|
+
ingest_from_website_body=ingest_from_website_body,
|
|
3310
|
+
follow_links=follow_links,
|
|
3311
|
+
max_depth=max_depth,
|
|
3312
|
+
max_documents=max_documents,
|
|
2372
3313
|
gen_doc_summaries=gen_doc_summaries,
|
|
2373
3314
|
gen_doc_questions=gen_doc_questions,
|
|
2374
3315
|
audio_input_language=audio_input_language,
|
|
@@ -2401,10 +3342,13 @@ class DocumentIngestionApi:
|
|
|
2401
3342
|
|
|
2402
3343
|
|
|
2403
3344
|
@validate_call
|
|
2404
|
-
def
|
|
3345
|
+
def create_ingest_from_website_job_with_http_info(
|
|
2405
3346
|
self,
|
|
2406
3347
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
2407
|
-
|
|
3348
|
+
ingest_from_website_body: IngestFromWebsiteBody,
|
|
3349
|
+
follow_links: Annotated[Optional[StrictBool], Field(description="Whether to import all web pages linked from this URL will be imported. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.")] = None,
|
|
3350
|
+
max_depth: Annotated[Optional[StrictInt], Field(description="Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).")] = None,
|
|
3351
|
+
max_documents: Annotated[Optional[StrictInt], Field(description="Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).")] = None,
|
|
2408
3352
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
2409
3353
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
2410
3354
|
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
@@ -2428,14 +3372,20 @@ class DocumentIngestionApi:
|
|
|
2428
3372
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
2429
3373
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
2430
3374
|
) -> ApiResponse[JobDetails]:
|
|
2431
|
-
"""Creates a job to
|
|
3375
|
+
"""Creates a job to crawl and ingest a URL into a collection.
|
|
2432
3376
|
|
|
2433
|
-
Creates a job to
|
|
3377
|
+
Creates a job to crawl and ingest a URL into a collection. The web page or document linked from this URL will be imported.
|
|
2434
3378
|
|
|
2435
3379
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
2436
3380
|
:type collection_id: str
|
|
2437
|
-
:param
|
|
2438
|
-
:type
|
|
3381
|
+
:param ingest_from_website_body: (required)
|
|
3382
|
+
:type ingest_from_website_body: IngestFromWebsiteBody
|
|
3383
|
+
:param follow_links: Whether to import all web pages linked from this URL will be imported. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.
|
|
3384
|
+
:type follow_links: bool
|
|
3385
|
+
:param max_depth: Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).
|
|
3386
|
+
:type max_depth: int
|
|
3387
|
+
:param max_documents: Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).
|
|
3388
|
+
:type max_documents: int
|
|
2439
3389
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
2440
3390
|
:type gen_doc_summaries: bool
|
|
2441
3391
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
@@ -2478,9 +3428,12 @@ class DocumentIngestionApi:
|
|
|
2478
3428
|
:return: Returns the result object.
|
|
2479
3429
|
""" # noqa: E501
|
|
2480
3430
|
|
|
2481
|
-
_param = self.
|
|
3431
|
+
_param = self._create_ingest_from_website_job_serialize(
|
|
2482
3432
|
collection_id=collection_id,
|
|
2483
|
-
|
|
3433
|
+
ingest_from_website_body=ingest_from_website_body,
|
|
3434
|
+
follow_links=follow_links,
|
|
3435
|
+
max_depth=max_depth,
|
|
3436
|
+
max_documents=max_documents,
|
|
2484
3437
|
gen_doc_summaries=gen_doc_summaries,
|
|
2485
3438
|
gen_doc_questions=gen_doc_questions,
|
|
2486
3439
|
audio_input_language=audio_input_language,
|
|
@@ -2513,10 +3466,13 @@ class DocumentIngestionApi:
|
|
|
2513
3466
|
|
|
2514
3467
|
|
|
2515
3468
|
@validate_call
|
|
2516
|
-
def
|
|
3469
|
+
def create_ingest_from_website_job_without_preload_content(
|
|
2517
3470
|
self,
|
|
2518
3471
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
2519
|
-
|
|
3472
|
+
ingest_from_website_body: IngestFromWebsiteBody,
|
|
3473
|
+
follow_links: Annotated[Optional[StrictBool], Field(description="Whether to import all web pages linked from this URL will be imported. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.")] = None,
|
|
3474
|
+
max_depth: Annotated[Optional[StrictInt], Field(description="Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).")] = None,
|
|
3475
|
+
max_documents: Annotated[Optional[StrictInt], Field(description="Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).")] = None,
|
|
2520
3476
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
2521
3477
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
2522
3478
|
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
@@ -2540,14 +3496,20 @@ class DocumentIngestionApi:
|
|
|
2540
3496
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
2541
3497
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
2542
3498
|
) -> RESTResponseType:
|
|
2543
|
-
"""Creates a job to
|
|
3499
|
+
"""Creates a job to crawl and ingest a URL into a collection.
|
|
2544
3500
|
|
|
2545
|
-
Creates a job to
|
|
3501
|
+
Creates a job to crawl and ingest a URL into a collection. The web page or document linked from this URL will be imported.
|
|
2546
3502
|
|
|
2547
3503
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
2548
3504
|
:type collection_id: str
|
|
2549
|
-
:param
|
|
2550
|
-
:type
|
|
3505
|
+
:param ingest_from_website_body: (required)
|
|
3506
|
+
:type ingest_from_website_body: IngestFromWebsiteBody
|
|
3507
|
+
:param follow_links: Whether to import all web pages linked from this URL will be imported. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.
|
|
3508
|
+
:type follow_links: bool
|
|
3509
|
+
:param max_depth: Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).
|
|
3510
|
+
:type max_depth: int
|
|
3511
|
+
:param max_documents: Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).
|
|
3512
|
+
:type max_documents: int
|
|
2551
3513
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
2552
3514
|
:type gen_doc_summaries: bool
|
|
2553
3515
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
@@ -2590,9 +3552,12 @@ class DocumentIngestionApi:
|
|
|
2590
3552
|
:return: Returns the result object.
|
|
2591
3553
|
""" # noqa: E501
|
|
2592
3554
|
|
|
2593
|
-
_param = self.
|
|
3555
|
+
_param = self._create_ingest_from_website_job_serialize(
|
|
2594
3556
|
collection_id=collection_id,
|
|
2595
|
-
|
|
3557
|
+
ingest_from_website_body=ingest_from_website_body,
|
|
3558
|
+
follow_links=follow_links,
|
|
3559
|
+
max_depth=max_depth,
|
|
3560
|
+
max_documents=max_documents,
|
|
2596
3561
|
gen_doc_summaries=gen_doc_summaries,
|
|
2597
3562
|
gen_doc_questions=gen_doc_questions,
|
|
2598
3563
|
audio_input_language=audio_input_language,
|
|
@@ -2620,10 +3585,13 @@ class DocumentIngestionApi:
|
|
|
2620
3585
|
return response_data.response
|
|
2621
3586
|
|
|
2622
3587
|
|
|
2623
|
-
def
|
|
3588
|
+
def _create_ingest_from_website_job_serialize(
|
|
2624
3589
|
self,
|
|
2625
3590
|
collection_id,
|
|
2626
|
-
|
|
3591
|
+
ingest_from_website_body,
|
|
3592
|
+
follow_links,
|
|
3593
|
+
max_depth,
|
|
3594
|
+
max_documents,
|
|
2627
3595
|
gen_doc_summaries,
|
|
2628
3596
|
gen_doc_questions,
|
|
2629
3597
|
audio_input_language,
|
|
@@ -2658,7 +3626,19 @@ class DocumentIngestionApi:
|
|
|
2658
3626
|
# process the query parameters
|
|
2659
3627
|
if collection_id is not None:
|
|
2660
3628
|
|
|
2661
|
-
_query_params.append(('collection_id', collection_id))
|
|
3629
|
+
_query_params.append(('collection_id', collection_id))
|
|
3630
|
+
|
|
3631
|
+
if follow_links is not None:
|
|
3632
|
+
|
|
3633
|
+
_query_params.append(('follow_links', follow_links))
|
|
3634
|
+
|
|
3635
|
+
if max_depth is not None:
|
|
3636
|
+
|
|
3637
|
+
_query_params.append(('max_depth', max_depth))
|
|
3638
|
+
|
|
3639
|
+
if max_documents is not None:
|
|
3640
|
+
|
|
3641
|
+
_query_params.append(('max_documents', max_documents))
|
|
2662
3642
|
|
|
2663
3643
|
if gen_doc_summaries is not None:
|
|
2664
3644
|
|
|
@@ -2703,8 +3683,8 @@ class DocumentIngestionApi:
|
|
|
2703
3683
|
# process the header parameters
|
|
2704
3684
|
# process the form parameters
|
|
2705
3685
|
# process the body parameter
|
|
2706
|
-
if
|
|
2707
|
-
_body_params =
|
|
3686
|
+
if ingest_from_website_body is not None:
|
|
3687
|
+
_body_params = ingest_from_website_body
|
|
2708
3688
|
|
|
2709
3689
|
|
|
2710
3690
|
# set the HTTP header `Accept`
|
|
@@ -2736,7 +3716,7 @@ class DocumentIngestionApi:
|
|
|
2736
3716
|
|
|
2737
3717
|
return self.api_client.param_serialize(
|
|
2738
3718
|
method='POST',
|
|
2739
|
-
resource_path='/ingest/
|
|
3719
|
+
resource_path='/ingest/website/job',
|
|
2740
3720
|
path_params=_path_params,
|
|
2741
3721
|
query_params=_query_params,
|
|
2742
3722
|
header_params=_header_params,
|
|
@@ -2753,13 +3733,10 @@ class DocumentIngestionApi:
|
|
|
2753
3733
|
|
|
2754
3734
|
|
|
2755
3735
|
@validate_call
|
|
2756
|
-
def
|
|
3736
|
+
def create_ingest_upload_job(
|
|
2757
3737
|
self,
|
|
3738
|
+
upload_ids: Annotated[List[StrictStr], Field(description="Id of uploaded document")],
|
|
2758
3739
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
2759
|
-
ingest_from_website_body: IngestFromWebsiteBody,
|
|
2760
|
-
follow_links: Annotated[Optional[StrictBool], Field(description="Whether to import all web pages linked from this URL will be imported. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.")] = None,
|
|
2761
|
-
max_depth: Annotated[Optional[StrictInt], Field(description="Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).")] = None,
|
|
2762
|
-
max_documents: Annotated[Optional[StrictInt], Field(description="Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).")] = None,
|
|
2763
3740
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
2764
3741
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
2765
3742
|
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
@@ -2769,7 +3746,10 @@ class DocumentIngestionApi:
|
|
|
2769
3746
|
chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
|
|
2770
3747
|
handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
|
|
2771
3748
|
ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
|
|
3749
|
+
restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
|
|
3750
|
+
permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
|
|
2772
3751
|
timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
|
|
3752
|
+
ingest_upload_body: Optional[IngestUploadBody] = None,
|
|
2773
3753
|
_request_timeout: Union[
|
|
2774
3754
|
None,
|
|
2775
3755
|
Annotated[StrictFloat, Field(gt=0)],
|
|
@@ -2783,20 +3763,14 @@ class DocumentIngestionApi:
|
|
|
2783
3763
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
2784
3764
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
2785
3765
|
) -> JobDetails:
|
|
2786
|
-
"""Creates a job to
|
|
3766
|
+
"""Creates a job to ingest uploaded document
|
|
2787
3767
|
|
|
2788
|
-
Creates a job to
|
|
3768
|
+
Creates a job to ingest uploaded document identified to a given collection
|
|
2789
3769
|
|
|
3770
|
+
:param upload_ids: Id of uploaded document (required)
|
|
3771
|
+
:type upload_ids: List[str]
|
|
2790
3772
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
2791
3773
|
:type collection_id: str
|
|
2792
|
-
:param ingest_from_website_body: (required)
|
|
2793
|
-
:type ingest_from_website_body: IngestFromWebsiteBody
|
|
2794
|
-
:param follow_links: Whether to import all web pages linked from this URL will be imported. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.
|
|
2795
|
-
:type follow_links: bool
|
|
2796
|
-
:param max_depth: Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).
|
|
2797
|
-
:type max_depth: int
|
|
2798
|
-
:param max_documents: Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).
|
|
2799
|
-
:type max_documents: int
|
|
2800
3774
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
2801
3775
|
:type gen_doc_summaries: bool
|
|
2802
3776
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
@@ -2815,8 +3789,14 @@ class DocumentIngestionApi:
|
|
|
2815
3789
|
:type handwriting_check: bool
|
|
2816
3790
|
:param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
|
|
2817
3791
|
:type ingest_mode: str
|
|
3792
|
+
:param restricted: Whether the document should be restricted only to certain users.
|
|
3793
|
+
:type restricted: bool
|
|
3794
|
+
:param permissions: The list of usernames having permissions to the document.
|
|
3795
|
+
:type permissions: List[str]
|
|
2818
3796
|
:param timeout: Timeout in seconds
|
|
2819
3797
|
:type timeout: float
|
|
3798
|
+
:param ingest_upload_body:
|
|
3799
|
+
:type ingest_upload_body: IngestUploadBody
|
|
2820
3800
|
:param _request_timeout: timeout setting for this request. If one
|
|
2821
3801
|
number provided, it will be total request
|
|
2822
3802
|
timeout. It can also be a pair (tuple) of
|
|
@@ -2839,12 +3819,9 @@ class DocumentIngestionApi:
|
|
|
2839
3819
|
:return: Returns the result object.
|
|
2840
3820
|
""" # noqa: E501
|
|
2841
3821
|
|
|
2842
|
-
_param = self.
|
|
3822
|
+
_param = self._create_ingest_upload_job_serialize(
|
|
3823
|
+
upload_ids=upload_ids,
|
|
2843
3824
|
collection_id=collection_id,
|
|
2844
|
-
ingest_from_website_body=ingest_from_website_body,
|
|
2845
|
-
follow_links=follow_links,
|
|
2846
|
-
max_depth=max_depth,
|
|
2847
|
-
max_documents=max_documents,
|
|
2848
3825
|
gen_doc_summaries=gen_doc_summaries,
|
|
2849
3826
|
gen_doc_questions=gen_doc_questions,
|
|
2850
3827
|
audio_input_language=audio_input_language,
|
|
@@ -2854,7 +3831,10 @@ class DocumentIngestionApi:
|
|
|
2854
3831
|
chunk_by_page=chunk_by_page,
|
|
2855
3832
|
handwriting_check=handwriting_check,
|
|
2856
3833
|
ingest_mode=ingest_mode,
|
|
3834
|
+
restricted=restricted,
|
|
3835
|
+
permissions=permissions,
|
|
2857
3836
|
timeout=timeout,
|
|
3837
|
+
ingest_upload_body=ingest_upload_body,
|
|
2858
3838
|
_request_auth=_request_auth,
|
|
2859
3839
|
_content_type=_content_type,
|
|
2860
3840
|
_headers=_headers,
|
|
@@ -2877,13 +3857,10 @@ class DocumentIngestionApi:
|
|
|
2877
3857
|
|
|
2878
3858
|
|
|
2879
3859
|
@validate_call
|
|
2880
|
-
def
|
|
3860
|
+
def create_ingest_upload_job_with_http_info(
|
|
2881
3861
|
self,
|
|
3862
|
+
upload_ids: Annotated[List[StrictStr], Field(description="Id of uploaded document")],
|
|
2882
3863
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
2883
|
-
ingest_from_website_body: IngestFromWebsiteBody,
|
|
2884
|
-
follow_links: Annotated[Optional[StrictBool], Field(description="Whether to import all web pages linked from this URL will be imported. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.")] = None,
|
|
2885
|
-
max_depth: Annotated[Optional[StrictInt], Field(description="Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).")] = None,
|
|
2886
|
-
max_documents: Annotated[Optional[StrictInt], Field(description="Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).")] = None,
|
|
2887
3864
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
2888
3865
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
2889
3866
|
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
@@ -2893,7 +3870,10 @@ class DocumentIngestionApi:
|
|
|
2893
3870
|
chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
|
|
2894
3871
|
handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
|
|
2895
3872
|
ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
|
|
3873
|
+
restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
|
|
3874
|
+
permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
|
|
2896
3875
|
timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
|
|
3876
|
+
ingest_upload_body: Optional[IngestUploadBody] = None,
|
|
2897
3877
|
_request_timeout: Union[
|
|
2898
3878
|
None,
|
|
2899
3879
|
Annotated[StrictFloat, Field(gt=0)],
|
|
@@ -2907,20 +3887,14 @@ class DocumentIngestionApi:
|
|
|
2907
3887
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
2908
3888
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
2909
3889
|
) -> ApiResponse[JobDetails]:
|
|
2910
|
-
"""Creates a job to
|
|
3890
|
+
"""Creates a job to ingest uploaded document
|
|
2911
3891
|
|
|
2912
|
-
Creates a job to
|
|
3892
|
+
Creates a job to ingest uploaded document identified to a given collection
|
|
2913
3893
|
|
|
3894
|
+
:param upload_ids: Id of uploaded document (required)
|
|
3895
|
+
:type upload_ids: List[str]
|
|
2914
3896
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
2915
3897
|
:type collection_id: str
|
|
2916
|
-
:param ingest_from_website_body: (required)
|
|
2917
|
-
:type ingest_from_website_body: IngestFromWebsiteBody
|
|
2918
|
-
:param follow_links: Whether to import all web pages linked from this URL will be imported. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.
|
|
2919
|
-
:type follow_links: bool
|
|
2920
|
-
:param max_depth: Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).
|
|
2921
|
-
:type max_depth: int
|
|
2922
|
-
:param max_documents: Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).
|
|
2923
|
-
:type max_documents: int
|
|
2924
3898
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
2925
3899
|
:type gen_doc_summaries: bool
|
|
2926
3900
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
@@ -2939,8 +3913,14 @@ class DocumentIngestionApi:
|
|
|
2939
3913
|
:type handwriting_check: bool
|
|
2940
3914
|
:param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
|
|
2941
3915
|
:type ingest_mode: str
|
|
3916
|
+
:param restricted: Whether the document should be restricted only to certain users.
|
|
3917
|
+
:type restricted: bool
|
|
3918
|
+
:param permissions: The list of usernames having permissions to the document.
|
|
3919
|
+
:type permissions: List[str]
|
|
2942
3920
|
:param timeout: Timeout in seconds
|
|
2943
3921
|
:type timeout: float
|
|
3922
|
+
:param ingest_upload_body:
|
|
3923
|
+
:type ingest_upload_body: IngestUploadBody
|
|
2944
3924
|
:param _request_timeout: timeout setting for this request. If one
|
|
2945
3925
|
number provided, it will be total request
|
|
2946
3926
|
timeout. It can also be a pair (tuple) of
|
|
@@ -2963,12 +3943,9 @@ class DocumentIngestionApi:
|
|
|
2963
3943
|
:return: Returns the result object.
|
|
2964
3944
|
""" # noqa: E501
|
|
2965
3945
|
|
|
2966
|
-
_param = self.
|
|
3946
|
+
_param = self._create_ingest_upload_job_serialize(
|
|
3947
|
+
upload_ids=upload_ids,
|
|
2967
3948
|
collection_id=collection_id,
|
|
2968
|
-
ingest_from_website_body=ingest_from_website_body,
|
|
2969
|
-
follow_links=follow_links,
|
|
2970
|
-
max_depth=max_depth,
|
|
2971
|
-
max_documents=max_documents,
|
|
2972
3949
|
gen_doc_summaries=gen_doc_summaries,
|
|
2973
3950
|
gen_doc_questions=gen_doc_questions,
|
|
2974
3951
|
audio_input_language=audio_input_language,
|
|
@@ -2978,7 +3955,10 @@ class DocumentIngestionApi:
|
|
|
2978
3955
|
chunk_by_page=chunk_by_page,
|
|
2979
3956
|
handwriting_check=handwriting_check,
|
|
2980
3957
|
ingest_mode=ingest_mode,
|
|
3958
|
+
restricted=restricted,
|
|
3959
|
+
permissions=permissions,
|
|
2981
3960
|
timeout=timeout,
|
|
3961
|
+
ingest_upload_body=ingest_upload_body,
|
|
2982
3962
|
_request_auth=_request_auth,
|
|
2983
3963
|
_content_type=_content_type,
|
|
2984
3964
|
_headers=_headers,
|
|
@@ -3001,13 +3981,10 @@ class DocumentIngestionApi:
|
|
|
3001
3981
|
|
|
3002
3982
|
|
|
3003
3983
|
@validate_call
|
|
3004
|
-
def
|
|
3984
|
+
def create_ingest_upload_job_without_preload_content(
|
|
3005
3985
|
self,
|
|
3986
|
+
upload_ids: Annotated[List[StrictStr], Field(description="Id of uploaded document")],
|
|
3006
3987
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
3007
|
-
ingest_from_website_body: IngestFromWebsiteBody,
|
|
3008
|
-
follow_links: Annotated[Optional[StrictBool], Field(description="Whether to import all web pages linked from this URL will be imported. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.")] = None,
|
|
3009
|
-
max_depth: Annotated[Optional[StrictInt], Field(description="Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).")] = None,
|
|
3010
|
-
max_documents: Annotated[Optional[StrictInt], Field(description="Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).")] = None,
|
|
3011
3988
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
3012
3989
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
3013
3990
|
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
@@ -3017,7 +3994,10 @@ class DocumentIngestionApi:
|
|
|
3017
3994
|
chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
|
|
3018
3995
|
handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
|
|
3019
3996
|
ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
|
|
3997
|
+
restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
|
|
3998
|
+
permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
|
|
3020
3999
|
timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
|
|
4000
|
+
ingest_upload_body: Optional[IngestUploadBody] = None,
|
|
3021
4001
|
_request_timeout: Union[
|
|
3022
4002
|
None,
|
|
3023
4003
|
Annotated[StrictFloat, Field(gt=0)],
|
|
@@ -3031,20 +4011,14 @@ class DocumentIngestionApi:
|
|
|
3031
4011
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
3032
4012
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
3033
4013
|
) -> RESTResponseType:
|
|
3034
|
-
"""Creates a job to
|
|
4014
|
+
"""Creates a job to ingest uploaded document
|
|
3035
4015
|
|
|
3036
|
-
Creates a job to
|
|
4016
|
+
Creates a job to ingest uploaded document identified to a given collection
|
|
3037
4017
|
|
|
4018
|
+
:param upload_ids: Id of uploaded document (required)
|
|
4019
|
+
:type upload_ids: List[str]
|
|
3038
4020
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
3039
4021
|
:type collection_id: str
|
|
3040
|
-
:param ingest_from_website_body: (required)
|
|
3041
|
-
:type ingest_from_website_body: IngestFromWebsiteBody
|
|
3042
|
-
:param follow_links: Whether to import all web pages linked from this URL will be imported. External links will be ignored. Links to other pages on the same domain will be followed as long as they are at the same level or below the URL you specify. Each page will be transformed into a PDF document.
|
|
3043
|
-
:type follow_links: bool
|
|
3044
|
-
:param max_depth: Max depth of recursion when following links, only when follow_links is `true`. Max_depth of 0 means don't follow any links, max_depth of 1 means follow only top-level links, etc. Use -1 for automatic (system settings).
|
|
3045
|
-
:type max_depth: int
|
|
3046
|
-
:param max_documents: Max number of documents when following links, only when follow_links is `true`. Use None for automatic (system defaults). Use -1 for max (system limit).
|
|
3047
|
-
:type max_documents: int
|
|
3048
4022
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
3049
4023
|
:type gen_doc_summaries: bool
|
|
3050
4024
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
@@ -3063,8 +4037,14 @@ class DocumentIngestionApi:
|
|
|
3063
4037
|
:type handwriting_check: bool
|
|
3064
4038
|
:param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
|
|
3065
4039
|
:type ingest_mode: str
|
|
4040
|
+
:param restricted: Whether the document should be restricted only to certain users.
|
|
4041
|
+
:type restricted: bool
|
|
4042
|
+
:param permissions: The list of usernames having permissions to the document.
|
|
4043
|
+
:type permissions: List[str]
|
|
3066
4044
|
:param timeout: Timeout in seconds
|
|
3067
4045
|
:type timeout: float
|
|
4046
|
+
:param ingest_upload_body:
|
|
4047
|
+
:type ingest_upload_body: IngestUploadBody
|
|
3068
4048
|
:param _request_timeout: timeout setting for this request. If one
|
|
3069
4049
|
number provided, it will be total request
|
|
3070
4050
|
timeout. It can also be a pair (tuple) of
|
|
@@ -3087,12 +4067,9 @@ class DocumentIngestionApi:
|
|
|
3087
4067
|
:return: Returns the result object.
|
|
3088
4068
|
""" # noqa: E501
|
|
3089
4069
|
|
|
3090
|
-
_param = self.
|
|
4070
|
+
_param = self._create_ingest_upload_job_serialize(
|
|
4071
|
+
upload_ids=upload_ids,
|
|
3091
4072
|
collection_id=collection_id,
|
|
3092
|
-
ingest_from_website_body=ingest_from_website_body,
|
|
3093
|
-
follow_links=follow_links,
|
|
3094
|
-
max_depth=max_depth,
|
|
3095
|
-
max_documents=max_documents,
|
|
3096
4073
|
gen_doc_summaries=gen_doc_summaries,
|
|
3097
4074
|
gen_doc_questions=gen_doc_questions,
|
|
3098
4075
|
audio_input_language=audio_input_language,
|
|
@@ -3102,7 +4079,10 @@ class DocumentIngestionApi:
|
|
|
3102
4079
|
chunk_by_page=chunk_by_page,
|
|
3103
4080
|
handwriting_check=handwriting_check,
|
|
3104
4081
|
ingest_mode=ingest_mode,
|
|
4082
|
+
restricted=restricted,
|
|
4083
|
+
permissions=permissions,
|
|
3105
4084
|
timeout=timeout,
|
|
4085
|
+
ingest_upload_body=ingest_upload_body,
|
|
3106
4086
|
_request_auth=_request_auth,
|
|
3107
4087
|
_content_type=_content_type,
|
|
3108
4088
|
_headers=_headers,
|
|
@@ -3120,13 +4100,10 @@ class DocumentIngestionApi:
|
|
|
3120
4100
|
return response_data.response
|
|
3121
4101
|
|
|
3122
4102
|
|
|
3123
|
-
def
|
|
4103
|
+
def _create_ingest_upload_job_serialize(
|
|
3124
4104
|
self,
|
|
4105
|
+
upload_ids,
|
|
3125
4106
|
collection_id,
|
|
3126
|
-
ingest_from_website_body,
|
|
3127
|
-
follow_links,
|
|
3128
|
-
max_depth,
|
|
3129
|
-
max_documents,
|
|
3130
4107
|
gen_doc_summaries,
|
|
3131
4108
|
gen_doc_questions,
|
|
3132
4109
|
audio_input_language,
|
|
@@ -3136,7 +4113,10 @@ class DocumentIngestionApi:
|
|
|
3136
4113
|
chunk_by_page,
|
|
3137
4114
|
handwriting_check,
|
|
3138
4115
|
ingest_mode,
|
|
4116
|
+
restricted,
|
|
4117
|
+
permissions,
|
|
3139
4118
|
timeout,
|
|
4119
|
+
ingest_upload_body,
|
|
3140
4120
|
_request_auth,
|
|
3141
4121
|
_content_type,
|
|
3142
4122
|
_headers,
|
|
@@ -3146,6 +4126,8 @@ class DocumentIngestionApi:
|
|
|
3146
4126
|
_host = None
|
|
3147
4127
|
|
|
3148
4128
|
_collection_formats: Dict[str, str] = {
|
|
4129
|
+
'upload_ids': 'csv',
|
|
4130
|
+
'permissions': 'multi',
|
|
3149
4131
|
}
|
|
3150
4132
|
|
|
3151
4133
|
_path_params: Dict[str, str] = {}
|
|
@@ -3158,23 +4140,13 @@ class DocumentIngestionApi:
|
|
|
3158
4140
|
_body_params: Optional[bytes] = None
|
|
3159
4141
|
|
|
3160
4142
|
# process the path parameters
|
|
4143
|
+
if upload_ids is not None:
|
|
4144
|
+
_path_params['upload_ids'] = upload_ids
|
|
3161
4145
|
# process the query parameters
|
|
3162
4146
|
if collection_id is not None:
|
|
3163
4147
|
|
|
3164
4148
|
_query_params.append(('collection_id', collection_id))
|
|
3165
4149
|
|
|
3166
|
-
if follow_links is not None:
|
|
3167
|
-
|
|
3168
|
-
_query_params.append(('follow_links', follow_links))
|
|
3169
|
-
|
|
3170
|
-
if max_depth is not None:
|
|
3171
|
-
|
|
3172
|
-
_query_params.append(('max_depth', max_depth))
|
|
3173
|
-
|
|
3174
|
-
if max_documents is not None:
|
|
3175
|
-
|
|
3176
|
-
_query_params.append(('max_documents', max_documents))
|
|
3177
|
-
|
|
3178
4150
|
if gen_doc_summaries is not None:
|
|
3179
4151
|
|
|
3180
4152
|
_query_params.append(('gen_doc_summaries', gen_doc_summaries))
|
|
@@ -3211,6 +4183,14 @@ class DocumentIngestionApi:
|
|
|
3211
4183
|
|
|
3212
4184
|
_query_params.append(('ingest_mode', ingest_mode))
|
|
3213
4185
|
|
|
4186
|
+
if restricted is not None:
|
|
4187
|
+
|
|
4188
|
+
_query_params.append(('restricted', restricted))
|
|
4189
|
+
|
|
4190
|
+
if permissions is not None:
|
|
4191
|
+
|
|
4192
|
+
_query_params.append(('permissions', permissions))
|
|
4193
|
+
|
|
3214
4194
|
if timeout is not None:
|
|
3215
4195
|
|
|
3216
4196
|
_query_params.append(('timeout', timeout))
|
|
@@ -3218,8 +4198,8 @@ class DocumentIngestionApi:
|
|
|
3218
4198
|
# process the header parameters
|
|
3219
4199
|
# process the form parameters
|
|
3220
4200
|
# process the body parameter
|
|
3221
|
-
if
|
|
3222
|
-
_body_params =
|
|
4201
|
+
if ingest_upload_body is not None:
|
|
4202
|
+
_body_params = ingest_upload_body
|
|
3223
4203
|
|
|
3224
4204
|
|
|
3225
4205
|
# set the HTTP header `Accept`
|
|
@@ -3251,7 +4231,7 @@ class DocumentIngestionApi:
|
|
|
3251
4231
|
|
|
3252
4232
|
return self.api_client.param_serialize(
|
|
3253
4233
|
method='POST',
|
|
3254
|
-
resource_path='/ingest/
|
|
4234
|
+
resource_path='/uploads/{upload_ids}/ingest/job',
|
|
3255
4235
|
path_params=_path_params,
|
|
3256
4236
|
query_params=_query_params,
|
|
3257
4237
|
header_params=_header_params,
|
|
@@ -3268,23 +4248,21 @@ class DocumentIngestionApi:
|
|
|
3268
4248
|
|
|
3269
4249
|
|
|
3270
4250
|
@validate_call
|
|
3271
|
-
def
|
|
4251
|
+
def ingest_agent_only_to_standard(
|
|
3272
4252
|
self,
|
|
3273
|
-
upload_ids: Annotated[List[StrictStr], Field(description="Id of uploaded document")],
|
|
3274
4253
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
4254
|
+
document_id: Annotated[StrictStr, Field(description="String id of the document to be parsed.")],
|
|
3275
4255
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
3276
4256
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
3277
4257
|
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
3278
4258
|
ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
|
|
4259
|
+
restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
|
|
4260
|
+
permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
|
|
3279
4261
|
tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
|
|
3280
4262
|
keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
|
|
3281
4263
|
chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
|
|
3282
4264
|
handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
|
|
3283
|
-
ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
|
|
3284
|
-
restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
|
|
3285
|
-
permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
|
|
3286
4265
|
timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
|
|
3287
|
-
ingest_upload_body: Optional[IngestUploadBody] = None,
|
|
3288
4266
|
_request_timeout: Union[
|
|
3289
4267
|
None,
|
|
3290
4268
|
Annotated[StrictFloat, Field(gt=0)],
|
|
@@ -3297,15 +4275,15 @@ class DocumentIngestionApi:
|
|
|
3297
4275
|
_content_type: Optional[StrictStr] = None,
|
|
3298
4276
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
3299
4277
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
3300
|
-
) ->
|
|
3301
|
-
"""
|
|
4278
|
+
) -> None:
|
|
4279
|
+
"""Converts files uploaded in \"agent_only\" ingest mode to PDF and parses them.
|
|
3302
4280
|
|
|
3303
|
-
|
|
4281
|
+
Converts files uploaded in \"agent_only\" ingest mode to PDF and parses them.
|
|
3304
4282
|
|
|
3305
|
-
:param upload_ids: Id of uploaded document (required)
|
|
3306
|
-
:type upload_ids: List[str]
|
|
3307
4283
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
3308
4284
|
:type collection_id: str
|
|
4285
|
+
:param document_id: String id of the document to be parsed. (required)
|
|
4286
|
+
:type document_id: str
|
|
3309
4287
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
3310
4288
|
:type gen_doc_summaries: bool
|
|
3311
4289
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
@@ -3314,6 +4292,10 @@ class DocumentIngestionApi:
|
|
|
3314
4292
|
:type audio_input_language: str
|
|
3315
4293
|
:param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
|
|
3316
4294
|
:type ocr_model: str
|
|
4295
|
+
:param restricted: Whether the document should be restricted only to certain users.
|
|
4296
|
+
:type restricted: bool
|
|
4297
|
+
:param permissions: The list of usernames having permissions to the document.
|
|
4298
|
+
:type permissions: List[str]
|
|
3317
4299
|
:param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
|
|
3318
4300
|
:type tesseract_lang: str
|
|
3319
4301
|
:param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
|
|
@@ -3322,16 +4304,8 @@ class DocumentIngestionApi:
|
|
|
3322
4304
|
:type chunk_by_page: bool
|
|
3323
4305
|
:param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
|
|
3324
4306
|
:type handwriting_check: bool
|
|
3325
|
-
:param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
|
|
3326
|
-
:type ingest_mode: str
|
|
3327
|
-
:param restricted: Whether the document should be restricted only to certain users.
|
|
3328
|
-
:type restricted: bool
|
|
3329
|
-
:param permissions: The list of usernames having permissions to the document.
|
|
3330
|
-
:type permissions: List[str]
|
|
3331
4307
|
:param timeout: Timeout in seconds
|
|
3332
4308
|
:type timeout: float
|
|
3333
|
-
:param ingest_upload_body:
|
|
3334
|
-
:type ingest_upload_body: IngestUploadBody
|
|
3335
4309
|
:param _request_timeout: timeout setting for this request. If one
|
|
3336
4310
|
number provided, it will be total request
|
|
3337
4311
|
timeout. It can also be a pair (tuple) of
|
|
@@ -3354,22 +4328,20 @@ class DocumentIngestionApi:
|
|
|
3354
4328
|
:return: Returns the result object.
|
|
3355
4329
|
""" # noqa: E501
|
|
3356
4330
|
|
|
3357
|
-
_param = self.
|
|
3358
|
-
upload_ids=upload_ids,
|
|
4331
|
+
_param = self._ingest_agent_only_to_standard_serialize(
|
|
3359
4332
|
collection_id=collection_id,
|
|
4333
|
+
document_id=document_id,
|
|
3360
4334
|
gen_doc_summaries=gen_doc_summaries,
|
|
3361
4335
|
gen_doc_questions=gen_doc_questions,
|
|
3362
4336
|
audio_input_language=audio_input_language,
|
|
3363
4337
|
ocr_model=ocr_model,
|
|
4338
|
+
restricted=restricted,
|
|
4339
|
+
permissions=permissions,
|
|
3364
4340
|
tesseract_lang=tesseract_lang,
|
|
3365
4341
|
keep_tables_as_one_chunk=keep_tables_as_one_chunk,
|
|
3366
4342
|
chunk_by_page=chunk_by_page,
|
|
3367
4343
|
handwriting_check=handwriting_check,
|
|
3368
|
-
ingest_mode=ingest_mode,
|
|
3369
|
-
restricted=restricted,
|
|
3370
|
-
permissions=permissions,
|
|
3371
4344
|
timeout=timeout,
|
|
3372
|
-
ingest_upload_body=ingest_upload_body,
|
|
3373
4345
|
_request_auth=_request_auth,
|
|
3374
4346
|
_content_type=_content_type,
|
|
3375
4347
|
_headers=_headers,
|
|
@@ -3377,7 +4349,7 @@ class DocumentIngestionApi:
|
|
|
3377
4349
|
)
|
|
3378
4350
|
|
|
3379
4351
|
_response_types_map: Dict[str, Optional[str]] = {
|
|
3380
|
-
'
|
|
4352
|
+
'204': None,
|
|
3381
4353
|
'401': "EndpointError",
|
|
3382
4354
|
}
|
|
3383
4355
|
response_data = self.api_client.call_api(
|
|
@@ -3392,23 +4364,21 @@ class DocumentIngestionApi:
|
|
|
3392
4364
|
|
|
3393
4365
|
|
|
3394
4366
|
@validate_call
|
|
3395
|
-
def
|
|
4367
|
+
def ingest_agent_only_to_standard_with_http_info(
|
|
3396
4368
|
self,
|
|
3397
|
-
upload_ids: Annotated[List[StrictStr], Field(description="Id of uploaded document")],
|
|
3398
4369
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
4370
|
+
document_id: Annotated[StrictStr, Field(description="String id of the document to be parsed.")],
|
|
3399
4371
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
3400
4372
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
3401
4373
|
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
3402
4374
|
ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
|
|
4375
|
+
restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
|
|
4376
|
+
permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
|
|
3403
4377
|
tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
|
|
3404
4378
|
keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
|
|
3405
4379
|
chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
|
|
3406
4380
|
handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
|
|
3407
|
-
ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
|
|
3408
|
-
restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
|
|
3409
|
-
permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
|
|
3410
4381
|
timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
|
|
3411
|
-
ingest_upload_body: Optional[IngestUploadBody] = None,
|
|
3412
4382
|
_request_timeout: Union[
|
|
3413
4383
|
None,
|
|
3414
4384
|
Annotated[StrictFloat, Field(gt=0)],
|
|
@@ -3421,15 +4391,15 @@ class DocumentIngestionApi:
|
|
|
3421
4391
|
_content_type: Optional[StrictStr] = None,
|
|
3422
4392
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
3423
4393
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
3424
|
-
) -> ApiResponse[
|
|
3425
|
-
"""
|
|
4394
|
+
) -> ApiResponse[None]:
|
|
4395
|
+
"""Converts files uploaded in \"agent_only\" ingest mode to PDF and parses them.
|
|
3426
4396
|
|
|
3427
|
-
|
|
4397
|
+
Converts files uploaded in \"agent_only\" ingest mode to PDF and parses them.
|
|
3428
4398
|
|
|
3429
|
-
:param upload_ids: Id of uploaded document (required)
|
|
3430
|
-
:type upload_ids: List[str]
|
|
3431
4399
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
3432
4400
|
:type collection_id: str
|
|
4401
|
+
:param document_id: String id of the document to be parsed. (required)
|
|
4402
|
+
:type document_id: str
|
|
3433
4403
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
3434
4404
|
:type gen_doc_summaries: bool
|
|
3435
4405
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
@@ -3438,6 +4408,10 @@ class DocumentIngestionApi:
|
|
|
3438
4408
|
:type audio_input_language: str
|
|
3439
4409
|
:param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
|
|
3440
4410
|
:type ocr_model: str
|
|
4411
|
+
:param restricted: Whether the document should be restricted only to certain users.
|
|
4412
|
+
:type restricted: bool
|
|
4413
|
+
:param permissions: The list of usernames having permissions to the document.
|
|
4414
|
+
:type permissions: List[str]
|
|
3441
4415
|
:param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
|
|
3442
4416
|
:type tesseract_lang: str
|
|
3443
4417
|
:param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
|
|
@@ -3446,16 +4420,8 @@ class DocumentIngestionApi:
|
|
|
3446
4420
|
:type chunk_by_page: bool
|
|
3447
4421
|
:param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
|
|
3448
4422
|
:type handwriting_check: bool
|
|
3449
|
-
:param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
|
|
3450
|
-
:type ingest_mode: str
|
|
3451
|
-
:param restricted: Whether the document should be restricted only to certain users.
|
|
3452
|
-
:type restricted: bool
|
|
3453
|
-
:param permissions: The list of usernames having permissions to the document.
|
|
3454
|
-
:type permissions: List[str]
|
|
3455
4423
|
:param timeout: Timeout in seconds
|
|
3456
4424
|
:type timeout: float
|
|
3457
|
-
:param ingest_upload_body:
|
|
3458
|
-
:type ingest_upload_body: IngestUploadBody
|
|
3459
4425
|
:param _request_timeout: timeout setting for this request. If one
|
|
3460
4426
|
number provided, it will be total request
|
|
3461
4427
|
timeout. It can also be a pair (tuple) of
|
|
@@ -3478,22 +4444,20 @@ class DocumentIngestionApi:
|
|
|
3478
4444
|
:return: Returns the result object.
|
|
3479
4445
|
""" # noqa: E501
|
|
3480
4446
|
|
|
3481
|
-
_param = self.
|
|
3482
|
-
upload_ids=upload_ids,
|
|
4447
|
+
_param = self._ingest_agent_only_to_standard_serialize(
|
|
3483
4448
|
collection_id=collection_id,
|
|
4449
|
+
document_id=document_id,
|
|
3484
4450
|
gen_doc_summaries=gen_doc_summaries,
|
|
3485
4451
|
gen_doc_questions=gen_doc_questions,
|
|
3486
4452
|
audio_input_language=audio_input_language,
|
|
3487
4453
|
ocr_model=ocr_model,
|
|
4454
|
+
restricted=restricted,
|
|
4455
|
+
permissions=permissions,
|
|
3488
4456
|
tesseract_lang=tesseract_lang,
|
|
3489
4457
|
keep_tables_as_one_chunk=keep_tables_as_one_chunk,
|
|
3490
4458
|
chunk_by_page=chunk_by_page,
|
|
3491
4459
|
handwriting_check=handwriting_check,
|
|
3492
|
-
ingest_mode=ingest_mode,
|
|
3493
|
-
restricted=restricted,
|
|
3494
|
-
permissions=permissions,
|
|
3495
4460
|
timeout=timeout,
|
|
3496
|
-
ingest_upload_body=ingest_upload_body,
|
|
3497
4461
|
_request_auth=_request_auth,
|
|
3498
4462
|
_content_type=_content_type,
|
|
3499
4463
|
_headers=_headers,
|
|
@@ -3501,7 +4465,7 @@ class DocumentIngestionApi:
|
|
|
3501
4465
|
)
|
|
3502
4466
|
|
|
3503
4467
|
_response_types_map: Dict[str, Optional[str]] = {
|
|
3504
|
-
'
|
|
4468
|
+
'204': None,
|
|
3505
4469
|
'401': "EndpointError",
|
|
3506
4470
|
}
|
|
3507
4471
|
response_data = self.api_client.call_api(
|
|
@@ -3516,23 +4480,21 @@ class DocumentIngestionApi:
|
|
|
3516
4480
|
|
|
3517
4481
|
|
|
3518
4482
|
@validate_call
|
|
3519
|
-
def
|
|
4483
|
+
def ingest_agent_only_to_standard_without_preload_content(
|
|
3520
4484
|
self,
|
|
3521
|
-
upload_ids: Annotated[List[StrictStr], Field(description="Id of uploaded document")],
|
|
3522
4485
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
4486
|
+
document_id: Annotated[StrictStr, Field(description="String id of the document to be parsed.")],
|
|
3523
4487
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
3524
4488
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
3525
4489
|
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
3526
4490
|
ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
|
|
4491
|
+
restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
|
|
4492
|
+
permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
|
|
3527
4493
|
tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
|
|
3528
4494
|
keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
|
|
3529
4495
|
chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
|
|
3530
4496
|
handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
|
|
3531
|
-
ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
|
|
3532
|
-
restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
|
|
3533
|
-
permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
|
|
3534
4497
|
timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
|
|
3535
|
-
ingest_upload_body: Optional[IngestUploadBody] = None,
|
|
3536
4498
|
_request_timeout: Union[
|
|
3537
4499
|
None,
|
|
3538
4500
|
Annotated[StrictFloat, Field(gt=0)],
|
|
@@ -3546,14 +4508,14 @@ class DocumentIngestionApi:
|
|
|
3546
4508
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
3547
4509
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
3548
4510
|
) -> RESTResponseType:
|
|
3549
|
-
"""
|
|
4511
|
+
"""Converts files uploaded in \"agent_only\" ingest mode to PDF and parses them.
|
|
3550
4512
|
|
|
3551
|
-
|
|
4513
|
+
Converts files uploaded in \"agent_only\" ingest mode to PDF and parses them.
|
|
3552
4514
|
|
|
3553
|
-
:param upload_ids: Id of uploaded document (required)
|
|
3554
|
-
:type upload_ids: List[str]
|
|
3555
4515
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
3556
4516
|
:type collection_id: str
|
|
4517
|
+
:param document_id: String id of the document to be parsed. (required)
|
|
4518
|
+
:type document_id: str
|
|
3557
4519
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
3558
4520
|
:type gen_doc_summaries: bool
|
|
3559
4521
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
@@ -3562,6 +4524,10 @@ class DocumentIngestionApi:
|
|
|
3562
4524
|
:type audio_input_language: str
|
|
3563
4525
|
:param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
|
|
3564
4526
|
:type ocr_model: str
|
|
4527
|
+
:param restricted: Whether the document should be restricted only to certain users.
|
|
4528
|
+
:type restricted: bool
|
|
4529
|
+
:param permissions: The list of usernames having permissions to the document.
|
|
4530
|
+
:type permissions: List[str]
|
|
3565
4531
|
:param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
|
|
3566
4532
|
:type tesseract_lang: str
|
|
3567
4533
|
:param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
|
|
@@ -3570,16 +4536,8 @@ class DocumentIngestionApi:
|
|
|
3570
4536
|
:type chunk_by_page: bool
|
|
3571
4537
|
:param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
|
|
3572
4538
|
:type handwriting_check: bool
|
|
3573
|
-
:param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
|
|
3574
|
-
:type ingest_mode: str
|
|
3575
|
-
:param restricted: Whether the document should be restricted only to certain users.
|
|
3576
|
-
:type restricted: bool
|
|
3577
|
-
:param permissions: The list of usernames having permissions to the document.
|
|
3578
|
-
:type permissions: List[str]
|
|
3579
4539
|
:param timeout: Timeout in seconds
|
|
3580
4540
|
:type timeout: float
|
|
3581
|
-
:param ingest_upload_body:
|
|
3582
|
-
:type ingest_upload_body: IngestUploadBody
|
|
3583
4541
|
:param _request_timeout: timeout setting for this request. If one
|
|
3584
4542
|
number provided, it will be total request
|
|
3585
4543
|
timeout. It can also be a pair (tuple) of
|
|
@@ -3602,22 +4560,20 @@ class DocumentIngestionApi:
|
|
|
3602
4560
|
:return: Returns the result object.
|
|
3603
4561
|
""" # noqa: E501
|
|
3604
4562
|
|
|
3605
|
-
_param = self.
|
|
3606
|
-
upload_ids=upload_ids,
|
|
4563
|
+
_param = self._ingest_agent_only_to_standard_serialize(
|
|
3607
4564
|
collection_id=collection_id,
|
|
4565
|
+
document_id=document_id,
|
|
3608
4566
|
gen_doc_summaries=gen_doc_summaries,
|
|
3609
4567
|
gen_doc_questions=gen_doc_questions,
|
|
3610
4568
|
audio_input_language=audio_input_language,
|
|
3611
4569
|
ocr_model=ocr_model,
|
|
4570
|
+
restricted=restricted,
|
|
4571
|
+
permissions=permissions,
|
|
3612
4572
|
tesseract_lang=tesseract_lang,
|
|
3613
4573
|
keep_tables_as_one_chunk=keep_tables_as_one_chunk,
|
|
3614
4574
|
chunk_by_page=chunk_by_page,
|
|
3615
4575
|
handwriting_check=handwriting_check,
|
|
3616
|
-
ingest_mode=ingest_mode,
|
|
3617
|
-
restricted=restricted,
|
|
3618
|
-
permissions=permissions,
|
|
3619
4576
|
timeout=timeout,
|
|
3620
|
-
ingest_upload_body=ingest_upload_body,
|
|
3621
4577
|
_request_auth=_request_auth,
|
|
3622
4578
|
_content_type=_content_type,
|
|
3623
4579
|
_headers=_headers,
|
|
@@ -3625,7 +4581,7 @@ class DocumentIngestionApi:
|
|
|
3625
4581
|
)
|
|
3626
4582
|
|
|
3627
4583
|
_response_types_map: Dict[str, Optional[str]] = {
|
|
3628
|
-
'
|
|
4584
|
+
'204': None,
|
|
3629
4585
|
'401': "EndpointError",
|
|
3630
4586
|
}
|
|
3631
4587
|
response_data = self.api_client.call_api(
|
|
@@ -3635,23 +4591,21 @@ class DocumentIngestionApi:
|
|
|
3635
4591
|
return response_data.response
|
|
3636
4592
|
|
|
3637
4593
|
|
|
3638
|
-
def
|
|
4594
|
+
def _ingest_agent_only_to_standard_serialize(
|
|
3639
4595
|
self,
|
|
3640
|
-
upload_ids,
|
|
3641
4596
|
collection_id,
|
|
4597
|
+
document_id,
|
|
3642
4598
|
gen_doc_summaries,
|
|
3643
4599
|
gen_doc_questions,
|
|
3644
4600
|
audio_input_language,
|
|
3645
4601
|
ocr_model,
|
|
4602
|
+
restricted,
|
|
4603
|
+
permissions,
|
|
3646
4604
|
tesseract_lang,
|
|
3647
4605
|
keep_tables_as_one_chunk,
|
|
3648
4606
|
chunk_by_page,
|
|
3649
4607
|
handwriting_check,
|
|
3650
|
-
ingest_mode,
|
|
3651
|
-
restricted,
|
|
3652
|
-
permissions,
|
|
3653
4608
|
timeout,
|
|
3654
|
-
ingest_upload_body,
|
|
3655
4609
|
_request_auth,
|
|
3656
4610
|
_content_type,
|
|
3657
4611
|
_headers,
|
|
@@ -3661,7 +4615,6 @@ class DocumentIngestionApi:
|
|
|
3661
4615
|
_host = None
|
|
3662
4616
|
|
|
3663
4617
|
_collection_formats: Dict[str, str] = {
|
|
3664
|
-
'upload_ids': 'csv',
|
|
3665
4618
|
'permissions': 'multi',
|
|
3666
4619
|
}
|
|
3667
4620
|
|
|
@@ -3675,13 +4628,15 @@ class DocumentIngestionApi:
|
|
|
3675
4628
|
_body_params: Optional[bytes] = None
|
|
3676
4629
|
|
|
3677
4630
|
# process the path parameters
|
|
3678
|
-
if upload_ids is not None:
|
|
3679
|
-
_path_params['upload_ids'] = upload_ids
|
|
3680
4631
|
# process the query parameters
|
|
3681
4632
|
if collection_id is not None:
|
|
3682
4633
|
|
|
3683
4634
|
_query_params.append(('collection_id', collection_id))
|
|
3684
4635
|
|
|
4636
|
+
if document_id is not None:
|
|
4637
|
+
|
|
4638
|
+
_query_params.append(('document_id', document_id))
|
|
4639
|
+
|
|
3685
4640
|
if gen_doc_summaries is not None:
|
|
3686
4641
|
|
|
3687
4642
|
_query_params.append(('gen_doc_summaries', gen_doc_summaries))
|
|
@@ -3698,6 +4653,14 @@ class DocumentIngestionApi:
|
|
|
3698
4653
|
|
|
3699
4654
|
_query_params.append(('ocr_model', ocr_model))
|
|
3700
4655
|
|
|
4656
|
+
if restricted is not None:
|
|
4657
|
+
|
|
4658
|
+
_query_params.append(('restricted', restricted))
|
|
4659
|
+
|
|
4660
|
+
if permissions is not None:
|
|
4661
|
+
|
|
4662
|
+
_query_params.append(('permissions', permissions))
|
|
4663
|
+
|
|
3701
4664
|
if tesseract_lang is not None:
|
|
3702
4665
|
|
|
3703
4666
|
_query_params.append(('tesseract_lang', tesseract_lang))
|
|
@@ -3714,18 +4677,6 @@ class DocumentIngestionApi:
|
|
|
3714
4677
|
|
|
3715
4678
|
_query_params.append(('handwriting_check', handwriting_check))
|
|
3716
4679
|
|
|
3717
|
-
if ingest_mode is not None:
|
|
3718
|
-
|
|
3719
|
-
_query_params.append(('ingest_mode', ingest_mode))
|
|
3720
|
-
|
|
3721
|
-
if restricted is not None:
|
|
3722
|
-
|
|
3723
|
-
_query_params.append(('restricted', restricted))
|
|
3724
|
-
|
|
3725
|
-
if permissions is not None:
|
|
3726
|
-
|
|
3727
|
-
_query_params.append(('permissions', permissions))
|
|
3728
|
-
|
|
3729
4680
|
if timeout is not None:
|
|
3730
4681
|
|
|
3731
4682
|
_query_params.append(('timeout', timeout))
|
|
@@ -3733,8 +4684,6 @@ class DocumentIngestionApi:
|
|
|
3733
4684
|
# process the header parameters
|
|
3734
4685
|
# process the form parameters
|
|
3735
4686
|
# process the body parameter
|
|
3736
|
-
if ingest_upload_body is not None:
|
|
3737
|
-
_body_params = ingest_upload_body
|
|
3738
4687
|
|
|
3739
4688
|
|
|
3740
4689
|
# set the HTTP header `Accept`
|
|
@@ -3745,19 +4694,6 @@ class DocumentIngestionApi:
|
|
|
3745
4694
|
]
|
|
3746
4695
|
)
|
|
3747
4696
|
|
|
3748
|
-
# set the HTTP header `Content-Type`
|
|
3749
|
-
if _content_type:
|
|
3750
|
-
_header_params['Content-Type'] = _content_type
|
|
3751
|
-
else:
|
|
3752
|
-
_default_content_type = (
|
|
3753
|
-
self.api_client.select_header_content_type(
|
|
3754
|
-
[
|
|
3755
|
-
'application/json'
|
|
3756
|
-
]
|
|
3757
|
-
)
|
|
3758
|
-
)
|
|
3759
|
-
if _default_content_type is not None:
|
|
3760
|
-
_header_params['Content-Type'] = _default_content_type
|
|
3761
4697
|
|
|
3762
4698
|
# authentication setting
|
|
3763
4699
|
_auth_settings: List[str] = [
|
|
@@ -3766,7 +4702,7 @@ class DocumentIngestionApi:
|
|
|
3766
4702
|
|
|
3767
4703
|
return self.api_client.param_serialize(
|
|
3768
4704
|
method='POST',
|
|
3769
|
-
resource_path='/
|
|
4705
|
+
resource_path='/ingest/agent_only_to_standard',
|
|
3770
4706
|
path_params=_path_params,
|
|
3771
4707
|
query_params=_query_params,
|
|
3772
4708
|
header_params=_header_params,
|
|
@@ -3783,20 +4719,19 @@ class DocumentIngestionApi:
|
|
|
3783
4719
|
|
|
3784
4720
|
|
|
3785
4721
|
@validate_call
|
|
3786
|
-
def
|
|
4722
|
+
def ingest_from_azure_blob_storage(
|
|
3787
4723
|
self,
|
|
3788
4724
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
3789
|
-
|
|
4725
|
+
ingest_from_azure_blob_storage_body: IngestFromAzureBlobStorageBody,
|
|
3790
4726
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
3791
4727
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
3792
4728
|
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
3793
4729
|
ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
|
|
3794
|
-
restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
|
|
3795
|
-
permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
|
|
3796
4730
|
tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
|
|
3797
4731
|
keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
|
|
3798
4732
|
chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
|
|
3799
4733
|
handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
|
|
4734
|
+
ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
|
|
3800
4735
|
timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
|
|
3801
4736
|
_request_timeout: Union[
|
|
3802
4737
|
None,
|
|
@@ -3811,14 +4746,14 @@ class DocumentIngestionApi:
|
|
|
3811
4746
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
3812
4747
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
3813
4748
|
) -> None:
|
|
3814
|
-
"""
|
|
4749
|
+
"""Adds files from the Azure Blob Storage into a collection.
|
|
3815
4750
|
|
|
3816
|
-
|
|
4751
|
+
Adds files from the Azure Blob Storage into a collection.
|
|
3817
4752
|
|
|
3818
4753
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
3819
4754
|
:type collection_id: str
|
|
3820
|
-
:param
|
|
3821
|
-
:type
|
|
4755
|
+
:param ingest_from_azure_blob_storage_body: (required)
|
|
4756
|
+
:type ingest_from_azure_blob_storage_body: IngestFromAzureBlobStorageBody
|
|
3822
4757
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
3823
4758
|
:type gen_doc_summaries: bool
|
|
3824
4759
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
@@ -3827,10 +4762,6 @@ class DocumentIngestionApi:
|
|
|
3827
4762
|
:type audio_input_language: str
|
|
3828
4763
|
:param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
|
|
3829
4764
|
:type ocr_model: str
|
|
3830
|
-
:param restricted: Whether the document should be restricted only to certain users.
|
|
3831
|
-
:type restricted: bool
|
|
3832
|
-
:param permissions: The list of usernames having permissions to the document.
|
|
3833
|
-
:type permissions: List[str]
|
|
3834
4765
|
:param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
|
|
3835
4766
|
:type tesseract_lang: str
|
|
3836
4767
|
:param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
|
|
@@ -3839,6 +4770,8 @@ class DocumentIngestionApi:
|
|
|
3839
4770
|
:type chunk_by_page: bool
|
|
3840
4771
|
:param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
|
|
3841
4772
|
:type handwriting_check: bool
|
|
4773
|
+
:param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
|
|
4774
|
+
:type ingest_mode: str
|
|
3842
4775
|
:param timeout: Timeout in seconds
|
|
3843
4776
|
:type timeout: float
|
|
3844
4777
|
:param _request_timeout: timeout setting for this request. If one
|
|
@@ -3863,19 +4796,18 @@ class DocumentIngestionApi:
|
|
|
3863
4796
|
:return: Returns the result object.
|
|
3864
4797
|
""" # noqa: E501
|
|
3865
4798
|
|
|
3866
|
-
_param = self.
|
|
4799
|
+
_param = self._ingest_from_azure_blob_storage_serialize(
|
|
3867
4800
|
collection_id=collection_id,
|
|
3868
|
-
|
|
4801
|
+
ingest_from_azure_blob_storage_body=ingest_from_azure_blob_storage_body,
|
|
3869
4802
|
gen_doc_summaries=gen_doc_summaries,
|
|
3870
4803
|
gen_doc_questions=gen_doc_questions,
|
|
3871
4804
|
audio_input_language=audio_input_language,
|
|
3872
4805
|
ocr_model=ocr_model,
|
|
3873
|
-
restricted=restricted,
|
|
3874
|
-
permissions=permissions,
|
|
3875
4806
|
tesseract_lang=tesseract_lang,
|
|
3876
4807
|
keep_tables_as_one_chunk=keep_tables_as_one_chunk,
|
|
3877
4808
|
chunk_by_page=chunk_by_page,
|
|
3878
4809
|
handwriting_check=handwriting_check,
|
|
4810
|
+
ingest_mode=ingest_mode,
|
|
3879
4811
|
timeout=timeout,
|
|
3880
4812
|
_request_auth=_request_auth,
|
|
3881
4813
|
_content_type=_content_type,
|
|
@@ -3899,20 +4831,19 @@ class DocumentIngestionApi:
|
|
|
3899
4831
|
|
|
3900
4832
|
|
|
3901
4833
|
@validate_call
|
|
3902
|
-
def
|
|
4834
|
+
def ingest_from_azure_blob_storage_with_http_info(
|
|
3903
4835
|
self,
|
|
3904
4836
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
3905
|
-
|
|
4837
|
+
ingest_from_azure_blob_storage_body: IngestFromAzureBlobStorageBody,
|
|
3906
4838
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
3907
4839
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
3908
4840
|
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
3909
4841
|
ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
|
|
3910
|
-
restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
|
|
3911
|
-
permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
|
|
3912
4842
|
tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
|
|
3913
4843
|
keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
|
|
3914
4844
|
chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
|
|
3915
4845
|
handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
|
|
4846
|
+
ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
|
|
3916
4847
|
timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
|
|
3917
4848
|
_request_timeout: Union[
|
|
3918
4849
|
None,
|
|
@@ -3927,14 +4858,14 @@ class DocumentIngestionApi:
|
|
|
3927
4858
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
3928
4859
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
3929
4860
|
) -> ApiResponse[None]:
|
|
3930
|
-
"""
|
|
4861
|
+
"""Adds files from the Azure Blob Storage into a collection.
|
|
3931
4862
|
|
|
3932
|
-
|
|
4863
|
+
Adds files from the Azure Blob Storage into a collection.
|
|
3933
4864
|
|
|
3934
4865
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
3935
4866
|
:type collection_id: str
|
|
3936
|
-
:param
|
|
3937
|
-
:type
|
|
4867
|
+
:param ingest_from_azure_blob_storage_body: (required)
|
|
4868
|
+
:type ingest_from_azure_blob_storage_body: IngestFromAzureBlobStorageBody
|
|
3938
4869
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
3939
4870
|
:type gen_doc_summaries: bool
|
|
3940
4871
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
@@ -3943,10 +4874,6 @@ class DocumentIngestionApi:
|
|
|
3943
4874
|
:type audio_input_language: str
|
|
3944
4875
|
:param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
|
|
3945
4876
|
:type ocr_model: str
|
|
3946
|
-
:param restricted: Whether the document should be restricted only to certain users.
|
|
3947
|
-
:type restricted: bool
|
|
3948
|
-
:param permissions: The list of usernames having permissions to the document.
|
|
3949
|
-
:type permissions: List[str]
|
|
3950
4877
|
:param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
|
|
3951
4878
|
:type tesseract_lang: str
|
|
3952
4879
|
:param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
|
|
@@ -3955,6 +4882,8 @@ class DocumentIngestionApi:
|
|
|
3955
4882
|
:type chunk_by_page: bool
|
|
3956
4883
|
:param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
|
|
3957
4884
|
:type handwriting_check: bool
|
|
4885
|
+
:param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
|
|
4886
|
+
:type ingest_mode: str
|
|
3958
4887
|
:param timeout: Timeout in seconds
|
|
3959
4888
|
:type timeout: float
|
|
3960
4889
|
:param _request_timeout: timeout setting for this request. If one
|
|
@@ -3979,19 +4908,18 @@ class DocumentIngestionApi:
|
|
|
3979
4908
|
:return: Returns the result object.
|
|
3980
4909
|
""" # noqa: E501
|
|
3981
4910
|
|
|
3982
|
-
_param = self.
|
|
4911
|
+
_param = self._ingest_from_azure_blob_storage_serialize(
|
|
3983
4912
|
collection_id=collection_id,
|
|
3984
|
-
|
|
4913
|
+
ingest_from_azure_blob_storage_body=ingest_from_azure_blob_storage_body,
|
|
3985
4914
|
gen_doc_summaries=gen_doc_summaries,
|
|
3986
4915
|
gen_doc_questions=gen_doc_questions,
|
|
3987
4916
|
audio_input_language=audio_input_language,
|
|
3988
4917
|
ocr_model=ocr_model,
|
|
3989
|
-
restricted=restricted,
|
|
3990
|
-
permissions=permissions,
|
|
3991
4918
|
tesseract_lang=tesseract_lang,
|
|
3992
4919
|
keep_tables_as_one_chunk=keep_tables_as_one_chunk,
|
|
3993
4920
|
chunk_by_page=chunk_by_page,
|
|
3994
4921
|
handwriting_check=handwriting_check,
|
|
4922
|
+
ingest_mode=ingest_mode,
|
|
3995
4923
|
timeout=timeout,
|
|
3996
4924
|
_request_auth=_request_auth,
|
|
3997
4925
|
_content_type=_content_type,
|
|
@@ -4015,20 +4943,19 @@ class DocumentIngestionApi:
|
|
|
4015
4943
|
|
|
4016
4944
|
|
|
4017
4945
|
@validate_call
|
|
4018
|
-
def
|
|
4946
|
+
def ingest_from_azure_blob_storage_without_preload_content(
|
|
4019
4947
|
self,
|
|
4020
4948
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
4021
|
-
|
|
4949
|
+
ingest_from_azure_blob_storage_body: IngestFromAzureBlobStorageBody,
|
|
4022
4950
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
4023
4951
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
4024
4952
|
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
4025
4953
|
ocr_model: Annotated[Optional[StrictStr], Field(description="Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).")] = None,
|
|
4026
|
-
restricted: Annotated[Optional[StrictBool], Field(description="Whether the document should be restricted only to certain users.")] = None,
|
|
4027
|
-
permissions: Annotated[Optional[List[StrictStr]], Field(description="The list of usernames having permissions to the document.")] = None,
|
|
4028
4954
|
tesseract_lang: Annotated[Optional[StrictStr], Field(description="Which language to use when using ocr_model=\"tesseract\".")] = None,
|
|
4029
4955
|
keep_tables_as_one_chunk: Annotated[Optional[StrictBool], Field(description="When tables are identified by the table parser the table tokens will be kept in a single chunk.")] = None,
|
|
4030
4956
|
chunk_by_page: Annotated[Optional[StrictBool], Field(description="Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is `true`.")] = None,
|
|
4031
4957
|
handwriting_check: Annotated[Optional[StrictBool], Field(description="Check pages for handwriting. Will use specialized models if handwriting is found.")] = None,
|
|
4958
|
+
ingest_mode: Annotated[Optional[StrictStr], Field(description="Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.")] = None,
|
|
4032
4959
|
timeout: Annotated[Optional[Union[StrictFloat, StrictInt]], Field(description="Timeout in seconds")] = None,
|
|
4033
4960
|
_request_timeout: Union[
|
|
4034
4961
|
None,
|
|
@@ -4043,14 +4970,14 @@ class DocumentIngestionApi:
|
|
|
4043
4970
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
4044
4971
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
4045
4972
|
) -> RESTResponseType:
|
|
4046
|
-
"""
|
|
4973
|
+
"""Adds files from the Azure Blob Storage into a collection.
|
|
4047
4974
|
|
|
4048
|
-
|
|
4975
|
+
Adds files from the Azure Blob Storage into a collection.
|
|
4049
4976
|
|
|
4050
4977
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
4051
4978
|
:type collection_id: str
|
|
4052
|
-
:param
|
|
4053
|
-
:type
|
|
4979
|
+
:param ingest_from_azure_blob_storage_body: (required)
|
|
4980
|
+
:type ingest_from_azure_blob_storage_body: IngestFromAzureBlobStorageBody
|
|
4054
4981
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
4055
4982
|
:type gen_doc_summaries: bool
|
|
4056
4983
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
@@ -4059,10 +4986,6 @@ class DocumentIngestionApi:
|
|
|
4059
4986
|
:type audio_input_language: str
|
|
4060
4987
|
:param ocr_model: Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models. docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages. Mississippi works well on handwriting. - `auto` - Automatic will auto-select the best OCR model for every page. - `off` - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
|
|
4061
4988
|
:type ocr_model: str
|
|
4062
|
-
:param restricted: Whether the document should be restricted only to certain users.
|
|
4063
|
-
:type restricted: bool
|
|
4064
|
-
:param permissions: The list of usernames having permissions to the document.
|
|
4065
|
-
:type permissions: List[str]
|
|
4066
4989
|
:param tesseract_lang: Which language to use when using ocr_model=\"tesseract\".
|
|
4067
4990
|
:type tesseract_lang: str
|
|
4068
4991
|
:param keep_tables_as_one_chunk: When tables are identified by the table parser the table tokens will be kept in a single chunk.
|
|
@@ -4071,6 +4994,8 @@ class DocumentIngestionApi:
|
|
|
4071
4994
|
:type chunk_by_page: bool
|
|
4072
4995
|
:param handwriting_check: Check pages for handwriting. Will use specialized models if handwriting is found.
|
|
4073
4996
|
:type handwriting_check: bool
|
|
4997
|
+
:param ingest_mode: Ingest mode to use. - `standard` - Files will be ingested for use with RAG - `agent_only` - Bypasses standard ingestion. Files can only be used with agents.
|
|
4998
|
+
:type ingest_mode: str
|
|
4074
4999
|
:param timeout: Timeout in seconds
|
|
4075
5000
|
:type timeout: float
|
|
4076
5001
|
:param _request_timeout: timeout setting for this request. If one
|
|
@@ -4095,19 +5020,18 @@ class DocumentIngestionApi:
|
|
|
4095
5020
|
:return: Returns the result object.
|
|
4096
5021
|
""" # noqa: E501
|
|
4097
5022
|
|
|
4098
|
-
_param = self.
|
|
5023
|
+
_param = self._ingest_from_azure_blob_storage_serialize(
|
|
4099
5024
|
collection_id=collection_id,
|
|
4100
|
-
|
|
5025
|
+
ingest_from_azure_blob_storage_body=ingest_from_azure_blob_storage_body,
|
|
4101
5026
|
gen_doc_summaries=gen_doc_summaries,
|
|
4102
5027
|
gen_doc_questions=gen_doc_questions,
|
|
4103
5028
|
audio_input_language=audio_input_language,
|
|
4104
5029
|
ocr_model=ocr_model,
|
|
4105
|
-
restricted=restricted,
|
|
4106
|
-
permissions=permissions,
|
|
4107
5030
|
tesseract_lang=tesseract_lang,
|
|
4108
5031
|
keep_tables_as_one_chunk=keep_tables_as_one_chunk,
|
|
4109
5032
|
chunk_by_page=chunk_by_page,
|
|
4110
5033
|
handwriting_check=handwriting_check,
|
|
5034
|
+
ingest_mode=ingest_mode,
|
|
4111
5035
|
timeout=timeout,
|
|
4112
5036
|
_request_auth=_request_auth,
|
|
4113
5037
|
_content_type=_content_type,
|
|
@@ -4126,20 +5050,19 @@ class DocumentIngestionApi:
|
|
|
4126
5050
|
return response_data.response
|
|
4127
5051
|
|
|
4128
5052
|
|
|
4129
|
-
def
|
|
5053
|
+
def _ingest_from_azure_blob_storage_serialize(
|
|
4130
5054
|
self,
|
|
4131
5055
|
collection_id,
|
|
4132
|
-
|
|
5056
|
+
ingest_from_azure_blob_storage_body,
|
|
4133
5057
|
gen_doc_summaries,
|
|
4134
5058
|
gen_doc_questions,
|
|
4135
5059
|
audio_input_language,
|
|
4136
5060
|
ocr_model,
|
|
4137
|
-
restricted,
|
|
4138
|
-
permissions,
|
|
4139
5061
|
tesseract_lang,
|
|
4140
5062
|
keep_tables_as_one_chunk,
|
|
4141
5063
|
chunk_by_page,
|
|
4142
5064
|
handwriting_check,
|
|
5065
|
+
ingest_mode,
|
|
4143
5066
|
timeout,
|
|
4144
5067
|
_request_auth,
|
|
4145
5068
|
_content_type,
|
|
@@ -4150,7 +5073,6 @@ class DocumentIngestionApi:
|
|
|
4150
5073
|
_host = None
|
|
4151
5074
|
|
|
4152
5075
|
_collection_formats: Dict[str, str] = {
|
|
4153
|
-
'permissions': 'multi',
|
|
4154
5076
|
}
|
|
4155
5077
|
|
|
4156
5078
|
_path_params: Dict[str, str] = {}
|
|
@@ -4168,10 +5090,6 @@ class DocumentIngestionApi:
|
|
|
4168
5090
|
|
|
4169
5091
|
_query_params.append(('collection_id', collection_id))
|
|
4170
5092
|
|
|
4171
|
-
if document_id is not None:
|
|
4172
|
-
|
|
4173
|
-
_query_params.append(('document_id', document_id))
|
|
4174
|
-
|
|
4175
5093
|
if gen_doc_summaries is not None:
|
|
4176
5094
|
|
|
4177
5095
|
_query_params.append(('gen_doc_summaries', gen_doc_summaries))
|
|
@@ -4188,14 +5106,6 @@ class DocumentIngestionApi:
|
|
|
4188
5106
|
|
|
4189
5107
|
_query_params.append(('ocr_model', ocr_model))
|
|
4190
5108
|
|
|
4191
|
-
if restricted is not None:
|
|
4192
|
-
|
|
4193
|
-
_query_params.append(('restricted', restricted))
|
|
4194
|
-
|
|
4195
|
-
if permissions is not None:
|
|
4196
|
-
|
|
4197
|
-
_query_params.append(('permissions', permissions))
|
|
4198
|
-
|
|
4199
5109
|
if tesseract_lang is not None:
|
|
4200
5110
|
|
|
4201
5111
|
_query_params.append(('tesseract_lang', tesseract_lang))
|
|
@@ -4212,6 +5122,10 @@ class DocumentIngestionApi:
|
|
|
4212
5122
|
|
|
4213
5123
|
_query_params.append(('handwriting_check', handwriting_check))
|
|
4214
5124
|
|
|
5125
|
+
if ingest_mode is not None:
|
|
5126
|
+
|
|
5127
|
+
_query_params.append(('ingest_mode', ingest_mode))
|
|
5128
|
+
|
|
4215
5129
|
if timeout is not None:
|
|
4216
5130
|
|
|
4217
5131
|
_query_params.append(('timeout', timeout))
|
|
@@ -4219,6 +5133,8 @@ class DocumentIngestionApi:
|
|
|
4219
5133
|
# process the header parameters
|
|
4220
5134
|
# process the form parameters
|
|
4221
5135
|
# process the body parameter
|
|
5136
|
+
if ingest_from_azure_blob_storage_body is not None:
|
|
5137
|
+
_body_params = ingest_from_azure_blob_storage_body
|
|
4222
5138
|
|
|
4223
5139
|
|
|
4224
5140
|
# set the HTTP header `Accept`
|
|
@@ -4229,6 +5145,19 @@ class DocumentIngestionApi:
|
|
|
4229
5145
|
]
|
|
4230
5146
|
)
|
|
4231
5147
|
|
|
5148
|
+
# set the HTTP header `Content-Type`
|
|
5149
|
+
if _content_type:
|
|
5150
|
+
_header_params['Content-Type'] = _content_type
|
|
5151
|
+
else:
|
|
5152
|
+
_default_content_type = (
|
|
5153
|
+
self.api_client.select_header_content_type(
|
|
5154
|
+
[
|
|
5155
|
+
'application/json'
|
|
5156
|
+
]
|
|
5157
|
+
)
|
|
5158
|
+
)
|
|
5159
|
+
if _default_content_type is not None:
|
|
5160
|
+
_header_params['Content-Type'] = _default_content_type
|
|
4232
5161
|
|
|
4233
5162
|
# authentication setting
|
|
4234
5163
|
_auth_settings: List[str] = [
|
|
@@ -4237,7 +5166,7 @@ class DocumentIngestionApi:
|
|
|
4237
5166
|
|
|
4238
5167
|
return self.api_client.param_serialize(
|
|
4239
5168
|
method='POST',
|
|
4240
|
-
resource_path='/ingest/
|
|
5169
|
+
resource_path='/ingest/azure_blob_storage',
|
|
4241
5170
|
path_params=_path_params,
|
|
4242
5171
|
query_params=_query_params,
|
|
4243
5172
|
header_params=_header_params,
|
|
@@ -4254,10 +5183,10 @@ class DocumentIngestionApi:
|
|
|
4254
5183
|
|
|
4255
5184
|
|
|
4256
5185
|
@validate_call
|
|
4257
|
-
def
|
|
5186
|
+
def ingest_from_confluence(
|
|
4258
5187
|
self,
|
|
4259
5188
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
4260
|
-
|
|
5189
|
+
ingest_from_confluence_body: IngestFromConfluenceBody,
|
|
4261
5190
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
4262
5191
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
4263
5192
|
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
@@ -4281,14 +5210,14 @@ class DocumentIngestionApi:
|
|
|
4281
5210
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
4282
5211
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
4283
5212
|
) -> None:
|
|
4284
|
-
"""
|
|
5213
|
+
"""Ingests confluence pages into collection.
|
|
4285
5214
|
|
|
4286
|
-
|
|
5215
|
+
Ingests confluence pages into collection. If an ingested page has sub-pages, the subpages are also ingested.
|
|
4287
5216
|
|
|
4288
5217
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
4289
5218
|
:type collection_id: str
|
|
4290
|
-
:param
|
|
4291
|
-
:type
|
|
5219
|
+
:param ingest_from_confluence_body: (required)
|
|
5220
|
+
:type ingest_from_confluence_body: IngestFromConfluenceBody
|
|
4292
5221
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
4293
5222
|
:type gen_doc_summaries: bool
|
|
4294
5223
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
@@ -4331,9 +5260,9 @@ class DocumentIngestionApi:
|
|
|
4331
5260
|
:return: Returns the result object.
|
|
4332
5261
|
""" # noqa: E501
|
|
4333
5262
|
|
|
4334
|
-
_param = self.
|
|
5263
|
+
_param = self._ingest_from_confluence_serialize(
|
|
4335
5264
|
collection_id=collection_id,
|
|
4336
|
-
|
|
5265
|
+
ingest_from_confluence_body=ingest_from_confluence_body,
|
|
4337
5266
|
gen_doc_summaries=gen_doc_summaries,
|
|
4338
5267
|
gen_doc_questions=gen_doc_questions,
|
|
4339
5268
|
audio_input_language=audio_input_language,
|
|
@@ -4366,10 +5295,10 @@ class DocumentIngestionApi:
|
|
|
4366
5295
|
|
|
4367
5296
|
|
|
4368
5297
|
@validate_call
|
|
4369
|
-
def
|
|
5298
|
+
def ingest_from_confluence_with_http_info(
|
|
4370
5299
|
self,
|
|
4371
5300
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
4372
|
-
|
|
5301
|
+
ingest_from_confluence_body: IngestFromConfluenceBody,
|
|
4373
5302
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
4374
5303
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
4375
5304
|
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
@@ -4393,14 +5322,14 @@ class DocumentIngestionApi:
|
|
|
4393
5322
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
4394
5323
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
4395
5324
|
) -> ApiResponse[None]:
|
|
4396
|
-
"""
|
|
5325
|
+
"""Ingests confluence pages into collection.
|
|
4397
5326
|
|
|
4398
|
-
|
|
5327
|
+
Ingests confluence pages into collection. If an ingested page has sub-pages, the subpages are also ingested.
|
|
4399
5328
|
|
|
4400
5329
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
4401
5330
|
:type collection_id: str
|
|
4402
|
-
:param
|
|
4403
|
-
:type
|
|
5331
|
+
:param ingest_from_confluence_body: (required)
|
|
5332
|
+
:type ingest_from_confluence_body: IngestFromConfluenceBody
|
|
4404
5333
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
4405
5334
|
:type gen_doc_summaries: bool
|
|
4406
5335
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
@@ -4443,9 +5372,9 @@ class DocumentIngestionApi:
|
|
|
4443
5372
|
:return: Returns the result object.
|
|
4444
5373
|
""" # noqa: E501
|
|
4445
5374
|
|
|
4446
|
-
_param = self.
|
|
5375
|
+
_param = self._ingest_from_confluence_serialize(
|
|
4447
5376
|
collection_id=collection_id,
|
|
4448
|
-
|
|
5377
|
+
ingest_from_confluence_body=ingest_from_confluence_body,
|
|
4449
5378
|
gen_doc_summaries=gen_doc_summaries,
|
|
4450
5379
|
gen_doc_questions=gen_doc_questions,
|
|
4451
5380
|
audio_input_language=audio_input_language,
|
|
@@ -4478,10 +5407,10 @@ class DocumentIngestionApi:
|
|
|
4478
5407
|
|
|
4479
5408
|
|
|
4480
5409
|
@validate_call
|
|
4481
|
-
def
|
|
5410
|
+
def ingest_from_confluence_without_preload_content(
|
|
4482
5411
|
self,
|
|
4483
5412
|
collection_id: Annotated[StrictStr, Field(description="String id of the collection to add the ingested documents into.")],
|
|
4484
|
-
|
|
5413
|
+
ingest_from_confluence_body: IngestFromConfluenceBody,
|
|
4485
5414
|
gen_doc_summaries: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate document summaries (uses LLM).")] = None,
|
|
4486
5415
|
gen_doc_questions: Annotated[Optional[StrictBool], Field(description="Whether to auto-generate sample questions for each document (uses LLM).")] = None,
|
|
4487
5416
|
audio_input_language: Annotated[Optional[StrictStr], Field(description="Language of audio files.")] = None,
|
|
@@ -4505,14 +5434,14 @@ class DocumentIngestionApi:
|
|
|
4505
5434
|
_headers: Optional[Dict[StrictStr, Any]] = None,
|
|
4506
5435
|
_host_index: Annotated[StrictInt, Field(ge=0, le=0)] = 0,
|
|
4507
5436
|
) -> RESTResponseType:
|
|
4508
|
-
"""
|
|
5437
|
+
"""Ingests confluence pages into collection.
|
|
4509
5438
|
|
|
4510
|
-
|
|
5439
|
+
Ingests confluence pages into collection. If an ingested page has sub-pages, the subpages are also ingested.
|
|
4511
5440
|
|
|
4512
5441
|
:param collection_id: String id of the collection to add the ingested documents into. (required)
|
|
4513
5442
|
:type collection_id: str
|
|
4514
|
-
:param
|
|
4515
|
-
:type
|
|
5443
|
+
:param ingest_from_confluence_body: (required)
|
|
5444
|
+
:type ingest_from_confluence_body: IngestFromConfluenceBody
|
|
4516
5445
|
:param gen_doc_summaries: Whether to auto-generate document summaries (uses LLM).
|
|
4517
5446
|
:type gen_doc_summaries: bool
|
|
4518
5447
|
:param gen_doc_questions: Whether to auto-generate sample questions for each document (uses LLM).
|
|
@@ -4555,9 +5484,9 @@ class DocumentIngestionApi:
|
|
|
4555
5484
|
:return: Returns the result object.
|
|
4556
5485
|
""" # noqa: E501
|
|
4557
5486
|
|
|
4558
|
-
_param = self.
|
|
5487
|
+
_param = self._ingest_from_confluence_serialize(
|
|
4559
5488
|
collection_id=collection_id,
|
|
4560
|
-
|
|
5489
|
+
ingest_from_confluence_body=ingest_from_confluence_body,
|
|
4561
5490
|
gen_doc_summaries=gen_doc_summaries,
|
|
4562
5491
|
gen_doc_questions=gen_doc_questions,
|
|
4563
5492
|
audio_input_language=audio_input_language,
|
|
@@ -4585,10 +5514,10 @@ class DocumentIngestionApi:
|
|
|
4585
5514
|
return response_data.response
|
|
4586
5515
|
|
|
4587
5516
|
|
|
4588
|
-
def
|
|
5517
|
+
def _ingest_from_confluence_serialize(
|
|
4589
5518
|
self,
|
|
4590
5519
|
collection_id,
|
|
4591
|
-
|
|
5520
|
+
ingest_from_confluence_body,
|
|
4592
5521
|
gen_doc_summaries,
|
|
4593
5522
|
gen_doc_questions,
|
|
4594
5523
|
audio_input_language,
|
|
@@ -4668,8 +5597,8 @@ class DocumentIngestionApi:
|
|
|
4668
5597
|
# process the header parameters
|
|
4669
5598
|
# process the form parameters
|
|
4670
5599
|
# process the body parameter
|
|
4671
|
-
if
|
|
4672
|
-
_body_params =
|
|
5600
|
+
if ingest_from_confluence_body is not None:
|
|
5601
|
+
_body_params = ingest_from_confluence_body
|
|
4673
5602
|
|
|
4674
5603
|
|
|
4675
5604
|
# set the HTTP header `Accept`
|
|
@@ -4701,7 +5630,7 @@ class DocumentIngestionApi:
|
|
|
4701
5630
|
|
|
4702
5631
|
return self.api_client.param_serialize(
|
|
4703
5632
|
method='POST',
|
|
4704
|
-
resource_path='/ingest/
|
|
5633
|
+
resource_path='/ingest/confluence',
|
|
4705
5634
|
path_params=_path_params,
|
|
4706
5635
|
query_params=_query_params,
|
|
4707
5636
|
header_params=_header_params,
|