h2ogpte 1.6.43rc2__py3-none-any.whl → 1.6.43rc5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- h2ogpte/__init__.py +1 -1
- h2ogpte/connectors.py +11 -0
- h2ogpte/h2ogpte.py +86 -0
- h2ogpte/h2ogpte_async.py +87 -0
- h2ogpte/rest_async/__init__.py +3 -1
- h2ogpte/rest_async/api/document_ingestion_api.py +1365 -436
- h2ogpte/rest_async/api_client.py +1 -1
- h2ogpte/rest_async/configuration.py +1 -1
- h2ogpte/rest_async/models/__init__.py +2 -0
- h2ogpte/rest_async/models/confluence_credentials.py +89 -0
- h2ogpte/rest_async/models/ingest_from_confluence_body.py +97 -0
- h2ogpte/rest_sync/__init__.py +3 -1
- h2ogpte/rest_sync/api/document_ingestion_api.py +1365 -436
- h2ogpte/rest_sync/api_client.py +1 -1
- h2ogpte/rest_sync/configuration.py +1 -1
- h2ogpte/rest_sync/models/__init__.py +2 -0
- h2ogpte/rest_sync/models/confluence_credentials.py +89 -0
- h2ogpte/rest_sync/models/ingest_from_confluence_body.py +97 -0
- h2ogpte/session.py +8 -0
- h2ogpte/session_async.py +8 -0
- {h2ogpte-1.6.43rc2.dist-info → h2ogpte-1.6.43rc5.dist-info}/METADATA +1 -1
- {h2ogpte-1.6.43rc2.dist-info → h2ogpte-1.6.43rc5.dist-info}/RECORD +25 -21
- {h2ogpte-1.6.43rc2.dist-info → h2ogpte-1.6.43rc5.dist-info}/WHEEL +0 -0
- {h2ogpte-1.6.43rc2.dist-info → h2ogpte-1.6.43rc5.dist-info}/entry_points.txt +0 -0
- {h2ogpte-1.6.43rc2.dist-info → h2ogpte-1.6.43rc5.dist-info}/top_level.txt +0 -0
h2ogpte/__init__.py
CHANGED
h2ogpte/connectors.py
CHANGED
|
@@ -245,3 +245,14 @@ def create_ingest_job_from_azure_blob_storage(
|
|
|
245
245
|
metadata=metadata,
|
|
246
246
|
)
|
|
247
247
|
return job
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
class ConfluenceCredential:
|
|
251
|
+
def __init__(self, username: str, password: str):
|
|
252
|
+
"""
|
|
253
|
+
Creates an object with Confluence credentials.
|
|
254
|
+
:param username: Name or email of the user.
|
|
255
|
+
:param password: Password or API token.
|
|
256
|
+
"""
|
|
257
|
+
self.username = username
|
|
258
|
+
self.password = password
|
h2ogpte/h2ogpte.py
CHANGED
|
@@ -2472,6 +2472,92 @@ class H2OGPTE(H2OGPTESyncBase):
|
|
|
2472
2472
|
)
|
|
2473
2473
|
return self._wait_for_completion(response.id, timeout=timeout)
|
|
2474
2474
|
|
|
2475
|
+
def ingest_from_confluence(
|
|
2476
|
+
self,
|
|
2477
|
+
collection_id: str,
|
|
2478
|
+
base_url: str,
|
|
2479
|
+
page_id: Union[str, List[str]],
|
|
2480
|
+
credentials: ConfluenceCredential,
|
|
2481
|
+
gen_doc_summaries: Union[bool, None] = None,
|
|
2482
|
+
gen_doc_questions: Union[bool, None] = None,
|
|
2483
|
+
audio_input_language: Union[str, None] = None,
|
|
2484
|
+
ocr_model: Union[str, None] = None,
|
|
2485
|
+
tesseract_lang: Union[str, None] = None,
|
|
2486
|
+
keep_tables_as_one_chunk: Union[bool, None] = None,
|
|
2487
|
+
chunk_by_page: Union[bool, None] = None,
|
|
2488
|
+
handwriting_check: Union[bool, None] = None,
|
|
2489
|
+
metadata: Union[Dict[str, Any], None] = None,
|
|
2490
|
+
timeout: Union[float, None] = None,
|
|
2491
|
+
ingest_mode: Union[str, None] = None,
|
|
2492
|
+
):
|
|
2493
|
+
"""Ingests Confluence pages into a collection.
|
|
2494
|
+
|
|
2495
|
+
Args:
|
|
2496
|
+
collection_id:
|
|
2497
|
+
String id of the collection to add the ingested documents into.
|
|
2498
|
+
base_url:
|
|
2499
|
+
URL of the Confluence instance. Example: https://h2oai.atlassian.net/wiki
|
|
2500
|
+
page_id:
|
|
2501
|
+
The page id or ids of pages to be ingested.
|
|
2502
|
+
credentials:
|
|
2503
|
+
The object with Confluence credentials.
|
|
2504
|
+
gen_doc_summaries:
|
|
2505
|
+
Whether to auto-generate document summaries (uses LLM)
|
|
2506
|
+
gen_doc_questions:
|
|
2507
|
+
Whether to auto-generate sample questions for each document (uses LLM)
|
|
2508
|
+
audio_input_language:
|
|
2509
|
+
Language of audio files. Defaults to "auto" language detection. Pass empty string to see choices.
|
|
2510
|
+
ocr_model:
|
|
2511
|
+
Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models.
|
|
2512
|
+
Pass empty string to see choices.
|
|
2513
|
+
docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages.
|
|
2514
|
+
Mississippi works well on handwriting.
|
|
2515
|
+
"auto" - Automatic will auto-select the best OCR model for every page.
|
|
2516
|
+
"off" - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
|
|
2517
|
+
tesseract_lang:
|
|
2518
|
+
Which language to use when using ocr_model="tesseract". Pass empty string to see choices.
|
|
2519
|
+
keep_tables_as_one_chunk:
|
|
2520
|
+
When tables are identified by the table parser the table tokens will be kept in a single chunk.
|
|
2521
|
+
chunk_by_page:
|
|
2522
|
+
Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is True.
|
|
2523
|
+
handwriting_check:
|
|
2524
|
+
Check pages for handwriting. Will use specialized models if handwriting is found.
|
|
2525
|
+
metadata:
|
|
2526
|
+
Dictionary of metadata to add to the document.
|
|
2527
|
+
timeout:
|
|
2528
|
+
Timeout in seconds.
|
|
2529
|
+
ingest_mode:
|
|
2530
|
+
Ingest mode to use.
|
|
2531
|
+
"standard" - Files will be ingested for use with RAG
|
|
2532
|
+
"lite" - Files will be ingested for use with RAG, but minimal processing will be done, favoring ingest speed over accuracy
|
|
2533
|
+
"agent_only" - Bypasses standard ingestion. Files can only be used with agents.
|
|
2534
|
+
"""
|
|
2535
|
+
header = self._get_auth_header()
|
|
2536
|
+
with self._RESTClient(self) as rest_client:
|
|
2537
|
+
response = _rest_to_client_exceptions(
|
|
2538
|
+
lambda: rest_client.ingestion_api.create_ingest_from_confluence_job(
|
|
2539
|
+
collection_id=collection_id,
|
|
2540
|
+
ingest_from_confluence_body=rest.IngestFromConfluenceBody(
|
|
2541
|
+
base_url=base_url,
|
|
2542
|
+
page_ids=[page_id] if isinstance(page_id, str) else page_id,
|
|
2543
|
+
credentials=rest.ConfluenceCredentials(**credentials.__dict__),
|
|
2544
|
+
metadata=metadata,
|
|
2545
|
+
),
|
|
2546
|
+
gen_doc_summaries=gen_doc_summaries,
|
|
2547
|
+
gen_doc_questions=gen_doc_questions,
|
|
2548
|
+
audio_input_language=audio_input_language,
|
|
2549
|
+
ocr_model=ocr_model,
|
|
2550
|
+
tesseract_lang=tesseract_lang,
|
|
2551
|
+
keep_tables_as_one_chunk=keep_tables_as_one_chunk,
|
|
2552
|
+
chunk_by_page=chunk_by_page,
|
|
2553
|
+
handwriting_check=handwriting_check,
|
|
2554
|
+
ingest_mode=ingest_mode,
|
|
2555
|
+
timeout=timeout,
|
|
2556
|
+
_headers=header,
|
|
2557
|
+
)
|
|
2558
|
+
)
|
|
2559
|
+
return self._wait_for_completion(response.id, timeout=timeout)
|
|
2560
|
+
|
|
2475
2561
|
def list_secret_ids(self, connector_type: Optional[str] = None) -> List[str]:
|
|
2476
2562
|
"""
|
|
2477
2563
|
List available secret IDs from the SecureStore for cloud storage connectors.
|
h2ogpte/h2ogpte_async.py
CHANGED
|
@@ -89,6 +89,7 @@ from h2ogpte.connectors import (
|
|
|
89
89
|
GCSServiceAccountCredential,
|
|
90
90
|
AzureKeyCredential,
|
|
91
91
|
AzureSASCredential,
|
|
92
|
+
ConfluenceCredential,
|
|
92
93
|
)
|
|
93
94
|
|
|
94
95
|
|
|
@@ -2673,6 +2674,92 @@ class H2OGPTEAsync:
|
|
|
2673
2674
|
)
|
|
2674
2675
|
return await self._wait_for_completion(response.id, timeout=timeout)
|
|
2675
2676
|
|
|
2677
|
+
async def ingest_from_confluence(
|
|
2678
|
+
self,
|
|
2679
|
+
collection_id: str,
|
|
2680
|
+
base_url: str,
|
|
2681
|
+
page_id: Union[str, List[str]],
|
|
2682
|
+
credentials: ConfluenceCredential,
|
|
2683
|
+
gen_doc_summaries: Union[bool, None] = None,
|
|
2684
|
+
gen_doc_questions: Union[bool, None] = None,
|
|
2685
|
+
audio_input_language: Union[str, None] = None,
|
|
2686
|
+
ocr_model: Union[str, None] = None,
|
|
2687
|
+
tesseract_lang: Union[str, None] = None,
|
|
2688
|
+
keep_tables_as_one_chunk: Union[bool, None] = None,
|
|
2689
|
+
chunk_by_page: Union[bool, None] = None,
|
|
2690
|
+
handwriting_check: Union[bool, None] = None,
|
|
2691
|
+
metadata: Union[Dict[str, Any], None] = None,
|
|
2692
|
+
timeout: Union[float, None] = None,
|
|
2693
|
+
ingest_mode: Union[str, None] = None,
|
|
2694
|
+
):
|
|
2695
|
+
"""Ingests Confluence pages into a collection.
|
|
2696
|
+
|
|
2697
|
+
Args:
|
|
2698
|
+
collection_id:
|
|
2699
|
+
String id of the collection to add the ingested documents into.
|
|
2700
|
+
base_url:
|
|
2701
|
+
URL of the Confluence instance. Example: https://h2oai.atlassian.net/wiki
|
|
2702
|
+
page_id:
|
|
2703
|
+
The page id or ids of pages to be ingested.
|
|
2704
|
+
credentials:
|
|
2705
|
+
The object with Confluence credentials.
|
|
2706
|
+
gen_doc_summaries:
|
|
2707
|
+
Whether to auto-generate document summaries (uses LLM)
|
|
2708
|
+
gen_doc_questions:
|
|
2709
|
+
Whether to auto-generate sample questions for each document (uses LLM)
|
|
2710
|
+
audio_input_language:
|
|
2711
|
+
Language of audio files. Defaults to "auto" language detection. Pass empty string to see choices.
|
|
2712
|
+
ocr_model:
|
|
2713
|
+
Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models.
|
|
2714
|
+
Pass empty string to see choices.
|
|
2715
|
+
docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages.
|
|
2716
|
+
Mississippi works well on handwriting.
|
|
2717
|
+
"auto" - Automatic will auto-select the best OCR model for every page.
|
|
2718
|
+
"off" - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
|
|
2719
|
+
tesseract_lang:
|
|
2720
|
+
Which language to use when using ocr_model="tesseract". Pass empty string to see choices.
|
|
2721
|
+
keep_tables_as_one_chunk:
|
|
2722
|
+
When tables are identified by the table parser the table tokens will be kept in a single chunk.
|
|
2723
|
+
chunk_by_page:
|
|
2724
|
+
Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is True.
|
|
2725
|
+
handwriting_check:
|
|
2726
|
+
Check pages for handwriting. Will use specialized models if handwriting is found.
|
|
2727
|
+
metadata:
|
|
2728
|
+
Dictionary of metadata to add to the document.
|
|
2729
|
+
timeout:
|
|
2730
|
+
Timeout in seconds.
|
|
2731
|
+
ingest_mode:
|
|
2732
|
+
Ingest mode to use.
|
|
2733
|
+
"standard" - Files will be ingested for use with RAG
|
|
2734
|
+
"lite" - Files will be ingested for use with RAG, but minimal processing will be done, favoring ingest speed over accuracy
|
|
2735
|
+
"agent_only" - Bypasses standard ingestion. Files can only be used with agents.
|
|
2736
|
+
"""
|
|
2737
|
+
header = await self._get_auth_header()
|
|
2738
|
+
async with self._RESTClient(self) as rest_client:
|
|
2739
|
+
response = await _rest_to_client_exceptions(
|
|
2740
|
+
rest_client.ingestion_api.create_ingest_from_confluence_job(
|
|
2741
|
+
collection_id=collection_id,
|
|
2742
|
+
ingest_from_confluence_body=rest.IngestFromConfluenceBody(
|
|
2743
|
+
base_url=base_url,
|
|
2744
|
+
page_ids=[page_id] if isinstance(page_id, str) else page_id,
|
|
2745
|
+
credentials=rest.ConfluenceCredentials(**credentials.__dict__),
|
|
2746
|
+
metadata=metadata,
|
|
2747
|
+
),
|
|
2748
|
+
gen_doc_summaries=gen_doc_summaries,
|
|
2749
|
+
gen_doc_questions=gen_doc_questions,
|
|
2750
|
+
audio_input_language=audio_input_language,
|
|
2751
|
+
ocr_model=ocr_model,
|
|
2752
|
+
tesseract_lang=tesseract_lang,
|
|
2753
|
+
keep_tables_as_one_chunk=keep_tables_as_one_chunk,
|
|
2754
|
+
chunk_by_page=chunk_by_page,
|
|
2755
|
+
handwriting_check=handwriting_check,
|
|
2756
|
+
ingest_mode=ingest_mode,
|
|
2757
|
+
timeout=timeout,
|
|
2758
|
+
_headers=header,
|
|
2759
|
+
)
|
|
2760
|
+
)
|
|
2761
|
+
return await self._wait_for_completion(response.id, timeout=timeout)
|
|
2762
|
+
|
|
2676
2763
|
async def list_secret_ids(self, connector_type: Optional[str] = None) -> List[str]:
|
|
2677
2764
|
"""
|
|
2678
2765
|
List available secret IDs from the SecureStore for cloud storage connectors.
|
h2ogpte/rest_async/__init__.py
CHANGED
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
""" # noqa: E501
|
|
15
15
|
|
|
16
16
|
|
|
17
|
-
__version__ = "1.6.43-dev2"
|
|
17
|
+
__version__ = "1.6.43-dev5"
|
|
18
18
|
|
|
19
19
|
# import apis into sdk package
|
|
20
20
|
from h2ogpte.rest_async.api.api_keys_api import APIKeysApi
|
|
@@ -77,6 +77,7 @@ from h2ogpte.rest_async.models.collection_create_request import CollectionCreate
|
|
|
77
77
|
from h2ogpte.rest_async.models.collection_settings import CollectionSettings
|
|
78
78
|
from h2ogpte.rest_async.models.collection_update_request import CollectionUpdateRequest
|
|
79
79
|
from h2ogpte.rest_async.models.confirm_user_deletion_request import ConfirmUserDeletionRequest
|
|
80
|
+
from h2ogpte.rest_async.models.confluence_credentials import ConfluenceCredentials
|
|
80
81
|
from h2ogpte.rest_async.models.count import Count
|
|
81
82
|
from h2ogpte.rest_async.models.count_with_queue_details import CountWithQueueDetails
|
|
82
83
|
from h2ogpte.rest_async.models.create_agent_key_request import CreateAgentKeyRequest
|
|
@@ -111,6 +112,7 @@ from h2ogpte.rest_async.models.guardrails_settings_create_request import Guardra
|
|
|
111
112
|
from h2ogpte.rest_async.models.h2_ogptgpu_info import H2OGPTGPUInfo
|
|
112
113
|
from h2ogpte.rest_async.models.h2_ogpt_system_info import H2OGPTSystemInfo
|
|
113
114
|
from h2ogpte.rest_async.models.ingest_from_azure_blob_storage_body import IngestFromAzureBlobStorageBody
|
|
115
|
+
from h2ogpte.rest_async.models.ingest_from_confluence_body import IngestFromConfluenceBody
|
|
114
116
|
from h2ogpte.rest_async.models.ingest_from_file_system_body import IngestFromFileSystemBody
|
|
115
117
|
from h2ogpte.rest_async.models.ingest_from_gcs_body import IngestFromGcsBody
|
|
116
118
|
from h2ogpte.rest_async.models.ingest_from_s3_body import IngestFromS3Body
|