h2ogpte 1.6.43rc3__py3-none-any.whl → 1.6.43rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- h2ogpte/__init__.py +1 -1
- h2ogpte/connectors.py +11 -0
- h2ogpte/h2ogpte.py +94 -2
- h2ogpte/h2ogpte_async.py +95 -2
- h2ogpte/rest_async/__init__.py +3 -1
- h2ogpte/rest_async/api/document_ingestion_api.py +1365 -436
- h2ogpte/rest_async/api_client.py +1 -1
- h2ogpte/rest_async/configuration.py +1 -1
- h2ogpte/rest_async/models/__init__.py +2 -0
- h2ogpte/rest_async/models/chat_completion_request.py +1 -1
- h2ogpte/rest_async/models/chat_settings.py +1 -1
- h2ogpte/rest_async/models/confluence_credentials.py +89 -0
- h2ogpte/rest_async/models/extraction_request.py +1 -1
- h2ogpte/rest_async/models/ingest_from_confluence_body.py +97 -0
- h2ogpte/rest_async/models/process_document_job_request.py +1 -1
- h2ogpte/rest_async/models/question_request.py +1 -1
- h2ogpte/rest_async/models/summarize_request.py +1 -1
- h2ogpte/rest_async/models/update_collection_privacy_request.py +6 -4
- h2ogpte/rest_sync/__init__.py +3 -1
- h2ogpte/rest_sync/api/document_ingestion_api.py +1365 -436
- h2ogpte/rest_sync/api_client.py +1 -1
- h2ogpte/rest_sync/configuration.py +1 -1
- h2ogpte/rest_sync/models/__init__.py +2 -0
- h2ogpte/rest_sync/models/chat_completion_request.py +1 -1
- h2ogpte/rest_sync/models/chat_settings.py +1 -1
- h2ogpte/rest_sync/models/confluence_credentials.py +89 -0
- h2ogpte/rest_sync/models/extraction_request.py +1 -1
- h2ogpte/rest_sync/models/ingest_from_confluence_body.py +97 -0
- h2ogpte/rest_sync/models/process_document_job_request.py +1 -1
- h2ogpte/rest_sync/models/question_request.py +1 -1
- h2ogpte/rest_sync/models/summarize_request.py +1 -1
- h2ogpte/rest_sync/models/update_collection_privacy_request.py +6 -4
- h2ogpte/session.py +10 -0
- h2ogpte/session_async.py +10 -0
- h2ogpte/types.py +3 -1
- {h2ogpte-1.6.43rc3.dist-info → h2ogpte-1.6.43rc6.dist-info}/METADATA +1 -1
- {h2ogpte-1.6.43rc3.dist-info → h2ogpte-1.6.43rc6.dist-info}/RECORD +40 -36
- {h2ogpte-1.6.43rc3.dist-info → h2ogpte-1.6.43rc6.dist-info}/WHEEL +0 -0
- {h2ogpte-1.6.43rc3.dist-info → h2ogpte-1.6.43rc6.dist-info}/entry_points.txt +0 -0
- {h2ogpte-1.6.43rc3.dist-info → h2ogpte-1.6.43rc6.dist-info}/top_level.txt +0 -0
h2ogpte/__init__.py
CHANGED
h2ogpte/connectors.py
CHANGED
|
@@ -245,3 +245,14 @@ def create_ingest_job_from_azure_blob_storage(
|
|
|
245
245
|
metadata=metadata,
|
|
246
246
|
)
|
|
247
247
|
return job
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
class ConfluenceCredential:
|
|
251
|
+
def __init__(self, username: str, password: str):
|
|
252
|
+
"""
|
|
253
|
+
Creates an object with Confluence credentials.
|
|
254
|
+
:param username: Name or email of the user.
|
|
255
|
+
:param password: Password or API token.
|
|
256
|
+
"""
|
|
257
|
+
self.username = username
|
|
258
|
+
self.password = password
|
h2ogpte/h2ogpte.py
CHANGED
|
@@ -146,6 +146,8 @@ class H2OGPTE(H2OGPTESyncBase):
|
|
|
146
146
|
agent_planning_forced_mode (bool) — Whether to force planning mode for agent (True to always plan first)
|
|
147
147
|
agent_too_soon_forced_mode (bool) — Whether to force handling of premature agent decisions
|
|
148
148
|
agent_critique_forced_mode (int) — Whether to force critique mode for agent self-evaluation
|
|
149
|
+
agent_query_understanding_parallel_calls (int) — Number of parallel calls for query understanding
|
|
150
|
+
tool_building_mode (str) — Mode for tool building configuration
|
|
149
151
|
agent_stream_files (bool, default: True) — Whether to stream files from agent operations for real-time updates
|
|
150
152
|
|
|
151
153
|
# Other parameters
|
|
@@ -2472,6 +2474,92 @@ class H2OGPTE(H2OGPTESyncBase):
|
|
|
2472
2474
|
)
|
|
2473
2475
|
return self._wait_for_completion(response.id, timeout=timeout)
|
|
2474
2476
|
|
|
2477
|
+
def ingest_from_confluence(
|
|
2478
|
+
self,
|
|
2479
|
+
collection_id: str,
|
|
2480
|
+
base_url: str,
|
|
2481
|
+
page_id: Union[str, List[str]],
|
|
2482
|
+
credentials: ConfluenceCredential,
|
|
2483
|
+
gen_doc_summaries: Union[bool, None] = None,
|
|
2484
|
+
gen_doc_questions: Union[bool, None] = None,
|
|
2485
|
+
audio_input_language: Union[str, None] = None,
|
|
2486
|
+
ocr_model: Union[str, None] = None,
|
|
2487
|
+
tesseract_lang: Union[str, None] = None,
|
|
2488
|
+
keep_tables_as_one_chunk: Union[bool, None] = None,
|
|
2489
|
+
chunk_by_page: Union[bool, None] = None,
|
|
2490
|
+
handwriting_check: Union[bool, None] = None,
|
|
2491
|
+
metadata: Union[Dict[str, Any], None] = None,
|
|
2492
|
+
timeout: Union[float, None] = None,
|
|
2493
|
+
ingest_mode: Union[str, None] = None,
|
|
2494
|
+
):
|
|
2495
|
+
"""Ingests confluence pages into collection.
|
|
2496
|
+
|
|
2497
|
+
Args:
|
|
2498
|
+
collection_id:
|
|
2499
|
+
String id of the collection to add the ingested documents into.
|
|
2500
|
+
base_url:
|
|
2501
|
+
Url of confluence instance. Example: https://h2oai.atlassian.net/wiki
|
|
2502
|
+
page_id:
|
|
2503
|
+
The page id or ids of pages to be ingested.
|
|
2504
|
+
credentials:
|
|
2505
|
+
The object with Confluence credentials.
|
|
2506
|
+
gen_doc_summaries:
|
|
2507
|
+
Whether to auto-generate document summaries (uses LLM)
|
|
2508
|
+
gen_doc_questions:
|
|
2509
|
+
Whether to auto-generate sample questions for each document (uses LLM)
|
|
2510
|
+
audio_input_language:
|
|
2511
|
+
Language of audio files. Defaults to "auto" language detection. Pass empty string to see choices.
|
|
2512
|
+
ocr_model:
|
|
2513
|
+
Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models.
|
|
2514
|
+
Pass empty string to see choices.
|
|
2515
|
+
docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages.
|
|
2516
|
+
Mississippi works well on handwriting.
|
|
2517
|
+
"auto" - Automatic will auto-select the best OCR model for every page.
|
|
2518
|
+
"off" - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
|
|
2519
|
+
tesseract_lang:
|
|
2520
|
+
Which language to use when using ocr_model="tesseract". Pass empty string to see choices.
|
|
2521
|
+
keep_tables_as_one_chunk:
|
|
2522
|
+
When tables are identified by the table parser the table tokens will be kept in a single chunk.
|
|
2523
|
+
chunk_by_page:
|
|
2524
|
+
Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is True.
|
|
2525
|
+
handwriting_check:
|
|
2526
|
+
Check pages for handwriting. Will use specialized models if handwriting is found.
|
|
2527
|
+
metadata:
|
|
2528
|
+
Dictionary of metadata to add to the document.
|
|
2529
|
+
timeout:
|
|
2530
|
+
Timeout in seconds.
|
|
2531
|
+
ingest_mode:
|
|
2532
|
+
Ingest mode to use.
|
|
2533
|
+
"standard" - Files will be ingested for use with RAG
|
|
2534
|
+
"lite" - Files will be ingested for use with RAG, but minimal processing will be done, favoring ingest speed over accuracy
|
|
2535
|
+
"agent_only" - Bypasses standard ingestion. Files can only be used with agents.
|
|
2536
|
+
"""
|
|
2537
|
+
header = self._get_auth_header()
|
|
2538
|
+
with self._RESTClient(self) as rest_client:
|
|
2539
|
+
response = _rest_to_client_exceptions(
|
|
2540
|
+
lambda: rest_client.ingestion_api.create_ingest_from_confluence_job(
|
|
2541
|
+
collection_id=collection_id,
|
|
2542
|
+
ingest_from_confluence_body=rest.IngestFromConfluenceBody(
|
|
2543
|
+
base_url=base_url,
|
|
2544
|
+
page_ids=[page_id] if isinstance(page_id, str) else page_id,
|
|
2545
|
+
credentials=rest.ConfluenceCredentials(**credentials.__dict__),
|
|
2546
|
+
metadata=metadata,
|
|
2547
|
+
),
|
|
2548
|
+
gen_doc_summaries=gen_doc_summaries,
|
|
2549
|
+
gen_doc_questions=gen_doc_questions,
|
|
2550
|
+
audio_input_language=audio_input_language,
|
|
2551
|
+
ocr_model=ocr_model,
|
|
2552
|
+
tesseract_lang=tesseract_lang,
|
|
2553
|
+
keep_tables_as_one_chunk=keep_tables_as_one_chunk,
|
|
2554
|
+
chunk_by_page=chunk_by_page,
|
|
2555
|
+
handwriting_check=handwriting_check,
|
|
2556
|
+
ingest_mode=ingest_mode,
|
|
2557
|
+
timeout=timeout,
|
|
2558
|
+
_headers=header,
|
|
2559
|
+
)
|
|
2560
|
+
)
|
|
2561
|
+
return self._wait_for_completion(response.id, timeout=timeout)
|
|
2562
|
+
|
|
2475
2563
|
def list_secret_ids(self, connector_type: Optional[str] = None) -> List[str]:
|
|
2476
2564
|
"""
|
|
2477
2565
|
List available secret IDs from the SecureStore for cloud storage connectors.
|
|
@@ -3681,7 +3769,9 @@ class H2OGPTE(H2OGPTESyncBase):
|
|
|
3681
3769
|
)
|
|
3682
3770
|
return result
|
|
3683
3771
|
|
|
3684
|
-
def make_collection_public(self, collection_id: str):
|
|
3772
|
+
def make_collection_public(
|
|
3773
|
+
self, collection_id: str, permissions: Optional[List[str]] = None
|
|
3774
|
+
):
|
|
3685
3775
|
"""Make a collection public
|
|
3686
3776
|
|
|
3687
3777
|
Once a collection is public, it will be accessible to all
|
|
@@ -3690,6 +3780,8 @@ class H2OGPTE(H2OGPTESyncBase):
|
|
|
3690
3780
|
Args:
|
|
3691
3781
|
collection_id:
|
|
3692
3782
|
ID of the collection to make public.
|
|
3783
|
+
permissions:
|
|
3784
|
+
Optional: Collection specific permissions. If not provided, all permissions will default to true.
|
|
3693
3785
|
"""
|
|
3694
3786
|
header = self._get_auth_header()
|
|
3695
3787
|
with self._RESTClient(self) as rest_client:
|
|
@@ -3697,7 +3789,7 @@ class H2OGPTE(H2OGPTESyncBase):
|
|
|
3697
3789
|
lambda: rest_client.collection_api.update_collection_privacy(
|
|
3698
3790
|
collection_id=collection_id,
|
|
3699
3791
|
update_collection_privacy_request=rest.UpdateCollectionPrivacyRequest(
|
|
3700
|
-
is_public=True
|
|
3792
|
+
is_public=True, permissions=permissions
|
|
3701
3793
|
),
|
|
3702
3794
|
_headers=header,
|
|
3703
3795
|
)
|
h2ogpte/h2ogpte_async.py
CHANGED
|
@@ -89,6 +89,7 @@ from h2ogpte.connectors import (
|
|
|
89
89
|
GCSServiceAccountCredential,
|
|
90
90
|
AzureKeyCredential,
|
|
91
91
|
AzureSASCredential,
|
|
92
|
+
ConfluenceCredential,
|
|
92
93
|
)
|
|
93
94
|
|
|
94
95
|
|
|
@@ -351,6 +352,8 @@ class H2OGPTEAsync:
|
|
|
351
352
|
agent_planning_forced_mode (bool) — Whether to force planning mode for agent (True to always plan first)
|
|
352
353
|
agent_too_soon_forced_mode (bool) — Whether to force handling of premature agent decisions
|
|
353
354
|
agent_critique_forced_mode (int) — Whether to force critique mode for agent self-evaluation
|
|
355
|
+
agent_query_understanding_parallel_calls (int) — Number of parallel calls for query understanding
|
|
356
|
+
tool_building_mode (str) — Mode for tool building configuration
|
|
354
357
|
agent_stream_files (bool, default: True) — Whether to stream files from agent operations for real-time updates
|
|
355
358
|
|
|
356
359
|
# Other parameters
|
|
@@ -2673,6 +2676,92 @@ class H2OGPTEAsync:
|
|
|
2673
2676
|
)
|
|
2674
2677
|
return await self._wait_for_completion(response.id, timeout=timeout)
|
|
2675
2678
|
|
|
2679
|
+
async def ingest_from_confluence(
|
|
2680
|
+
self,
|
|
2681
|
+
collection_id: str,
|
|
2682
|
+
base_url: str,
|
|
2683
|
+
page_id: Union[str, List[str]],
|
|
2684
|
+
credentials: ConfluenceCredential,
|
|
2685
|
+
gen_doc_summaries: Union[bool, None] = None,
|
|
2686
|
+
gen_doc_questions: Union[bool, None] = None,
|
|
2687
|
+
audio_input_language: Union[str, None] = None,
|
|
2688
|
+
ocr_model: Union[str, None] = None,
|
|
2689
|
+
tesseract_lang: Union[str, None] = None,
|
|
2690
|
+
keep_tables_as_one_chunk: Union[bool, None] = None,
|
|
2691
|
+
chunk_by_page: Union[bool, None] = None,
|
|
2692
|
+
handwriting_check: Union[bool, None] = None,
|
|
2693
|
+
metadata: Union[Dict[str, Any], None] = None,
|
|
2694
|
+
timeout: Union[float, None] = None,
|
|
2695
|
+
ingest_mode: Union[str, None] = None,
|
|
2696
|
+
):
|
|
2697
|
+
"""Ingests confluence pages into collection.
|
|
2698
|
+
|
|
2699
|
+
Args:
|
|
2700
|
+
collection_id:
|
|
2701
|
+
String id of the collection to add the ingested documents into.
|
|
2702
|
+
base_url:
|
|
2703
|
+
Url of confluence instance. Example: https://h2oai.atlassian.net/wiki
|
|
2704
|
+
page_id:
|
|
2705
|
+
The page id or ids of pages to be ingested.
|
|
2706
|
+
credentials:
|
|
2707
|
+
The object with Confluence credentials.
|
|
2708
|
+
gen_doc_summaries:
|
|
2709
|
+
Whether to auto-generate document summaries (uses LLM)
|
|
2710
|
+
gen_doc_questions:
|
|
2711
|
+
Whether to auto-generate sample questions for each document (uses LLM)
|
|
2712
|
+
audio_input_language:
|
|
2713
|
+
Language of audio files. Defaults to "auto" language detection. Pass empty string to see choices.
|
|
2714
|
+
ocr_model:
|
|
2715
|
+
Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models.
|
|
2716
|
+
Pass empty string to see choices.
|
|
2717
|
+
docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages.
|
|
2718
|
+
Mississippi works well on handwriting.
|
|
2719
|
+
"auto" - Automatic will auto-select the best OCR model for every page.
|
|
2720
|
+
"off" - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
|
|
2721
|
+
tesseract_lang:
|
|
2722
|
+
Which language to use when using ocr_model="tesseract". Pass empty string to see choices.
|
|
2723
|
+
keep_tables_as_one_chunk:
|
|
2724
|
+
When tables are identified by the table parser the table tokens will be kept in a single chunk.
|
|
2725
|
+
chunk_by_page:
|
|
2726
|
+
Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is True.
|
|
2727
|
+
handwriting_check:
|
|
2728
|
+
Check pages for handwriting. Will use specialized models if handwriting is found.
|
|
2729
|
+
metadata:
|
|
2730
|
+
Dictionary of metadata to add to the document.
|
|
2731
|
+
timeout:
|
|
2732
|
+
Timeout in seconds.
|
|
2733
|
+
ingest_mode:
|
|
2734
|
+
Ingest mode to use.
|
|
2735
|
+
"standard" - Files will be ingested for use with RAG
|
|
2736
|
+
"lite" - Files will be ingested for use with RAG, but minimal processing will be done, favoring ingest speed over accuracy
|
|
2737
|
+
"agent_only" - Bypasses standard ingestion. Files can only be used with agents.
|
|
2738
|
+
"""
|
|
2739
|
+
header = await self._get_auth_header()
|
|
2740
|
+
async with self._RESTClient(self) as rest_client:
|
|
2741
|
+
response = await _rest_to_client_exceptions(
|
|
2742
|
+
rest_client.ingestion_api.create_ingest_from_confluence_job(
|
|
2743
|
+
collection_id=collection_id,
|
|
2744
|
+
ingest_from_confluence_body=rest.IngestFromConfluenceBody(
|
|
2745
|
+
base_url=base_url,
|
|
2746
|
+
page_ids=[page_id] if isinstance(page_id, str) else page_id,
|
|
2747
|
+
credentials=rest.ConfluenceCredentials(**credentials.__dict__),
|
|
2748
|
+
metadata=metadata,
|
|
2749
|
+
),
|
|
2750
|
+
gen_doc_summaries=gen_doc_summaries,
|
|
2751
|
+
gen_doc_questions=gen_doc_questions,
|
|
2752
|
+
audio_input_language=audio_input_language,
|
|
2753
|
+
ocr_model=ocr_model,
|
|
2754
|
+
tesseract_lang=tesseract_lang,
|
|
2755
|
+
keep_tables_as_one_chunk=keep_tables_as_one_chunk,
|
|
2756
|
+
chunk_by_page=chunk_by_page,
|
|
2757
|
+
handwriting_check=handwriting_check,
|
|
2758
|
+
ingest_mode=ingest_mode,
|
|
2759
|
+
timeout=timeout,
|
|
2760
|
+
_headers=header,
|
|
2761
|
+
)
|
|
2762
|
+
)
|
|
2763
|
+
return await self._wait_for_completion(response.id, timeout=timeout)
|
|
2764
|
+
|
|
2676
2765
|
async def list_secret_ids(self, connector_type: Optional[str] = None) -> List[str]:
|
|
2677
2766
|
"""
|
|
2678
2767
|
List available secret IDs from the SecureStore for cloud storage connectors.
|
|
@@ -3884,7 +3973,9 @@ class H2OGPTEAsync:
|
|
|
3884
3973
|
)
|
|
3885
3974
|
return result
|
|
3886
3975
|
|
|
3887
|
-
async def make_collection_public(self, collection_id: str):
|
|
3976
|
+
async def make_collection_public(
|
|
3977
|
+
self, collection_id: str, permissions: Optional[List[str]] = None
|
|
3978
|
+
):
|
|
3888
3979
|
"""Make a collection public
|
|
3889
3980
|
|
|
3890
3981
|
Once a collection is public, it will be accessible to all
|
|
@@ -3893,6 +3984,8 @@ class H2OGPTEAsync:
|
|
|
3893
3984
|
Args:
|
|
3894
3985
|
collection_id:
|
|
3895
3986
|
ID of the collection to make public.
|
|
3987
|
+
permissions:
|
|
3988
|
+
Optional: Collection specific permissions. If not provided, all permissions will default to true.
|
|
3896
3989
|
"""
|
|
3897
3990
|
header = await self._get_auth_header()
|
|
3898
3991
|
async with self._RESTClient(self) as rest_client:
|
|
@@ -3900,7 +3993,7 @@ class H2OGPTEAsync:
|
|
|
3900
3993
|
rest_client.collection_api.update_collection_privacy(
|
|
3901
3994
|
collection_id=collection_id,
|
|
3902
3995
|
update_collection_privacy_request=rest.UpdateCollectionPrivacyRequest(
|
|
3903
|
-
is_public=True
|
|
3996
|
+
is_public=True, permissions=permissions
|
|
3904
3997
|
),
|
|
3905
3998
|
_headers=header,
|
|
3906
3999
|
)
|
h2ogpte/rest_async/__init__.py
CHANGED
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
""" # noqa: E501
|
|
15
15
|
|
|
16
16
|
|
|
17
|
-
__version__ = "1.6.43-dev3"
|
|
17
|
+
__version__ = "1.6.43-dev6"
|
|
18
18
|
|
|
19
19
|
# import apis into sdk package
|
|
20
20
|
from h2ogpte.rest_async.api.api_keys_api import APIKeysApi
|
|
@@ -77,6 +77,7 @@ from h2ogpte.rest_async.models.collection_create_request import CollectionCreate
|
|
|
77
77
|
from h2ogpte.rest_async.models.collection_settings import CollectionSettings
|
|
78
78
|
from h2ogpte.rest_async.models.collection_update_request import CollectionUpdateRequest
|
|
79
79
|
from h2ogpte.rest_async.models.confirm_user_deletion_request import ConfirmUserDeletionRequest
|
|
80
|
+
from h2ogpte.rest_async.models.confluence_credentials import ConfluenceCredentials
|
|
80
81
|
from h2ogpte.rest_async.models.count import Count
|
|
81
82
|
from h2ogpte.rest_async.models.count_with_queue_details import CountWithQueueDetails
|
|
82
83
|
from h2ogpte.rest_async.models.create_agent_key_request import CreateAgentKeyRequest
|
|
@@ -111,6 +112,7 @@ from h2ogpte.rest_async.models.guardrails_settings_create_request import Guardra
|
|
|
111
112
|
from h2ogpte.rest_async.models.h2_ogptgpu_info import H2OGPTGPUInfo
|
|
112
113
|
from h2ogpte.rest_async.models.h2_ogpt_system_info import H2OGPTSystemInfo
|
|
113
114
|
from h2ogpte.rest_async.models.ingest_from_azure_blob_storage_body import IngestFromAzureBlobStorageBody
|
|
115
|
+
from h2ogpte.rest_async.models.ingest_from_confluence_body import IngestFromConfluenceBody
|
|
114
116
|
from h2ogpte.rest_async.models.ingest_from_file_system_body import IngestFromFileSystemBody
|
|
115
117
|
from h2ogpte.rest_async.models.ingest_from_gcs_body import IngestFromGcsBody
|
|
116
118
|
from h2ogpte.rest_async.models.ingest_from_s3_body import IngestFromS3Body
|