PyPI - h2ogpte - Versions diffs - 1.6.43rc2__py3-none-any.whl → 1.6.43rc5__py3-none-any.whl - Mend

h2ogpte 1.6.43rc2__py3-none-any.whl → 1.6.43rc5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

h2ogpte/__init__.py +1 -1
h2ogpte/connectors.py +11 -0
h2ogpte/h2ogpte.py +86 -0
h2ogpte/h2ogpte_async.py +87 -0
h2ogpte/rest_async/__init__.py +3 -1
h2ogpte/rest_async/api/document_ingestion_api.py +1365 -436
h2ogpte/rest_async/api_client.py +1 -1
h2ogpte/rest_async/configuration.py +1 -1
h2ogpte/rest_async/models/__init__.py +2 -0
h2ogpte/rest_async/models/confluence_credentials.py +89 -0
h2ogpte/rest_async/models/ingest_from_confluence_body.py +97 -0
h2ogpte/rest_sync/__init__.py +3 -1
h2ogpte/rest_sync/api/document_ingestion_api.py +1365 -436
h2ogpte/rest_sync/api_client.py +1 -1
h2ogpte/rest_sync/configuration.py +1 -1
h2ogpte/rest_sync/models/__init__.py +2 -0
h2ogpte/rest_sync/models/confluence_credentials.py +89 -0
h2ogpte/rest_sync/models/ingest_from_confluence_body.py +97 -0
h2ogpte/session.py +8 -0
h2ogpte/session_async.py +8 -0
{h2ogpte-1.6.43rc2.dist-info → h2ogpte-1.6.43rc5.dist-info}/METADATA +1 -1
{h2ogpte-1.6.43rc2.dist-info → h2ogpte-1.6.43rc5.dist-info}/RECORD +25 -21
{h2ogpte-1.6.43rc2.dist-info → h2ogpte-1.6.43rc5.dist-info}/WHEEL +0 -0
{h2ogpte-1.6.43rc2.dist-info → h2ogpte-1.6.43rc5.dist-info}/entry_points.txt +0 -0
{h2ogpte-1.6.43rc2.dist-info → h2ogpte-1.6.43rc5.dist-info}/top_level.txt +0 -0

h2ogpte/__init__.py CHANGED Viewed

@@ -3,7 +3,7 @@ from h2ogpte.h2ogpte import H2OGPTE
 from h2ogpte.h2ogpte_async import H2OGPTEAsync
 from h2ogpte.session_async import SessionAsync
-__version__ = "1.6.43rc2"
+__version__ = "1.6.43rc5"
 __all__ = [
     "H2OGPTE",

h2ogpte/connectors.py CHANGED Viewed

@@ -245,3 +245,14 @@ def create_ingest_job_from_azure_blob_storage(
         metadata=metadata,
     )
     return job
+class ConfluenceCredential:
+    def __init__(self, username: str, password: str):
+        """
+        Creates an object with Confluence credentials.
+        :param username: Name or email of the user.
+        :param password: Password or API token.
+        """
+        self.username = username
+        self.password = password

h2ogpte/h2ogpte.py CHANGED Viewed

@@ -2472,6 +2472,92 @@ class H2OGPTE(H2OGPTESyncBase):
             )
         return self._wait_for_completion(response.id, timeout=timeout)
+    def ingest_from_confluence(
+        self,
+        collection_id: str,
+        base_url: str,
+        page_id: Union[str, List[str]],
+        credentials: ConfluenceCredential,
+        gen_doc_summaries: Union[bool, None] = None,
+        gen_doc_questions: Union[bool, None] = None,
+        audio_input_language: Union[str, None] = None,
+        ocr_model: Union[str, None] = None,
+        tesseract_lang: Union[str, None] = None,
+        keep_tables_as_one_chunk: Union[bool, None] = None,
+        chunk_by_page: Union[bool, None] = None,
+        handwriting_check: Union[bool, None] = None,
+        metadata: Union[Dict[str, Any], None] = None,
+        timeout: Union[float, None] = None,
+        ingest_mode: Union[str, None] = None,
+    ):
+        """Ingests confluence pages into collection.
+        Args:
+            collection_id:
+                String id of the collection to add the ingested documents into.
+            base_url:
+                Url of confluence instance. Example: https://h2oai.atlassian.net/wiki
+            page_id:
+                The page id or ids of pages to be ingested.
+            credentials:
+                The object with Confluence credentials.
+            gen_doc_summaries:
+                Whether to auto-generate document summaries (uses LLM)
+            gen_doc_questions:
+                Whether to auto-generate sample questions for each document (uses LLM)
+            audio_input_language:
+                Language of audio files. Defaults to "auto" language detection. Pass empty string to see choices.
+            ocr_model:
+                Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models.
+                Pass empty string to see choices.
+                docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages.
+                Mississippi works well on handwriting.
+                "auto" - Automatic will auto-select the best OCR model for every page.
+                "off" - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
+            tesseract_lang:
+                Which language to use when using ocr_model="tesseract". Pass empty string to see choices.
+            keep_tables_as_one_chunk:
+                When tables are identified by the table parser the table tokens will be kept in a single chunk.
+            chunk_by_page:
+                Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is True.
+            handwriting_check:
+                Check pages for handwriting. Will use specialized models if handwriting is found.
+            metadata:
+                Dictionary of metadata to add to the document.
+            timeout:
+                Timeout in seconds.
+            ingest_mode:
+                Ingest mode to use.
+                "standard" - Files will be ingested for use with RAG
+                "lite" - Files will be ingested for use with RAG, but minimal processing will be done, favoring ingest speed over accuracy
+                "agent_only" - Bypasses standard ingestion. Files can only be used with agents.
+        """
+        header = self._get_auth_header()
+        with self._RESTClient(self) as rest_client:
+            response = _rest_to_client_exceptions(
+                lambda: rest_client.ingestion_api.create_ingest_from_confluence_job(
+                    collection_id=collection_id,
+                    ingest_from_confluence_body=rest.IngestFromConfluenceBody(
+                        base_url=base_url,
+                        page_ids=[page_id] if isinstance(page_id, str) else page_id,
+                        credentials=rest.ConfluenceCredentials(**credentials.__dict__),
+                        metadata=metadata,
+                    ),
+                    gen_doc_summaries=gen_doc_summaries,
+                    gen_doc_questions=gen_doc_questions,
+                    audio_input_language=audio_input_language,
+                    ocr_model=ocr_model,
+                    tesseract_lang=tesseract_lang,
+                    keep_tables_as_one_chunk=keep_tables_as_one_chunk,
+                    chunk_by_page=chunk_by_page,
+                    handwriting_check=handwriting_check,
+                    ingest_mode=ingest_mode,
+                    timeout=timeout,
+                    _headers=header,
+                )
+            )
+        return self._wait_for_completion(response.id, timeout=timeout)
     def list_secret_ids(self, connector_type: Optional[str] = None) -> List[str]:
         """
         List available secret IDs from the SecureStore for cloud storage connectors.

h2ogpte/h2ogpte_async.py CHANGED Viewed

@@ -89,6 +89,7 @@ from h2ogpte.connectors import (
     GCSServiceAccountCredential,
     AzureKeyCredential,
     AzureSASCredential,
+    ConfluenceCredential,
 )
@@ -2673,6 +2674,92 @@ class H2OGPTEAsync:
             )
         return await self._wait_for_completion(response.id, timeout=timeout)
+    async def ingest_from_confluence(
+        self,
+        collection_id: str,
+        base_url: str,
+        page_id: Union[str, List[str]],
+        credentials: ConfluenceCredential,
+        gen_doc_summaries: Union[bool, None] = None,
+        gen_doc_questions: Union[bool, None] = None,
+        audio_input_language: Union[str, None] = None,
+        ocr_model: Union[str, None] = None,
+        tesseract_lang: Union[str, None] = None,
+        keep_tables_as_one_chunk: Union[bool, None] = None,
+        chunk_by_page: Union[bool, None] = None,
+        handwriting_check: Union[bool, None] = None,
+        metadata: Union[Dict[str, Any], None] = None,
+        timeout: Union[float, None] = None,
+        ingest_mode: Union[str, None] = None,
+    ):
+        """Ingests confluence pages into collection.
+        Args:
+            collection_id:
+                String id of the collection to add the ingested documents into.
+            base_url:
+                Url of confluence instance. Example: https://h2oai.atlassian.net/wiki
+            page_id:
+                The page id or ids of pages to be ingested.
+            credentials:
+                The object with Confluence credentials.
+            gen_doc_summaries:
+                Whether to auto-generate document summaries (uses LLM)
+            gen_doc_questions:
+                Whether to auto-generate sample questions for each document (uses LLM)
+            audio_input_language:
+                Language of audio files. Defaults to "auto" language detection. Pass empty string to see choices.
+            ocr_model:
+                Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models.
+                Pass empty string to see choices.
+                docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages.
+                Mississippi works well on handwriting.
+                "auto" - Automatic will auto-select the best OCR model for every page.
+                "off" - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
+            tesseract_lang:
+                Which language to use when using ocr_model="tesseract". Pass empty string to see choices.
+            keep_tables_as_one_chunk:
+                When tables are identified by the table parser the table tokens will be kept in a single chunk.
+            chunk_by_page:
+                Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is True.
+            handwriting_check:
+                Check pages for handwriting. Will use specialized models if handwriting is found.
+            metadata:
+                Dictionary of metadata to add to the document.
+            timeout:
+                Timeout in seconds.
+            ingest_mode:
+                Ingest mode to use.
+                "standard" - Files will be ingested for use with RAG
+                "lite" - Files will be ingested for use with RAG, but minimal processing will be done, favoring ingest speed over accuracy
+                "agent_only" - Bypasses standard ingestion. Files can only be used with agents.
+        """
+        header = await self._get_auth_header()
+        async with self._RESTClient(self) as rest_client:
+            response = await _rest_to_client_exceptions(
+                rest_client.ingestion_api.create_ingest_from_confluence_job(
+                    collection_id=collection_id,
+                    ingest_from_confluence_body=rest.IngestFromConfluenceBody(
+                        base_url=base_url,
+                        page_ids=[page_id] if isinstance(page_id, str) else page_id,
+                        credentials=rest.ConfluenceCredentials(**credentials.__dict__),
+                        metadata=metadata,
+                    ),
+                    gen_doc_summaries=gen_doc_summaries,
+                    gen_doc_questions=gen_doc_questions,
+                    audio_input_language=audio_input_language,
+                    ocr_model=ocr_model,
+                    tesseract_lang=tesseract_lang,
+                    keep_tables_as_one_chunk=keep_tables_as_one_chunk,
+                    chunk_by_page=chunk_by_page,
+                    handwriting_check=handwriting_check,
+                    ingest_mode=ingest_mode,
+                    timeout=timeout,
+                    _headers=header,
+                )
+            )
+        return await self._wait_for_completion(response.id, timeout=timeout)
     async def list_secret_ids(self, connector_type: Optional[str] = None) -> List[str]:
         """
         List available secret IDs from the SecureStore for cloud storage connectors.

h2ogpte/rest_async/__init__.py CHANGED Viewed

@@ -14,7 +14,7 @@
 """  # noqa: E501
-__version__ = "1.6.43-dev2"
+__version__ = "1.6.43-dev5"
 # import apis into sdk package
 from h2ogpte.rest_async.api.api_keys_api import APIKeysApi
@@ -77,6 +77,7 @@ from h2ogpte.rest_async.models.collection_create_request import CollectionCreate
 from h2ogpte.rest_async.models.collection_settings import CollectionSettings
 from h2ogpte.rest_async.models.collection_update_request import CollectionUpdateRequest
 from h2ogpte.rest_async.models.confirm_user_deletion_request import ConfirmUserDeletionRequest
+from h2ogpte.rest_async.models.confluence_credentials import ConfluenceCredentials
 from h2ogpte.rest_async.models.count import Count
 from h2ogpte.rest_async.models.count_with_queue_details import CountWithQueueDetails
 from h2ogpte.rest_async.models.create_agent_key_request import CreateAgentKeyRequest
@@ -111,6 +112,7 @@ from h2ogpte.rest_async.models.guardrails_settings_create_request import Guardra
 from h2ogpte.rest_async.models.h2_ogptgpu_info import H2OGPTGPUInfo
 from h2ogpte.rest_async.models.h2_ogpt_system_info import H2OGPTSystemInfo
 from h2ogpte.rest_async.models.ingest_from_azure_blob_storage_body import IngestFromAzureBlobStorageBody
+from h2ogpte.rest_async.models.ingest_from_confluence_body import IngestFromConfluenceBody
 from h2ogpte.rest_async.models.ingest_from_file_system_body import IngestFromFileSystemBody
 from h2ogpte.rest_async.models.ingest_from_gcs_body import IngestFromGcsBody
 from h2ogpte.rest_async.models.ingest_from_s3_body import IngestFromS3Body

h2ogpte 1.6.43rc2__py3-none-any.whl → 1.6.43rc5__py3-none-any.whl

h2ogpte 1.6.43rc2__py3-none-any.whl → 1.6.43rc5__py3-none-any.whl