h2ogpte 1.6.43rc2__py3-none-any.whl → 1.6.43rc5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
h2ogpte/__init__.py CHANGED
@@ -3,7 +3,7 @@ from h2ogpte.h2ogpte import H2OGPTE
3
3
  from h2ogpte.h2ogpte_async import H2OGPTEAsync
4
4
  from h2ogpte.session_async import SessionAsync
5
5
 
6
- __version__ = "1.6.43rc2"
6
+ __version__ = "1.6.43rc5"
7
7
 
8
8
  __all__ = [
9
9
  "H2OGPTE",
h2ogpte/connectors.py CHANGED
@@ -245,3 +245,14 @@ def create_ingest_job_from_azure_blob_storage(
245
245
  metadata=metadata,
246
246
  )
247
247
  return job
248
+
249
+
250
+ class ConfluenceCredential:
251
+ def __init__(self, username: str, password: str):
252
+ """
253
+ Creates an object with Confluence credentials.
254
+ :param username: Name or email of the user.
255
+ :param password: Password or API token.
256
+ """
257
+ self.username = username
258
+ self.password = password
h2ogpte/h2ogpte.py CHANGED
@@ -2472,6 +2472,92 @@ class H2OGPTE(H2OGPTESyncBase):
2472
2472
  )
2473
2473
  return self._wait_for_completion(response.id, timeout=timeout)
2474
2474
 
2475
+ def ingest_from_confluence(
2476
+ self,
2477
+ collection_id: str,
2478
+ base_url: str,
2479
+ page_id: Union[str, List[str]],
2480
+ credentials: ConfluenceCredential,
2481
+ gen_doc_summaries: Union[bool, None] = None,
2482
+ gen_doc_questions: Union[bool, None] = None,
2483
+ audio_input_language: Union[str, None] = None,
2484
+ ocr_model: Union[str, None] = None,
2485
+ tesseract_lang: Union[str, None] = None,
2486
+ keep_tables_as_one_chunk: Union[bool, None] = None,
2487
+ chunk_by_page: Union[bool, None] = None,
2488
+ handwriting_check: Union[bool, None] = None,
2489
+ metadata: Union[Dict[str, Any], None] = None,
2490
+ timeout: Union[float, None] = None,
2491
+ ingest_mode: Union[str, None] = None,
2492
+ ):
2493
+ """Ingests Confluence pages into a collection.
2494
+
2495
+ Args:
2496
+ collection_id:
2497
+ String id of the collection to add the ingested documents into.
2498
+ base_url:
2499
+ URL of the Confluence instance. Example: https://h2oai.atlassian.net/wiki
2500
+ page_id:
2501
+ The ID of a single page, or a list of page IDs, to be ingested.
2502
+ credentials:
2503
+ The object with Confluence credentials.
2504
+ gen_doc_summaries:
2505
+ Whether to auto-generate document summaries (uses LLM)
2506
+ gen_doc_questions:
2507
+ Whether to auto-generate sample questions for each document (uses LLM)
2508
+ audio_input_language:
2509
+ Language of audio files. Defaults to "auto" language detection. Pass empty string to see choices.
2510
+ ocr_model:
2511
+ Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models.
2512
+ Pass empty string to see choices.
2513
+ docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages.
2514
+ Mississippi works well on handwriting.
2515
+ "auto" - Automatic will auto-select the best OCR model for every page.
2516
+ "off" - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
2517
+ tesseract_lang:
2518
+ Which language to use when using ocr_model="tesseract". Pass empty string to see choices.
2519
+ keep_tables_as_one_chunk:
2520
+ When tables are identified by the table parser, the table tokens will be kept in a single chunk.
2521
+ chunk_by_page:
2522
+ Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is True.
2523
+ handwriting_check:
2524
+ Check pages for handwriting. Will use specialized models if handwriting is found.
2525
+ metadata:
2526
+ Dictionary of metadata to add to the document.
2527
+ timeout:
2528
+ Timeout in seconds.
2529
+ ingest_mode:
2530
+ Ingest mode to use.
2531
+ "standard" - Files will be ingested for use with RAG
2532
+ "lite" - Files will be ingested for use with RAG, but minimal processing will be done, favoring ingest speed over accuracy
2533
+ "agent_only" - Bypasses standard ingestion. Files can only be used with agents.
2534
+ """
2535
+ header = self._get_auth_header()
2536
+ with self._RESTClient(self) as rest_client:
2537
+ response = _rest_to_client_exceptions(
2538
+ lambda: rest_client.ingestion_api.create_ingest_from_confluence_job(
2539
+ collection_id=collection_id,
2540
+ ingest_from_confluence_body=rest.IngestFromConfluenceBody(
2541
+ base_url=base_url,
2542
+ page_ids=[page_id] if isinstance(page_id, str) else page_id,
2543
+ credentials=rest.ConfluenceCredentials(**credentials.__dict__),
2544
+ metadata=metadata,
2545
+ ),
2546
+ gen_doc_summaries=gen_doc_summaries,
2547
+ gen_doc_questions=gen_doc_questions,
2548
+ audio_input_language=audio_input_language,
2549
+ ocr_model=ocr_model,
2550
+ tesseract_lang=tesseract_lang,
2551
+ keep_tables_as_one_chunk=keep_tables_as_one_chunk,
2552
+ chunk_by_page=chunk_by_page,
2553
+ handwriting_check=handwriting_check,
2554
+ ingest_mode=ingest_mode,
2555
+ timeout=timeout,
2556
+ _headers=header,
2557
+ )
2558
+ )
2559
+ return self._wait_for_completion(response.id, timeout=timeout)
2560
+
2475
2561
  def list_secret_ids(self, connector_type: Optional[str] = None) -> List[str]:
2476
2562
  """
2477
2563
  List available secret IDs from the SecureStore for cloud storage connectors.
h2ogpte/h2ogpte_async.py CHANGED
@@ -89,6 +89,7 @@ from h2ogpte.connectors import (
89
89
  GCSServiceAccountCredential,
90
90
  AzureKeyCredential,
91
91
  AzureSASCredential,
92
+ ConfluenceCredential,
92
93
  )
93
94
 
94
95
 
@@ -2673,6 +2674,92 @@ class H2OGPTEAsync:
2673
2674
  )
2674
2675
  return await self._wait_for_completion(response.id, timeout=timeout)
2675
2676
 
2677
+ async def ingest_from_confluence(
2678
+ self,
2679
+ collection_id: str,
2680
+ base_url: str,
2681
+ page_id: Union[str, List[str]],
2682
+ credentials: ConfluenceCredential,
2683
+ gen_doc_summaries: Union[bool, None] = None,
2684
+ gen_doc_questions: Union[bool, None] = None,
2685
+ audio_input_language: Union[str, None] = None,
2686
+ ocr_model: Union[str, None] = None,
2687
+ tesseract_lang: Union[str, None] = None,
2688
+ keep_tables_as_one_chunk: Union[bool, None] = None,
2689
+ chunk_by_page: Union[bool, None] = None,
2690
+ handwriting_check: Union[bool, None] = None,
2691
+ metadata: Union[Dict[str, Any], None] = None,
2692
+ timeout: Union[float, None] = None,
2693
+ ingest_mode: Union[str, None] = None,
2694
+ ):
2695
+ """Ingests Confluence pages into a collection.
2696
+
2697
+ Args:
2698
+ collection_id:
2699
+ String id of the collection to add the ingested documents into.
2700
+ base_url:
2701
+ URL of the Confluence instance. Example: https://h2oai.atlassian.net/wiki
2702
+ page_id:
2703
+ The ID of a single page, or a list of page IDs, to be ingested.
2704
+ credentials:
2705
+ The object with Confluence credentials.
2706
+ gen_doc_summaries:
2707
+ Whether to auto-generate document summaries (uses LLM)
2708
+ gen_doc_questions:
2709
+ Whether to auto-generate sample questions for each document (uses LLM)
2710
+ audio_input_language:
2711
+ Language of audio files. Defaults to "auto" language detection. Pass empty string to see choices.
2712
+ ocr_model:
2713
+ Which method to use to extract text from images using AI-enabled optical character recognition (OCR) models.
2714
+ Pass empty string to see choices.
2715
+ docTR is best for Latin text, PaddleOCR is best for certain non-Latin languages, Tesseract covers a wide range of languages.
2716
+ Mississippi works well on handwriting.
2717
+ "auto" - Automatic will auto-select the best OCR model for every page.
2718
+ "off" - Disable OCR for speed, but all images will then be skipped (also no image captions will be made).
2719
+ tesseract_lang:
2720
+ Which language to use when using ocr_model="tesseract". Pass empty string to see choices.
2721
+ keep_tables_as_one_chunk:
2722
+ When tables are identified by the table parser, the table tokens will be kept in a single chunk.
2723
+ chunk_by_page:
2724
+ Each page will be a chunk. `keep_tables_as_one_chunk` will be ignored if this is True.
2725
+ handwriting_check:
2726
+ Check pages for handwriting. Will use specialized models if handwriting is found.
2727
+ metadata:
2728
+ Dictionary of metadata to add to the document.
2729
+ timeout:
2730
+ Timeout in seconds.
2731
+ ingest_mode:
2732
+ Ingest mode to use.
2733
+ "standard" - Files will be ingested for use with RAG
2734
+ "lite" - Files will be ingested for use with RAG, but minimal processing will be done, favoring ingest speed over accuracy
2735
+ "agent_only" - Bypasses standard ingestion. Files can only be used with agents.
2736
+ """
2737
+ header = await self._get_auth_header()
2738
+ async with self._RESTClient(self) as rest_client:
2739
+ response = await _rest_to_client_exceptions(
2740
+ rest_client.ingestion_api.create_ingest_from_confluence_job(
2741
+ collection_id=collection_id,
2742
+ ingest_from_confluence_body=rest.IngestFromConfluenceBody(
2743
+ base_url=base_url,
2744
+ page_ids=[page_id] if isinstance(page_id, str) else page_id,
2745
+ credentials=rest.ConfluenceCredentials(**credentials.__dict__),
2746
+ metadata=metadata,
2747
+ ),
2748
+ gen_doc_summaries=gen_doc_summaries,
2749
+ gen_doc_questions=gen_doc_questions,
2750
+ audio_input_language=audio_input_language,
2751
+ ocr_model=ocr_model,
2752
+ tesseract_lang=tesseract_lang,
2753
+ keep_tables_as_one_chunk=keep_tables_as_one_chunk,
2754
+ chunk_by_page=chunk_by_page,
2755
+ handwriting_check=handwriting_check,
2756
+ ingest_mode=ingest_mode,
2757
+ timeout=timeout,
2758
+ _headers=header,
2759
+ )
2760
+ )
2761
+ return await self._wait_for_completion(response.id, timeout=timeout)
2762
+
2676
2763
  async def list_secret_ids(self, connector_type: Optional[str] = None) -> List[str]:
2677
2764
  """
2678
2765
  List available secret IDs from the SecureStore for cloud storage connectors.
@@ -14,7 +14,7 @@
14
14
  """ # noqa: E501
15
15
 
16
16
 
17
- __version__ = "1.6.43-dev2"
17
+ __version__ = "1.6.43-dev5"
18
18
 
19
19
  # import apis into sdk package
20
20
  from h2ogpte.rest_async.api.api_keys_api import APIKeysApi
@@ -77,6 +77,7 @@ from h2ogpte.rest_async.models.collection_create_request import CollectionCreate
77
77
  from h2ogpte.rest_async.models.collection_settings import CollectionSettings
78
78
  from h2ogpte.rest_async.models.collection_update_request import CollectionUpdateRequest
79
79
  from h2ogpte.rest_async.models.confirm_user_deletion_request import ConfirmUserDeletionRequest
80
+ from h2ogpte.rest_async.models.confluence_credentials import ConfluenceCredentials
80
81
  from h2ogpte.rest_async.models.count import Count
81
82
  from h2ogpte.rest_async.models.count_with_queue_details import CountWithQueueDetails
82
83
  from h2ogpte.rest_async.models.create_agent_key_request import CreateAgentKeyRequest
@@ -111,6 +112,7 @@ from h2ogpte.rest_async.models.guardrails_settings_create_request import Guardra
111
112
  from h2ogpte.rest_async.models.h2_ogptgpu_info import H2OGPTGPUInfo
112
113
  from h2ogpte.rest_async.models.h2_ogpt_system_info import H2OGPTSystemInfo
113
114
  from h2ogpte.rest_async.models.ingest_from_azure_blob_storage_body import IngestFromAzureBlobStorageBody
115
+ from h2ogpte.rest_async.models.ingest_from_confluence_body import IngestFromConfluenceBody
114
116
  from h2ogpte.rest_async.models.ingest_from_file_system_body import IngestFromFileSystemBody
115
117
  from h2ogpte.rest_async.models.ingest_from_gcs_body import IngestFromGcsBody
116
118
  from h2ogpte.rest_async.models.ingest_from_s3_body import IngestFromS3Body