ws-bom-robot-app 0.0.81__py3-none-any.whl → 0.0.82__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. ws_bom_robot_app/config.py +10 -0
  2. ws_bom_robot_app/cron_manager.py +6 -6
  3. ws_bom_robot_app/llm/agent_description.py +123 -123
  4. ws_bom_robot_app/llm/agent_handler.py +166 -166
  5. ws_bom_robot_app/llm/agent_lcel.py +50 -50
  6. ws_bom_robot_app/llm/api.py +2 -2
  7. ws_bom_robot_app/llm/defaut_prompt.py +15 -15
  8. ws_bom_robot_app/llm/feedbacks/feedback_manager.py +66 -66
  9. ws_bom_robot_app/llm/main.py +158 -158
  10. ws_bom_robot_app/llm/models/feedback.py +30 -30
  11. ws_bom_robot_app/llm/nebuly_handler.py +185 -185
  12. ws_bom_robot_app/llm/providers/llm_manager.py +5 -6
  13. ws_bom_robot_app/llm/tools/tool_builder.py +65 -65
  14. ws_bom_robot_app/llm/tools/tool_manager.py +330 -330
  15. ws_bom_robot_app/llm/tools/utils.py +41 -41
  16. ws_bom_robot_app/llm/utils/agent.py +34 -34
  17. ws_bom_robot_app/llm/utils/cleanup.py +7 -0
  18. ws_bom_robot_app/llm/utils/cms.py +114 -114
  19. ws_bom_robot_app/llm/utils/download.py +183 -185
  20. ws_bom_robot_app/llm/utils/print.py +29 -29
  21. ws_bom_robot_app/llm/vector_store/generator.py +137 -137
  22. ws_bom_robot_app/llm/vector_store/integration/azure.py +1 -1
  23. ws_bom_robot_app/llm/vector_store/integration/base.py +57 -15
  24. ws_bom_robot_app/llm/vector_store/integration/confluence.py +1 -1
  25. ws_bom_robot_app/llm/vector_store/integration/dropbox.py +1 -1
  26. ws_bom_robot_app/llm/vector_store/integration/gcs.py +1 -1
  27. ws_bom_robot_app/llm/vector_store/integration/github.py +22 -22
  28. ws_bom_robot_app/llm/vector_store/integration/googledrive.py +1 -1
  29. ws_bom_robot_app/llm/vector_store/integration/jira.py +93 -60
  30. ws_bom_robot_app/llm/vector_store/integration/s3.py +1 -1
  31. ws_bom_robot_app/llm/vector_store/integration/sftp.py +1 -1
  32. ws_bom_robot_app/llm/vector_store/integration/sharepoint.py +7 -14
  33. ws_bom_robot_app/llm/vector_store/integration/shopify.py +143 -144
  34. ws_bom_robot_app/llm/vector_store/integration/sitemap.py +3 -0
  35. ws_bom_robot_app/llm/vector_store/integration/slack.py +3 -2
  36. ws_bom_robot_app/llm/vector_store/integration/thron.py +102 -103
  37. ws_bom_robot_app/llm/vector_store/loader/base.py +8 -6
  38. ws_bom_robot_app/llm/vector_store/loader/docling.py +1 -1
  39. ws_bom_robot_app/llm/vector_store/loader/json_loader.py +25 -25
  40. ws_bom_robot_app/subprocess_runner.py +103 -0
  41. ws_bom_robot_app/task_manager.py +169 -41
  42. {ws_bom_robot_app-0.0.81.dist-info → ws_bom_robot_app-0.0.82.dist-info}/METADATA +18 -8
  43. ws_bom_robot_app-0.0.82.dist-info/RECORD +74 -0
  44. ws_bom_robot_app-0.0.81.dist-info/RECORD +0 -73
  45. {ws_bom_robot_app-0.0.81.dist-info → ws_bom_robot_app-0.0.82.dist-info}/WHEEL +0 -0
  46. {ws_bom_robot_app-0.0.81.dist-info → ws_bom_robot_app-0.0.82.dist-info}/top_level.txt +0 -0
ws_bom_robot_app/llm/vector_store/generator.py
@@ -1,137 +1,137 @@
- import os, gc, shutil, logging, traceback
- import asyncio, aiofiles, aiofiles.os
- from fastapi import HTTPException
- from fastapi.responses import StreamingResponse
- from langchain_core.documents import Document
- from ws_bom_robot_app.llm.vector_store.loader.base import Loader
- from ws_bom_robot_app.llm.models.api import RulesRequest, KbRequest, VectorDbResponse
- from ws_bom_robot_app.llm.vector_store.integration.manager import IntegrationManager
- from ws_bom_robot_app.llm.vector_store.db.manager import VectorDbManager
- from ws_bom_robot_app.config import config
- from ws_bom_robot_app.llm.models.kb import load_endpoints
- from ws_bom_robot_app.llm.utils.download import download_files
-
- async def _cleanup_directory(directory_path: str):
-   if os.path.exists(directory_path):
-     await asyncio.to_thread(shutil.rmtree, directory_path)
-
- #@timer
- async def rules(rq: RulesRequest) -> VectorDbResponse:
-   _config = rq.config()
-   db_name = rq.out_name()
-   store_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_store, db_name)
-   try:
-     await VectorDbManager.get_strategy(rq.vector_type).create(rq.embeddings(),[Document(page_content=rule, metadata={"source": "rules"}) for rule in rq.rules], store_path) #type: ignore
-     db_file_path = shutil.make_archive(os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_out, db_name), "zip", store_path)
-     return VectorDbResponse(file = os.path.basename(db_file_path), vector_type=rq.vector_type)
-   except Exception as e:
-     try:
-       await _cleanup_directory(store_path)
-     finally:
-       return VectorDbResponse(success = False, error = str(e))
-   finally:
-     gc.collect()
-
- #@atimer
- async def kb(rq: KbRequest) -> VectorDbResponse:
-   os.environ['MPLCONFIGDIR'] = './tmp/.matplotlib'
-   _config = rq.config()
-   db_name = rq.out_name()
-   src_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_src)
-   working_path = os.path.join(src_path, db_name)
-
-   if all([not rq.files,not rq.endpoints,not rq.integrations]):
-     return VectorDbResponse(success = False, error = "No files, endpoints or integrations provided")
-   else:
-     await aiofiles.os.makedirs(src_path, exist_ok=True)
-     await aiofiles.os.makedirs(working_path, exist_ok=True)
-
-   documents: list[Document] = []
-   # Download/copy all files
-   if rq.files:
-     try:
-       loaders = Loader(working_path)
-       filter_file_extensions = loaders.managed_file_extensions()
-       files_to_download = [file for file in rq.files if not os.path.exists(os.path.join(src_path, os.path.basename(file)))]
-       if files_to_download:
-         await download_files(
-           [f"{_config.robot_cms_host}/{_config.robot_cms_kb_folder}/{os.path.basename(file)}" for file in files_to_download if any([file.endswith(ext) for ext in filter_file_extensions])],
-           src_path, authorization=_config.robot_cms_auth)
-       # copy files to working tmp folder
-       for file in rq.files:
-         async with aiofiles.open(os.path.join(src_path, os.path.basename(file)), 'rb') as src_file:
-           async with aiofiles.open(os.path.join(working_path, os.path.basename(file)), 'wb') as dest_file:
-             await dest_file.write(await src_file.read())
-       #load files
-       try:
-         documents.extend(await loaders.load())
-       except Exception as e:
-         tb = traceback.format_exc()
-         _error = f"File loader failure: {e} | {tb}"
-         logging.warning(_error)
-         return VectorDbResponse(success = False, error = _error)
-     except Exception as e:
-       await _cleanup_directory(working_path)
-       return VectorDbResponse(success = False, error = f"Failed to download file {e}")
-
-   if rq.endpoints:
-     try:
-       documents.extend(await load_endpoints(rq.endpoints, working_path))
-     except Exception as e:
-       await _cleanup_directory(working_path)
-       tb = traceback.format_exc()
-       _error = f"Endpoint failure: {e} | {tb}"
-       logging.warning(_error)
-       return VectorDbResponse(success = False, error = _error)
-
-   if rq.integrations:
-     tasks = []
-     for integration in rq.integrations:
-       tasks.append(
-         IntegrationManager
-         .get_strategy(integration.type.lower(), working_path, integration.__pydantic_extra__) #type: ignore
-         .load()
-       )
-     try:
-       integration_documents = await asyncio.gather(*tasks)
-       for docs in integration_documents:
-         documents.extend(docs)
-     except Exception as e:
-       await _cleanup_directory(working_path)
-       tb = traceback.format_exc()
-       _error = f"Integration failure: {e} | {tb}"
-       logging.warning(_error)
-       return VectorDbResponse(success=False, error=_error)
-
-   #cleanup
-   await _cleanup_directory(working_path)
-
-   if documents and len(documents) > 0:
-     try:
-       store_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_store, db_name)
-       db_file_path = await aiofiles.os.wrap(shutil.make_archive)(
-         os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_out, db_name),
-         "zip",
-         await VectorDbManager.get_strategy(rq.vector_type).create(rq.embeddings(), documents, store_path, return_folder_path=True)
-       )
-       return VectorDbResponse(file = os.path.basename(db_file_path), vector_type=rq.vector_type)
-     except Exception as e:
-       await _cleanup_directory(store_path)
-       return VectorDbResponse(success = False, error = str(e))
-     finally:
-       del documents
-       gc.collect()
-   else:
-     _error = "No documents found in the knowledgebase folder"
-     logging.warning(_error)
-     return VectorDbResponse(success = False, error = _error)
-
- async def kb_stream_file(filename: str):
-   file_path = os.path.join(config.robot_data_folder, config.robot_data_db_folder, config.robot_data_db_folder_out, filename)
-   if not os.path.isfile(file_path):
-     raise HTTPException(status_code=404, detail="File not found")
-   def iter_file():
-     with open(file_path, mode="rb") as file:
-       while chunk := file.read(1024*8):
-         yield chunk
-   return StreamingResponse(iter_file(), media_type="application/octet-stream", headers={"Content-Disposition": f"attachment; filename={filename}"})
+ import os, gc, shutil, logging, traceback
+ import asyncio, aiofiles, aiofiles.os
+ from fastapi import HTTPException
+ from fastapi.responses import StreamingResponse
+ from langchain_core.documents import Document
+ from ws_bom_robot_app.llm.vector_store.loader.base import Loader
+ from ws_bom_robot_app.llm.models.api import RulesRequest, KbRequest, VectorDbResponse
+ from ws_bom_robot_app.llm.vector_store.integration.manager import IntegrationManager
+ from ws_bom_robot_app.llm.vector_store.db.manager import VectorDbManager
+ from ws_bom_robot_app.config import config
+ from ws_bom_robot_app.llm.models.kb import load_endpoints
+ from ws_bom_robot_app.llm.utils.download import download_files
+
+ async def _cleanup_directory(directory_path: str):
+   if os.path.exists(directory_path):
+     await asyncio.to_thread(shutil.rmtree, directory_path)
+
+ #@timer
+ async def rules(rq: RulesRequest) -> VectorDbResponse:
+   _config = rq.config()
+   db_name = rq.out_name()
+   store_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_store, db_name)
+   try:
+     await VectorDbManager.get_strategy(rq.vector_type).create(rq.embeddings(),[Document(page_content=rule, metadata={"source": "rules"}) for rule in rq.rules], store_path) #type: ignore
+     db_file_path = shutil.make_archive(os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_out, db_name), "zip", store_path)
+     return VectorDbResponse(file = os.path.basename(db_file_path), vector_type=rq.vector_type)
+   except Exception as e:
+     try:
+       await _cleanup_directory(store_path)
+     finally:
+       return VectorDbResponse(success = False, error = str(e))
+   finally:
+     gc.collect()
+
+ #@atimer
+ async def kb(rq: KbRequest) -> VectorDbResponse:
+   os.environ['MPLCONFIGDIR'] = './tmp/.matplotlib'
+   _config = rq.config()
+   db_name = rq.out_name()
+   src_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_src)
+   working_path = os.path.join(src_path, db_name)
+
+   if all([not rq.files,not rq.endpoints,not rq.integrations]):
+     return VectorDbResponse(success = False, error = "No files, endpoints or integrations provided")
+   else:
+     await aiofiles.os.makedirs(src_path, exist_ok=True)
+     await aiofiles.os.makedirs(working_path, exist_ok=True)
+
+   documents: list[Document] = []
+   # Download/copy all files
+   if rq.files:
+     try:
+       loaders = Loader(working_path)
+       filter_file_extensions = loaders.managed_file_extensions()
+       files_to_download = [file for file in rq.files if not os.path.exists(os.path.join(src_path, os.path.basename(file)))]
+       if files_to_download:
+         await download_files(
+           [f"{_config.robot_cms_host}/{_config.robot_cms_kb_folder}/{os.path.basename(file)}" for file in files_to_download if any([file.endswith(ext) for ext in filter_file_extensions])],
+           src_path, authorization=_config.robot_cms_auth)
+       # copy files to working tmp folder
+       for file in rq.files:
+         async with aiofiles.open(os.path.join(src_path, os.path.basename(file)), 'rb') as src_file:
+           async with aiofiles.open(os.path.join(working_path, os.path.basename(file)), 'wb') as dest_file:
+             await dest_file.write(await src_file.read())
+       #load files
+       try:
+         documents.extend(await loaders.load())
+       except Exception as e:
+         tb = traceback.format_exc()
+         _error = f"File loader failure: {e} | {tb}"
+         logging.warning(_error)
+         return VectorDbResponse(success = False, error = _error)
+     except Exception as e:
+       await _cleanup_directory(working_path)
+       return VectorDbResponse(success = False, error = f"Failed to download file {e}")
+
+   if rq.endpoints:
+     try:
+       documents.extend(await load_endpoints(rq.endpoints, working_path))
+     except Exception as e:
+       await _cleanup_directory(working_path)
+       tb = traceback.format_exc()
+       _error = f"Endpoint failure: {e} | {tb}"
+       logging.warning(_error)
+       return VectorDbResponse(success = False, error = _error)
+
+   if rq.integrations:
+     tasks = []
+     for integration in rq.integrations:
+       tasks.append(
+         IntegrationManager
+         .get_strategy(integration.type.lower(), working_path, integration.__pydantic_extra__) #type: ignore
+         .load()
+       )
+     try:
+       integration_documents = await asyncio.gather(*tasks)
+       for docs in integration_documents:
+         documents.extend(docs)
+     except Exception as e:
+       await _cleanup_directory(working_path)
+       tb = traceback.format_exc()
+       _error = f"Integration failure: {e} | {tb}"
+       logging.warning(_error)
+       return VectorDbResponse(success=False, error=_error)
+
+   #cleanup
+   await _cleanup_directory(working_path)
+
+   if documents and len(documents) > 0:
+     try:
+       store_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_store, db_name)
+       db_file_path = await aiofiles.os.wrap(shutil.make_archive)(
+         os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_out, db_name),
+         "zip",
+         await VectorDbManager.get_strategy(rq.vector_type).create(rq.embeddings(), documents, store_path, return_folder_path=True)
+       )
+       return VectorDbResponse(file = os.path.basename(db_file_path), vector_type=rq.vector_type)
+     except Exception as e:
+       await _cleanup_directory(store_path)
+       return VectorDbResponse(success = False, error = str(e))
+     finally:
+       del documents
+       gc.collect()
+   else:
+     _error = "No documents found in the knowledgebase folder"
+     logging.warning(_error)
+     return VectorDbResponse(success = False, error = _error)
+
+ async def kb_stream_file(filename: str):
+   file_path = os.path.join(config.robot_data_folder, config.robot_data_db_folder, config.robot_data_db_folder_out, filename)
+   if not os.path.isfile(file_path):
+     raise HTTPException(status_code=404, detail="File not found")
+   def iter_file():
+     with open(file_path, mode="rb") as file:
+       while chunk := file.read(1024*8):
+         yield chunk
+   return StreamingResponse(iter_file(), media_type="application/octet-stream", headers={"Content-Disposition": f"attachment; filename={filename}"})
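Both sides of this hunk keep blocking filesystem work off the event loop: _cleanup_directory wraps shutil.rmtree in asyncio.to_thread, and kb wraps shutil.make_archive via aiofiles.os.wrap. A minimal, self-contained sketch of that offloading pattern using only the standard library; the archive_directory helper and demo paths are illustrative, not part of the package:

import asyncio, os, shutil, tempfile

async def archive_directory(src_dir: str, out_base: str) -> str:
  # shutil.make_archive blocks; running it in a worker thread keeps the
  # event loop responsive, mirroring aiofiles.os.wrap(shutil.make_archive)
  return await asyncio.to_thread(shutil.make_archive, out_base, "zip", src_dir)

async def main():
  src = tempfile.mkdtemp()
  with open(os.path.join(src, "doc.txt"), "w") as f:
    f.write("hello")
  zip_path = await archive_directory(src, os.path.join(tempfile.gettempdir(), "kb_demo"))
  print(zip_path)  # .../kb_demo.zip
  # blocking cleanup is offloaded the same way as _cleanup_directory above
  await asyncio.to_thread(shutil.rmtree, src)

asyncio.run(main())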
ws_bom_robot_app/llm/vector_store/integration/azure.py
@@ -1,6 +1,6 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.fsspec.azure import AzureConnectionConfig, AzureAccessConfig, AzureDownloaderConfig, AzureIndexerConfig
+ from unstructured_ingest.processes.connectors.fsspec.azure import AzureConnectionConfig, AzureAccessConfig, AzureDownloaderConfig, AzureIndexerConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Union, Optional
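This release moves every connector import from the unstructured_ingest.v2.* namespace to the top-level unstructured_ingest.* one; the same one-line change recurs in the confluence, dropbox, gcs, and google_drive hunks below. Downstream code that must run against both layouts could guard the import. A sketch of that compatibility pattern, not something this package itself does:

try:
  # layout expected by ws-bom-robot-app 0.0.82
  from unstructured_ingest.interfaces import ProcessorConfig
except ImportError:
  # older unstructured-ingest releases kept the v2 namespace
  from unstructured_ingest.v2.interfaces import ProcessorConfig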
ws_bom_robot_app/llm/vector_store/integration/base.py
@@ -1,10 +1,17 @@
- import os
+ import os, copy
+ from random import random
  from langchain_core.documents import Document
  from abc import ABC, abstractmethod
- from unstructured_ingest.v2.interfaces import ProcessorConfig
- from unstructured_ingest.v2.pipeline.pipeline import Pipeline, PartitionerConfig, FiltererConfig
+ from unstructured_ingest.interfaces import ProcessorConfig
+ from unstructured_ingest.pipeline.pipeline import (
+   Pipeline,
+   PartitionerConfig,
+   FiltererConfig
+ )
+ from unstructured_ingest.processes.connector_registry import source_registry
  from typing import Union
  from ws_bom_robot_app.llm.utils.secrets import Secrets
+ from ws_bom_robot_app.config import config

  class IntegrationStrategy(ABC):
    @classmethod
@@ -32,23 +39,58 @@ class IntegrationStrategy(ABC):
      pass

  class UnstructuredIngest():
+   _PIPELINE: Pipeline = None
    def __init__(self, working_directory: str):
      self.working_directory = working_directory
-   def pipeline(self,indexer,downloader,connection,extension: list[str] = None) -> Pipeline:
-     return Pipeline.from_configs(
-       context=ProcessorConfig(
+   def pipeline(self,indexer_config,downloader_config,connection_config,extension: list[str] = None) -> Pipeline:
+     def _default_processor_config() -> ProcessorConfig:
+       return ProcessorConfig(
          reprocess=False,
          verbose=False,
          tqdm=False,
-         num_processes=2,
+         num_processes=config.robot_ingest_max_threads, #safe choice to 1, avoid potential process-related issues with Docker
+         disable_parallelism=False,
          preserve_downloads=True,
          download_only=True,
-         raise_on_error=False
-       ),
-       indexer_config=indexer,
-       downloader_config=downloader,
-       source_connection_config=connection,
-       partitioner_config=PartitionerConfig(),
-       filterer_config=FiltererConfig(file_glob=[f"**/*{ext}" for ext in extension] if extension else None)
-     )
+         raise_on_error=False,
+         iter_delete=True,
+         delete_cache=False #already managed by the generator task
+       )
+     def _init_pipeline() -> Pipeline:
+       return Pipeline.from_configs(
+         context=_default_processor_config(),
+         indexer_config=indexer_config,
+         downloader_config=downloader_config,
+         source_connection_config=connection_config,
+         partitioner_config=PartitionerConfig(),
+         filterer_config=FiltererConfig(file_glob=[f"**/*{ext}" for ext in extension] if extension else None)
+       )
+     def _instance_pipeline() -> Pipeline:
+       from unstructured_ingest.pipeline.steps.index import IndexStep
+       from unstructured_ingest.pipeline.steps.download import DownloadStep
+       from unstructured_ingest.pipeline.steps.filter import Filterer, FilterStep
+       _context = _default_processor_config()
+       source_entry = {
+         k: v
+         for k, v in source_registry.items()
+         if type(indexer_config) is v.indexer_config
+         and type(downloader_config) is v.downloader_config
+         and type(connection_config) is v.connection_config
+       }
+       source = list(source_entry.values())[0]
+       _pipeline = copy.deepcopy(UnstructuredIngest._PIPELINE)
+       _pipeline.context = _context
+       _pipeline.context.work_dir = f"{self.working_directory}_unstructured" # use sibling directory, cleaned up by the generator task
+       _pipeline.indexer_step = IndexStep(process=source.indexer(index_config=indexer_config, connection_config=connection_config), context=_context)
+       _pipeline.downloader_step = DownloadStep(process=source.downloader(download_config=downloader_config, connection_config=connection_config), context=_context)
+       _pipeline.filter_step = FilterStep(process=Filterer(config=FiltererConfig(file_glob=[f"**/*{ext}" for ext in extension] if extension else None)), context=_context) if extension else None
+       return _pipeline

+     if not UnstructuredIngest._PIPELINE:
+       import random
+       import time
+       time.sleep(random.uniform(0.2, 1))
+       if not UnstructuredIngest._PIPELINE:
+         UnstructuredIngest._PIPELINE = _init_pipeline()
+
+     return _instance_pipeline()
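The new pipeline() builds one Pipeline lazily, caches it in the class-level _PIPELINE, and hands each caller a copy.deepcopy rewired with its own context, work dir, and indexer/downloader/filter steps; first-time races are staggered with a random sleep rather than a lock. A minimal sketch of that cache-and-copy pattern; PipelineFactory and _build_template are hypothetical stand-ins, and the sketch uses a lock where the diff sleeps:

import copy, threading

class PipelineFactory:
  _TEMPLATE = None              # class-level cache shared by all instances
  _LOCK = threading.Lock()

  def __init__(self, work_dir: str):
    self.work_dir = work_dir

  def _build_template(self) -> dict:
    # stands in for the expensive Pipeline.from_configs(...) call
    return {"context": {"work_dir": None}, "steps": []}

  def pipeline(self) -> dict:
    if PipelineFactory._TEMPLATE is None:
      with PipelineFactory._LOCK:
        if PipelineFactory._TEMPLATE is None:   # double-checked init
          PipelineFactory._TEMPLATE = self._build_template()
    # each caller mutates its own deep copy, never the shared template
    instance = copy.deepcopy(PipelineFactory._TEMPLATE)
    instance["context"]["work_dir"] = f"{self.work_dir}_unstructured"
    return instance

print(PipelineFactory("./tmp/job1").pipeline()["context"]["work_dir"])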
ws_bom_robot_app/llm/vector_store/integration/confluence.py
@@ -1,6 +1,6 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.confluence import ConfluenceIndexerConfig, ConfluenceDownloaderConfig, ConfluenceConnectionConfig, ConfluenceAccessConfig
+ from unstructured_ingest.processes.connectors.confluence import ConfluenceIndexerConfig, ConfluenceDownloaderConfig, ConfluenceConnectionConfig, ConfluenceAccessConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Optional, Union
ws_bom_robot_app/llm/vector_store/integration/dropbox.py
@@ -1,6 +1,6 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.fsspec.dropbox import DropboxConnectionConfig, DropboxAccessConfig, DropboxDownloaderConfig, DropboxIndexerConfig
+ from unstructured_ingest.processes.connectors.fsspec.dropbox import DropboxConnectionConfig, DropboxAccessConfig, DropboxDownloaderConfig, DropboxIndexerConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Union
ws_bom_robot_app/llm/vector_store/integration/gcs.py
@@ -1,6 +1,6 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.fsspec.gcs import GcsIndexerConfig, GcsConnectionConfig, GcsAccessConfig, GcsDownloaderConfig
+ from unstructured_ingest.processes.connectors.fsspec.gcs import GcsIndexerConfig, GcsConnectionConfig, GcsAccessConfig, GcsDownloaderConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Union, Optional
ws_bom_robot_app/llm/vector_store/integration/github.py
@@ -1,10 +1,12 @@
  import asyncio
  from typing import Optional, Union
- from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
- from unstructured_ingest.interfaces import ProcessorConfig, ReadConfig
- from unstructured_ingest.connector.git import GitAccessConfig
- from unstructured_ingest.connector.github import SimpleGitHubConfig
- from unstructured_ingest.runner import GithubRunner
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
+ from unstructured_ingest.processes.connectors.github import (
+   GithubIndexerConfig,
+   GithubDownloaderConfig,
+   GithubConnectionConfig,
+   GithubAccessConfig
+ )
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from pydantic import BaseModel, Field, AliasChoices
@@ -27,28 +29,26 @@ class Github(IntegrationStrategy):
    def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
      super().__init__(knowledgebase_path, data)
      self.__data = GithubParams.model_validate(self.data)
+     self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
    def working_subdirectory(self) -> str:
      return 'github'
    def run(self) -> None:
-     access_config = GitAccessConfig(
-       access_token=self.__data.access_token
-     )
-     file_ext = self.__data.file_ext or None
-     file_glob = [f"**/*{ext}" for ext in file_ext] if file_ext else None
-     config = SimpleGitHubConfig(
-       url = self.__data.repo,
-       access_config=access_config,
+     indexer_config = GithubIndexerConfig(
        branch=self.__data.branch,
-       file_glob=file_glob
+       recursive=True
+     )
+     downloader_config = GithubDownloaderConfig(
+       download_dir=self.working_directory
+     )
+     connection_config = GithubConnectionConfig(
+       access_config=GithubAccessConfig(access_token=self.__data.access_token),
+       url=self.__data.repo
      )
-     runner = GithubRunner(
-       connector_config=config,
-       processor_config=ProcessorConfig(reprocess=False,verbose=False,num_processes=2,raise_on_error=False),
-       read_config=ReadConfig(download_dir=self.working_directory,re_download=True,preserve_downloads=True,download_only=True),
-       partition_config=None,
-       retry_strategy_config=None
-     )
-     runner.run()
+     self.__unstructured_ingest.pipeline(
+       indexer_config,
+       downloader_config,
+       connection_config,
+       extension=self.__data.file_ext).run()
    async def load(self) -> list[Document]:
      await asyncio.to_thread(self.run)
      await asyncio.sleep(1)
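After this migration the GitHub connector is driven like the other sources: three config objects handed to the shared UnstructuredIngest.pipeline(...). A sketch of the resulting call shape using only classes visible in this diff; the working directory, token, repository, and extension values are placeholders (the real ones come from GithubParams on the request):

from unstructured_ingest.processes.connectors.github import (
  GithubIndexerConfig,
  GithubDownloaderConfig,
  GithubConnectionConfig,
  GithubAccessConfig
)
from ws_bom_robot_app.llm.vector_store.integration.base import UnstructuredIngest

working_dir = "./tmp/github"                      # placeholder working directory
UnstructuredIngest(working_dir).pipeline(
  GithubIndexerConfig(branch="main", recursive=True),
  GithubDownloaderConfig(download_dir=working_dir),
  GithubConnectionConfig(
    access_config=GithubAccessConfig(access_token="<token>"),  # placeholder token
    url="owner/repo"                              # placeholder repository
  ),
  extension=[".md", ".py"]                        # becomes FiltererConfig file_glob patterns
).run()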
ws_bom_robot_app/llm/vector_store/integration/googledrive.py
@@ -1,6 +1,6 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.google_drive import GoogleDriveConnectionConfig, GoogleDriveDownloaderConfig, GoogleDriveIndexerConfig, GoogleDriveAccessConfig
+ from unstructured_ingest.processes.connectors.google_drive import GoogleDriveConnectionConfig, GoogleDriveDownloaderConfig, GoogleDriveIndexerConfig, GoogleDriveAccessConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Union