ws-bom-robot-app 0.0.81__py3-none-any.whl → 0.0.82__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. ws_bom_robot_app/config.py +10 -0
  2. ws_bom_robot_app/cron_manager.py +6 -6
  3. ws_bom_robot_app/llm/agent_description.py +123 -123
  4. ws_bom_robot_app/llm/agent_handler.py +166 -166
  5. ws_bom_robot_app/llm/agent_lcel.py +50 -50
  6. ws_bom_robot_app/llm/api.py +2 -2
  7. ws_bom_robot_app/llm/defaut_prompt.py +15 -15
  8. ws_bom_robot_app/llm/feedbacks/feedback_manager.py +66 -66
  9. ws_bom_robot_app/llm/main.py +158 -158
  10. ws_bom_robot_app/llm/models/feedback.py +30 -30
  11. ws_bom_robot_app/llm/nebuly_handler.py +185 -185
  12. ws_bom_robot_app/llm/providers/llm_manager.py +5 -6
  13. ws_bom_robot_app/llm/tools/tool_builder.py +65 -65
  14. ws_bom_robot_app/llm/tools/tool_manager.py +330 -330
  15. ws_bom_robot_app/llm/tools/utils.py +41 -41
  16. ws_bom_robot_app/llm/utils/agent.py +34 -34
  17. ws_bom_robot_app/llm/utils/cleanup.py +7 -0
  18. ws_bom_robot_app/llm/utils/cms.py +114 -114
  19. ws_bom_robot_app/llm/utils/download.py +183 -185
  20. ws_bom_robot_app/llm/utils/print.py +29 -29
  21. ws_bom_robot_app/llm/vector_store/generator.py +137 -137
  22. ws_bom_robot_app/llm/vector_store/integration/azure.py +1 -1
  23. ws_bom_robot_app/llm/vector_store/integration/base.py +57 -15
  24. ws_bom_robot_app/llm/vector_store/integration/confluence.py +1 -1
  25. ws_bom_robot_app/llm/vector_store/integration/dropbox.py +1 -1
  26. ws_bom_robot_app/llm/vector_store/integration/gcs.py +1 -1
  27. ws_bom_robot_app/llm/vector_store/integration/github.py +22 -22
  28. ws_bom_robot_app/llm/vector_store/integration/googledrive.py +1 -1
  29. ws_bom_robot_app/llm/vector_store/integration/jira.py +93 -60
  30. ws_bom_robot_app/llm/vector_store/integration/s3.py +1 -1
  31. ws_bom_robot_app/llm/vector_store/integration/sftp.py +1 -1
  32. ws_bom_robot_app/llm/vector_store/integration/sharepoint.py +7 -14
  33. ws_bom_robot_app/llm/vector_store/integration/shopify.py +143 -144
  34. ws_bom_robot_app/llm/vector_store/integration/sitemap.py +3 -0
  35. ws_bom_robot_app/llm/vector_store/integration/slack.py +3 -2
  36. ws_bom_robot_app/llm/vector_store/integration/thron.py +102 -103
  37. ws_bom_robot_app/llm/vector_store/loader/base.py +8 -6
  38. ws_bom_robot_app/llm/vector_store/loader/docling.py +1 -1
  39. ws_bom_robot_app/llm/vector_store/loader/json_loader.py +25 -25
  40. ws_bom_robot_app/subprocess_runner.py +103 -0
  41. ws_bom_robot_app/task_manager.py +169 -41
  42. {ws_bom_robot_app-0.0.81.dist-info → ws_bom_robot_app-0.0.82.dist-info}/METADATA +18 -8
  43. ws_bom_robot_app-0.0.82.dist-info/RECORD +74 -0
  44. ws_bom_robot_app-0.0.81.dist-info/RECORD +0 -73
  45. {ws_bom_robot_app-0.0.81.dist-info → ws_bom_robot_app-0.0.82.dist-info}/WHEEL +0 -0
  46. {ws_bom_robot_app-0.0.81.dist-info → ws_bom_robot_app-0.0.82.dist-info}/top_level.txt +0 -0
ws_bom_robot_app/llm/vector_store/generator.py
@@ -1,137 +1,137 @@
- import os, gc, shutil, logging, traceback
- import asyncio, aiofiles, aiofiles.os
- from fastapi import HTTPException
- from fastapi.responses import StreamingResponse
- from langchain_core.documents import Document
- from ws_bom_robot_app.llm.vector_store.loader.base import Loader
- from ws_bom_robot_app.llm.models.api import RulesRequest, KbRequest, VectorDbResponse
- from ws_bom_robot_app.llm.vector_store.integration.manager import IntegrationManager
- from ws_bom_robot_app.llm.vector_store.db.manager import VectorDbManager
- from ws_bom_robot_app.config import config
- from ws_bom_robot_app.llm.models.kb import load_endpoints
- from ws_bom_robot_app.llm.utils.download import download_files
-
- async def _cleanup_directory(directory_path: str):
-   if os.path.exists(directory_path):
-     await asyncio.to_thread(shutil.rmtree, directory_path)
-
- #@timer
- async def rules(rq: RulesRequest) -> VectorDbResponse:
-   _config = rq.config()
-   db_name = rq.out_name()
-   store_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_store, db_name)
-   try:
-     await VectorDbManager.get_strategy(rq.vector_type).create(rq.embeddings(),[Document(page_content=rule, metadata={"source": "rules"}) for rule in rq.rules], store_path) #type: ignore
-     db_file_path = shutil.make_archive(os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_out, db_name), "zip", store_path)
-     return VectorDbResponse(file = os.path.basename(db_file_path), vector_type=rq.vector_type)
-   except Exception as e:
-     try:
-       await _cleanup_directory(store_path)
-     finally:
-       return VectorDbResponse(success = False, error = str(e))
-   finally:
-     gc.collect()
-
- #@atimer
- async def kb(rq: KbRequest) -> VectorDbResponse:
-   os.environ['MPLCONFIGDIR'] = './tmp/.matplotlib'
-   _config = rq.config()
-   db_name = rq.out_name()
-   src_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_src)
-   working_path = os.path.join(src_path, db_name)
-
-   if all([not rq.files,not rq.endpoints,not rq.integrations]):
-     return VectorDbResponse(success = False, error = "No files, endpoints or integrations provided")
-   else:
-     await aiofiles.os.makedirs(src_path, exist_ok=True)
-     await aiofiles.os.makedirs(working_path, exist_ok=True)
-
-   documents: list[Document] = []
-   # Download/copy all files
-   if rq.files:
-     try:
-       loaders = Loader(working_path)
-       filter_file_extensions = loaders.managed_file_extensions()
-       files_to_download = [file for file in rq.files if not os.path.exists(os.path.join(src_path, os.path.basename(file)))]
-       if files_to_download:
-         await download_files(
-           [f"{_config.robot_cms_host}/{_config.robot_cms_kb_folder}/{os.path.basename(file)}" for file in files_to_download if any([file.endswith(ext) for ext in filter_file_extensions])],
-           src_path, authorization=_config.robot_cms_auth)
-       # copy files to working tmp folder
-       for file in rq.files:
-         async with aiofiles.open(os.path.join(src_path, os.path.basename(file)), 'rb') as src_file:
-           async with aiofiles.open(os.path.join(working_path, os.path.basename(file)), 'wb') as dest_file:
-             await dest_file.write(await src_file.read())
-       #load files
-       try:
-         documents.extend(await loaders.load())
-       except Exception as e:
-         tb = traceback.format_exc()
-         _error = f"File loader failure: {e} | {tb}"
-         logging.warning(_error)
-         return VectorDbResponse(success = False, error = _error)
-     except Exception as e:
-       await _cleanup_directory(working_path)
-       return VectorDbResponse(success = False, error = f"Failed to download file {e}")
-
-   if rq.endpoints:
-     try:
-       documents.extend(await load_endpoints(rq.endpoints, working_path))
-     except Exception as e:
-       await _cleanup_directory(working_path)
-       tb = traceback.format_exc()
-       _error = f"Endpoint failure: {e} | {tb}"
-       logging.warning(_error)
-       return VectorDbResponse(success = False, error = _error)
-
-   if rq.integrations:
-     tasks = []
-     for integration in rq.integrations:
-       tasks.append(
-         IntegrationManager
-         .get_strategy(integration.type.lower(), working_path, integration.__pydantic_extra__) #type: ignore
-         .load()
-       )
-     try:
-       integration_documents = await asyncio.gather(*tasks)
-       for docs in integration_documents:
-         documents.extend(docs)
-     except Exception as e:
-       await _cleanup_directory(working_path)
-       tb = traceback.format_exc()
-       _error = f"Integration failure: {e} | {tb}"
-       logging.warning(_error)
-       return VectorDbResponse(success=False, error=_error)
-
-   #cleanup
-   await _cleanup_directory(working_path)
-
-   if documents and len(documents) > 0:
-     try:
-       store_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_store, db_name)
-       db_file_path = await aiofiles.os.wrap(shutil.make_archive)(
-         os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_out, db_name),
-         "zip",
-         await VectorDbManager.get_strategy(rq.vector_type).create(rq.embeddings(), documents, store_path, return_folder_path=True)
-       )
-       return VectorDbResponse(file = os.path.basename(db_file_path), vector_type=rq.vector_type)
-     except Exception as e:
-       await _cleanup_directory(store_path)
-       return VectorDbResponse(success = False, error = str(e))
-     finally:
-       del documents
-       gc.collect()
-   else:
-     _error = "No documents found in the knowledgebase folder"
-     logging.warning(_error)
-     return VectorDbResponse(success = False, error = _error)
-
- async def kb_stream_file(filename: str):
-   file_path = os.path.join(config.robot_data_folder, config.robot_data_db_folder, config.robot_data_db_folder_out, filename)
-   if not os.path.isfile(file_path):
-     raise HTTPException(status_code=404, detail="File not found")
-   def iter_file():
-     with open(file_path, mode="rb") as file:
-       while chunk := file.read(1024*8):
-         yield chunk
-   return StreamingResponse(iter_file(), media_type="application/octet-stream", headers={"Content-Disposition": f"attachment; filename={filename}"})
+ import os, gc, shutil, logging, traceback
+ import asyncio, aiofiles, aiofiles.os
+ from fastapi import HTTPException
+ from fastapi.responses import StreamingResponse
+ from langchain_core.documents import Document
+ from ws_bom_robot_app.llm.vector_store.loader.base import Loader
+ from ws_bom_robot_app.llm.models.api import RulesRequest, KbRequest, VectorDbResponse
+ from ws_bom_robot_app.llm.vector_store.integration.manager import IntegrationManager
+ from ws_bom_robot_app.llm.vector_store.db.manager import VectorDbManager
+ from ws_bom_robot_app.config import config
+ from ws_bom_robot_app.llm.models.kb import load_endpoints
+ from ws_bom_robot_app.llm.utils.download import download_files
+
+ async def _cleanup_directory(directory_path: str):
+   if os.path.exists(directory_path):
+     await asyncio.to_thread(shutil.rmtree, directory_path)
+
+ #@timer
+ async def rules(rq: RulesRequest) -> VectorDbResponse:
+   _config = rq.config()
+   db_name = rq.out_name()
+   store_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_store, db_name)
+   try:
+     await VectorDbManager.get_strategy(rq.vector_type).create(rq.embeddings(),[Document(page_content=rule, metadata={"source": "rules"}) for rule in rq.rules], store_path) #type: ignore
+     db_file_path = shutil.make_archive(os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_out, db_name), "zip", store_path)
+     return VectorDbResponse(file = os.path.basename(db_file_path), vector_type=rq.vector_type)
+   except Exception as e:
+     try:
+       await _cleanup_directory(store_path)
+     finally:
+       return VectorDbResponse(success = False, error = str(e))
+   finally:
+     gc.collect()
+
+ #@atimer
+ async def kb(rq: KbRequest) -> VectorDbResponse:
+   os.environ['MPLCONFIGDIR'] = './tmp/.matplotlib'
+   _config = rq.config()
+   db_name = rq.out_name()
+   src_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_src)
+   working_path = os.path.join(src_path, db_name)
+
+   if all([not rq.files,not rq.endpoints,not rq.integrations]):
+     return VectorDbResponse(success = False, error = "No files, endpoints or integrations provided")
+   else:
+     await aiofiles.os.makedirs(src_path, exist_ok=True)
+     await aiofiles.os.makedirs(working_path, exist_ok=True)
+
+   documents: list[Document] = []
+   # Download/copy all files
+   if rq.files:
+     try:
+       loaders = Loader(working_path)
+       filter_file_extensions = loaders.managed_file_extensions()
+       files_to_download = [file for file in rq.files if not os.path.exists(os.path.join(src_path, os.path.basename(file)))]
+       if files_to_download:
+         await download_files(
+           [f"{_config.robot_cms_host}/{_config.robot_cms_kb_folder}/{os.path.basename(file)}" for file in files_to_download if any([file.endswith(ext) for ext in filter_file_extensions])],
+           src_path, authorization=_config.robot_cms_auth)
+       # copy files to working tmp folder
+       for file in rq.files:
+         async with aiofiles.open(os.path.join(src_path, os.path.basename(file)), 'rb') as src_file:
+           async with aiofiles.open(os.path.join(working_path, os.path.basename(file)), 'wb') as dest_file:
+             await dest_file.write(await src_file.read())
+       #load files
+       try:
+         documents.extend(await loaders.load())
+       except Exception as e:
+         tb = traceback.format_exc()
+         _error = f"File loader failure: {e} | {tb}"
+         logging.warning(_error)
+         return VectorDbResponse(success = False, error = _error)
+     except Exception as e:
+       await _cleanup_directory(working_path)
+       return VectorDbResponse(success = False, error = f"Failed to download file {e}")
+
+   if rq.endpoints:
+     try:
+       documents.extend(await load_endpoints(rq.endpoints, working_path))
+     except Exception as e:
+       await _cleanup_directory(working_path)
+       tb = traceback.format_exc()
+       _error = f"Endpoint failure: {e} | {tb}"
+       logging.warning(_error)
+       return VectorDbResponse(success = False, error = _error)
+
+   if rq.integrations:
+     tasks = []
+     for integration in rq.integrations:
+       tasks.append(
+         IntegrationManager
+         .get_strategy(integration.type.lower(), working_path, integration.__pydantic_extra__) #type: ignore
+         .load()
+       )
+     try:
+       integration_documents = await asyncio.gather(*tasks)
+       for docs in integration_documents:
+         documents.extend(docs)
+     except Exception as e:
+       await _cleanup_directory(working_path)
+       tb = traceback.format_exc()
+       _error = f"Integration failure: {e} | {tb}"
+       logging.warning(_error)
+       return VectorDbResponse(success=False, error=_error)
+
+   #cleanup
+   await _cleanup_directory(working_path)
+
+   if documents and len(documents) > 0:
+     try:
+       store_path = os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_store, db_name)
+       db_file_path = await aiofiles.os.wrap(shutil.make_archive)(
+         os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_out, db_name),
+         "zip",
+         await VectorDbManager.get_strategy(rq.vector_type).create(rq.embeddings(), documents, store_path, return_folder_path=True)
+       )
+       return VectorDbResponse(file = os.path.basename(db_file_path), vector_type=rq.vector_type)
+     except Exception as e:
+       await _cleanup_directory(store_path)
+       return VectorDbResponse(success = False, error = str(e))
+     finally:
+       del documents
+       gc.collect()
+   else:
+     _error = "No documents found in the knowledgebase folder"
+     logging.warning(_error)
+     return VectorDbResponse(success = False, error = _error)
+
+ async def kb_stream_file(filename: str):
+   file_path = os.path.join(config.robot_data_folder, config.robot_data_db_folder, config.robot_data_db_folder_out, filename)
+   if not os.path.isfile(file_path):
+     raise HTTPException(status_code=404, detail="File not found")
+   def iter_file():
+     with open(file_path, mode="rb") as file:
+       while chunk := file.read(1024*8):
+         yield chunk
+   return StreamingResponse(iter_file(), media_type="application/octet-stream", headers={"Content-Disposition": f"attachment; filename={filename}"})
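Both sides of this hunk keep blocking filesystem work off the event loop: _cleanup_directory wraps shutil.rmtree in asyncio.to_thread, and kb wraps shutil.make_archive via aiofiles.os.wrap. A minimal, self-contained sketch of that offloading pattern using only the standard library; the archive_directory helper and demo paths are illustrative, not part of the package:

import asyncio, os, shutil, tempfile

async def archive_directory(src_dir: str, out_base: str) -> str:
  # shutil.make_archive blocks; running it in a worker thread keeps the
  # event loop responsive, mirroring aiofiles.os.wrap(shutil.make_archive)
  return await asyncio.to_thread(shutil.make_archive, out_base, "zip", src_dir)

async def main():
  src = tempfile.mkdtemp()
  with open(os.path.join(src, "doc.txt"), "w") as f:
    f.write("hello")
  zip_path = await archive_directory(src, os.path.join(tempfile.gettempdir(), "kb_demo"))
  print(zip_path)  # .../kb_demo.zip
  # blocking cleanup is offloaded the same way as _cleanup_directory above
  await asyncio.to_thread(shutil.rmtree, src)

asyncio.run(main())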
ws_bom_robot_app/llm/vector_store/integration/azure.py
@@ -1,6 +1,6 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.fsspec.azure import AzureConnectionConfig, AzureAccessConfig, AzureDownloaderConfig, AzureIndexerConfig
+ from unstructured_ingest.processes.connectors.fsspec.azure import AzureConnectionConfig, AzureAccessConfig, AzureDownloaderConfig, AzureIndexerConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Union, Optional
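This release moves every connector import from the unstructured_ingest.v2.* namespace to the top-level unstructured_ingest.* one; the same one-line change recurs in the confluence, dropbox, gcs, and google_drive hunks below. Downstream code that must run against both layouts could guard the import. A sketch of that compatibility pattern, not something this package itself does:

try:
  # layout expected by ws-bom-robot-app 0.0.82
  from unstructured_ingest.interfaces import ProcessorConfig
except ImportError:
  # older unstructured-ingest releases kept the v2 namespace
  from unstructured_ingest.v2.interfaces import ProcessorConfig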
ws_bom_robot_app/llm/vector_store/integration/base.py
@@ -1,10 +1,17 @@
- import os
+ import os, copy
+ from random import random
  from langchain_core.documents import Document
  from abc import ABC, abstractmethod
- from unstructured_ingest.v2.interfaces import ProcessorConfig
- from unstructured_ingest.v2.pipeline.pipeline import Pipeline, PartitionerConfig, FiltererConfig
+ from unstructured_ingest.interfaces import ProcessorConfig
+ from unstructured_ingest.pipeline.pipeline import (
+   Pipeline,
+   PartitionerConfig,
+   FiltererConfig
+ )
+ from unstructured_ingest.processes.connector_registry import source_registry
  from typing import Union
  from ws_bom_robot_app.llm.utils.secrets import Secrets
+ from ws_bom_robot_app.config import config

  class IntegrationStrategy(ABC):
    @classmethod
@@ -32,23 +39,58 @@ class IntegrationStrategy(ABC):
      pass

  class UnstructuredIngest():
+   _PIPELINE: Pipeline = None
    def __init__(self, working_directory: str):
      self.working_directory = working_directory
-   def pipeline(self,indexer,downloader,connection,extension: list[str] = None) -> Pipeline:
-     return Pipeline.from_configs(
-       context=ProcessorConfig(
+   def pipeline(self,indexer_config,downloader_config,connection_config,extension: list[str] = None) -> Pipeline:
+     def _default_processor_config() -> ProcessorConfig:
+       return ProcessorConfig(
          reprocess=False,
          verbose=False,
          tqdm=False,
-         num_processes=2,
+         num_processes=config.robot_ingest_max_threads, #safe choice to 1, avoid potential process-related issues with Docker
+         disable_parallelism=False,
          preserve_downloads=True,
          download_only=True,
-         raise_on_error=False
-       ),
-       indexer_config=indexer,
-       downloader_config=downloader,
-       source_connection_config=connection,
-       partitioner_config=PartitionerConfig(),
-       filterer_config=FiltererConfig(file_glob=[f"**/*{ext}" for ext in extension] if extension else None)
-     )
+         raise_on_error=False,
+         iter_delete=True,
+         delete_cache=False #already managed by the generator task
+       )
+     def _init_pipeline() -> Pipeline:
+       return Pipeline.from_configs(
+         context=_default_processor_config(),
+         indexer_config=indexer_config,
+         downloader_config=downloader_config,
+         source_connection_config=connection_config,
+         partitioner_config=PartitionerConfig(),
+         filterer_config=FiltererConfig(file_glob=[f"**/*{ext}" for ext in extension] if extension else None)
+       )
+     def _instance_pipeline() -> Pipeline:
+       from unstructured_ingest.pipeline.steps.index import IndexStep
+       from unstructured_ingest.pipeline.steps.download import DownloadStep
+       from unstructured_ingest.pipeline.steps.filter import Filterer, FilterStep
+       _context = _default_processor_config()
+       source_entry = {
+         k: v
+         for k, v in source_registry.items()
+         if type(indexer_config) is v.indexer_config
+         and type(downloader_config) is v.downloader_config
+         and type(connection_config) is v.connection_config
+       }
+       source = list(source_entry.values())[0]
+       _pipeline = copy.deepcopy(UnstructuredIngest._PIPELINE)
+       _pipeline.context = _context
+       _pipeline.context.work_dir = f"{self.working_directory}_unstructured" # use sibling directory, cleaned up by the generator task
+       _pipeline.indexer_step = IndexStep(process=source.indexer(index_config=indexer_config, connection_config=connection_config), context=_context)
+       _pipeline.downloader_step = DownloadStep(process=source.downloader(download_config=downloader_config, connection_config=connection_config), context=_context)
+       _pipeline.filter_step = FilterStep(process=Filterer(config=FiltererConfig(file_glob=[f"**/*{ext}" for ext in extension] if extension else None)), context=_context) if extension else None
+       return _pipeline

+     if not UnstructuredIngest._PIPELINE:
+       import random
+       import time
+       time.sleep(random.uniform(0.2, 1))
+       if not UnstructuredIngest._PIPELINE:
+         UnstructuredIngest._PIPELINE = _init_pipeline()
+
+     return _instance_pipeline()
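The new pipeline() builds one Pipeline lazily, caches it in the class-level _PIPELINE, and hands each caller a copy.deepcopy rewired with its own context, work dir, and indexer/downloader/filter steps; first-time races are staggered with a random sleep rather than a lock. A minimal sketch of that cache-and-copy pattern; PipelineFactory and _build_template are hypothetical stand-ins, and the sketch uses a lock where the diff sleeps:

import copy, threading

class PipelineFactory:
  _TEMPLATE = None              # class-level cache shared by all instances
  _LOCK = threading.Lock()

  def __init__(self, work_dir: str):
    self.work_dir = work_dir

  def _build_template(self) -> dict:
    # stands in for the expensive Pipeline.from_configs(...) call
    return {"context": {"work_dir": None}, "steps": []}

  def pipeline(self) -> dict:
    if PipelineFactory._TEMPLATE is None:
      with PipelineFactory._LOCK:
        if PipelineFactory._TEMPLATE is None:   # double-checked init
          PipelineFactory._TEMPLATE = self._build_template()
    # each caller mutates its own deep copy, never the shared template
    instance = copy.deepcopy(PipelineFactory._TEMPLATE)
    instance["context"]["work_dir"] = f"{self.work_dir}_unstructured"
    return instance

print(PipelineFactory("./tmp/job1").pipeline()["context"]["work_dir"])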
ws_bom_robot_app/llm/vector_store/integration/confluence.py
@@ -1,6 +1,6 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.confluence import ConfluenceIndexerConfig, ConfluenceDownloaderConfig, ConfluenceConnectionConfig, ConfluenceAccessConfig
+ from unstructured_ingest.processes.connectors.confluence import ConfluenceIndexerConfig, ConfluenceDownloaderConfig, ConfluenceConnectionConfig, ConfluenceAccessConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Optional, Union
ws_bom_robot_app/llm/vector_store/integration/dropbox.py
@@ -1,6 +1,6 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.fsspec.dropbox import DropboxConnectionConfig, DropboxAccessConfig, DropboxDownloaderConfig, DropboxIndexerConfig
+ from unstructured_ingest.processes.connectors.fsspec.dropbox import DropboxConnectionConfig, DropboxAccessConfig, DropboxDownloaderConfig, DropboxIndexerConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Union
ws_bom_robot_app/llm/vector_store/integration/gcs.py
@@ -1,6 +1,6 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.fsspec.gcs import GcsIndexerConfig, GcsConnectionConfig, GcsAccessConfig, GcsDownloaderConfig
+ from unstructured_ingest.processes.connectors.fsspec.gcs import GcsIndexerConfig, GcsConnectionConfig, GcsAccessConfig, GcsDownloaderConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Union, Optional
ws_bom_robot_app/llm/vector_store/integration/github.py
@@ -1,10 +1,12 @@
  import asyncio
  from typing import Optional, Union
- from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy
- from unstructured_ingest.interfaces import ProcessorConfig, ReadConfig
- from unstructured_ingest.connector.git import GitAccessConfig
- from unstructured_ingest.connector.github import SimpleGitHubConfig
- from unstructured_ingest.runner import GithubRunner
+ from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
+ from unstructured_ingest.processes.connectors.github import (
+   GithubIndexerConfig,
+   GithubDownloaderConfig,
+   GithubConnectionConfig,
+   GithubAccessConfig
+ )
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from pydantic import BaseModel, Field, AliasChoices
@@ -27,28 +29,26 @@ class Github(IntegrationStrategy):
    def __init__(self, knowledgebase_path: str, data: dict[str, Union[str,int,list]]):
      super().__init__(knowledgebase_path, data)
      self.__data = GithubParams.model_validate(self.data)
+     self.__unstructured_ingest = UnstructuredIngest(self.working_directory)
    def working_subdirectory(self) -> str:
      return 'github'
    def run(self) -> None:
-     access_config = GitAccessConfig(
-       access_token=self.__data.access_token
-     )
-     file_ext = self.__data.file_ext or None
-     file_glob = [f"**/*{ext}" for ext in file_ext] if file_ext else None
-     config = SimpleGitHubConfig(
-       url = self.__data.repo,
-       access_config=access_config,
+     indexer_config = GithubIndexerConfig(
        branch=self.__data.branch,
-       file_glob=file_glob
+       recursive=True
+     )
+     downloader_config = GithubDownloaderConfig(
+       download_dir=self.working_directory
+     )
+     connection_config = GithubConnectionConfig(
+       access_config=GithubAccessConfig(access_token=self.__data.access_token),
+       url=self.__data.repo
      )
-     runner = GithubRunner(
-       connector_config=config,
-       processor_config=ProcessorConfig(reprocess=False,verbose=False,num_processes=2,raise_on_error=False),
-       read_config=ReadConfig(download_dir=self.working_directory,re_download=True,preserve_downloads=True,download_only=True),
-       partition_config=None,
-       retry_strategy_config=None
-     )
-     runner.run()
+     self.__unstructured_ingest.pipeline(
+       indexer_config,
+       downloader_config,
+       connection_config,
+       extension=self.__data.file_ext).run()
    async def load(self) -> list[Document]:
      await asyncio.to_thread(self.run)
      await asyncio.sleep(1)
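After this migration the GitHub connector is driven like the other sources: three config objects handed to the shared UnstructuredIngest.pipeline(...). A sketch of the resulting call shape using only classes visible in this diff; the working directory, token, repository, and extension values are placeholders (the real ones come from GithubParams on the request):

from unstructured_ingest.processes.connectors.github import (
  GithubIndexerConfig,
  GithubDownloaderConfig,
  GithubConnectionConfig,
  GithubAccessConfig
)
from ws_bom_robot_app.llm.vector_store.integration.base import UnstructuredIngest

working_dir = "./tmp/github"                      # placeholder working directory
UnstructuredIngest(working_dir).pipeline(
  GithubIndexerConfig(branch="main", recursive=True),
  GithubDownloaderConfig(download_dir=working_dir),
  GithubConnectionConfig(
    access_config=GithubAccessConfig(access_token="<token>"),  # placeholder token
    url="owner/repo"                              # placeholder repository
  ),
  extension=[".md", ".py"]                        # becomes FiltererConfig file_glob patterns
).run()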
ws_bom_robot_app/llm/vector_store/integration/googledrive.py
@@ -1,6 +1,6 @@
  import asyncio
  from ws_bom_robot_app.llm.vector_store.integration.base import IntegrationStrategy, UnstructuredIngest
- from unstructured_ingest.v2.processes.connectors.google_drive import GoogleDriveConnectionConfig, GoogleDriveDownloaderConfig, GoogleDriveIndexerConfig, GoogleDriveAccessConfig
+ from unstructured_ingest.processes.connectors.google_drive import GoogleDriveConnectionConfig, GoogleDriveDownloaderConfig, GoogleDriveIndexerConfig, GoogleDriveAccessConfig
  from langchain_core.documents import Document
  from ws_bom_robot_app.llm.vector_store.loader.base import Loader
  from typing import Union