bioguider 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bioguider might be problematic. Click here for more details.

Files changed (47) hide show
  1. bioguider/__init__.py +0 -0
  2. bioguider/agents/__init__.py +0 -0
  3. bioguider/agents/agent_task.py +88 -0
  4. bioguider/agents/agent_tools.py +147 -0
  5. bioguider/agents/agent_utils.py +357 -0
  6. bioguider/agents/collection_execute_step.py +180 -0
  7. bioguider/agents/collection_observe_step.py +113 -0
  8. bioguider/agents/collection_plan_step.py +154 -0
  9. bioguider/agents/collection_task.py +179 -0
  10. bioguider/agents/collection_task_utils.py +109 -0
  11. bioguider/agents/common_agent.py +159 -0
  12. bioguider/agents/common_agent_2step.py +126 -0
  13. bioguider/agents/common_step.py +85 -0
  14. bioguider/agents/dockergeneration_execute_step.py +186 -0
  15. bioguider/agents/dockergeneration_observe_step.py +153 -0
  16. bioguider/agents/dockergeneration_plan_step.py +158 -0
  17. bioguider/agents/dockergeneration_task.py +158 -0
  18. bioguider/agents/dockergeneration_task_utils.py +220 -0
  19. bioguider/agents/evaluation_task.py +269 -0
  20. bioguider/agents/identification_execute_step.py +179 -0
  21. bioguider/agents/identification_observe_step.py +92 -0
  22. bioguider/agents/identification_plan_step.py +135 -0
  23. bioguider/agents/identification_task.py +220 -0
  24. bioguider/agents/identification_task_utils.py +18 -0
  25. bioguider/agents/peo_common_step.py +64 -0
  26. bioguider/agents/prompt_utils.py +190 -0
  27. bioguider/agents/python_ast_repl_tool.py +69 -0
  28. bioguider/agents/rag_collection_task.py +130 -0
  29. bioguider/conversation.py +67 -0
  30. bioguider/database/summarized_file_db.py +140 -0
  31. bioguider/managers/evaluation_manager.py +108 -0
  32. bioguider/rag/__init__.py +0 -0
  33. bioguider/rag/config.py +117 -0
  34. bioguider/rag/data_pipeline.py +648 -0
  35. bioguider/rag/embedder.py +24 -0
  36. bioguider/rag/rag.py +134 -0
  37. bioguider/settings.py +103 -0
  38. bioguider/utils/constants.py +40 -0
  39. bioguider/utils/default.gitignore +140 -0
  40. bioguider/utils/file_utils.py +126 -0
  41. bioguider/utils/gitignore_checker.py +175 -0
  42. bioguider/utils/pyphen_utils.py +73 -0
  43. bioguider/utils/utils.py +27 -0
  44. bioguider-0.2.3.dist-info/LICENSE +21 -0
  45. bioguider-0.2.3.dist-info/METADATA +44 -0
  46. bioguider-0.2.3.dist-info/RECORD +47 -0
  47. bioguider-0.2.3.dist-info/WHEEL +4 -0
@@ -0,0 +1,130 @@
1
+
2
+ import os
3
+ from adalflow import Document
4
+ from langchain_core.prompts import ChatPromptTemplate
5
+ from pydantic import BaseModel, Field
6
+
7
+ from .common_agent_2step import CommonAgentTwoSteps
8
+ from ..rag.rag import RAG
9
+
10
# Prompt template used to ask the LLM which of a batch of retrieved documents
# are relevant to a query. It has two placeholders, `{query}` and `{documents}`,
# and asks for one boolean per document, in document order.
RAG_COLLECT_SYSTEM_PROMPT = ChatPromptTemplate.from_template("""
You are an expert in repository documents retrieval and collection.
Your task is to collect relevant documents based on the user's query using the RAG system.
Here is the user's query:
{query}
The following are the documents extracted from the RAG system:
{documents}
Please analyze the documents one by one and determine which ones are relevant to the user's query.
Return a list of boolean values indicating the relevance of each document. Output example:
[True, False, True, ...] # True if the document is relevant, False otherwise
""")
21
+
22
class RAGCollectResult(BaseModel):
    """
    Represents the result of a RAG collection task.

    Attributes:
        query (str): The user's query.
        documents (list): List of documents retrieved from the RAG system.
        relevance (list): List of boolean values indicating the relevance of each document.
    """
    # All three fields are required (`...` means "no default").
    query: str = Field(..., description="The user's query")
    documents: list[str] = Field(..., description="List of documents retrieved from the RAG system")
    # One flag per document; presumably positionally aligned with `documents`
    # (the model itself does not enforce equal lengths).
    relevance: list[bool] = Field(..., description="List of boolean values indicating the relevance of each document")
34
+
35
# Hand-written JSON-schema dictionary titled 'RAGCollectResult'. It declares the
# same three required properties as the RAGCollectResult model (query,
# documents, relevance) with the same descriptions.
RAGCollectResultSchema = {
    'description': "Represents the result of a RAG collection task.\n\nAttributes:\n query (str): The user's query.\n documents (list): List of documents retrieved from the RAG system.\n relevance (list): List of boolean values indicating the relevance of each document.",
    'properties': {
        'query': {'description': "The user's query", 'title': 'Query', 'type': 'string'},
        'documents': {'description': 'List of documents retrieved from the RAG system', 'items': {'type': 'string'}, 'title': 'Documents', 'type': 'array'},
        'relevance': {'description': 'List of boolean values indicating the relevance of each document', 'items': {'type': 'boolean'}, 'title': 'Relevance', 'type': 'array'}
    },
    'required': [
        'query', 'documents', 'relevance'
    ],
    'title': 'RAGCollectResult',
    'type': 'object'
}
48
+
49
class RAGCollectionTaskItem:
    """Narrow a list of retrieved documents to those an LLM marks as relevant."""

    def __init__(self, llm, rag: RAG, step_callback, batch_size: int = 5):
        """
        Store the collaborators used by `collect`.

        Args:
            llm: Language model handed to `CommonAgentTwoSteps` for each batch.
            rag: An instance of the RAG class (stored on the instance; the
                methods of this class do not read it).
            step_callback: Callable invoked with `step_output=...` or
                `token_usage=...` keyword arguments after each batch.
            batch_size: Number of documents sent to the LLM per call.
        """
        self.step_callback = step_callback
        self.batch_size = batch_size
        self.rag = rag
        self.llm = llm

    def collect(self, query: str, rag_documents: list[Document]) -> list[Document]:
        """
        Ask the LLM, batch by batch, which documents are relevant to `query`.

        Args:
            query: The user's query inserted into the prompt.
            rag_documents: Candidate documents; only their `.text` is shown
                to the LLM.

        Returns:
            The documents flagged relevant, in their original order.
        """
        kept = []
        for start in range(0, len(rag_documents), self.batch_size):
            batch = rag_documents[start:start + self.batch_size]
            documents_text = "\n".join(' - ' + item.text for item in batch)
            rendered = RAG_COLLECT_SYSTEM_PROMPT.format(query=query, documents=documents_text)
            # Escape curly braces for LangChain: the rendered text is handed on
            # as a prompt again, so literal braces in documents must be doubled.
            rendered = rendered.replace("{", "{{").replace("}", "}}")
            agent = CommonAgentTwoSteps(llm=self.llm)
            res, _, token_usage, reasoning = agent.go(
                system_prompt=rendered,
                instruction_prompt="Please analyze the documents and determine their relevance to the query.",
                schema=RAGCollectResultSchema,
            )
            self.step_callback(step_output=f"**Reasoning Process**: {reasoning}\n")
            self.step_callback(step_output=f"**RAG Collection Result**: {res}")
            self.step_callback(token_usage=token_usage)
            # Validate the raw agent output before trusting its relevance flags.
            parsed = RAGCollectResult(**res)
            kept.extend(self._collect_documents(batch, parsed.relevance))
        return kept

    def _collect_documents(self, docs: list[Document], relevants: list[bool]) -> list[Document]:
        """
        Pair documents with relevance flags and keep the flagged ones.

        Pairing stops at the shorter of the two lists, so a document without
        a matching flag is not kept.

        Args:
            docs: List of documents to filter
            relevants: List of boolean values indicating relevance

        Returns:
            List of relevant documents
        """
        selected = []
        for doc, is_relevant in zip(docs, relevants):
            if is_relevant:
                selected.append(doc)
        return selected
104
+
105
+
106
+
107
class RAGCollectionTask:
    """Thin wrapper that forwards queries to a RAG instance."""

    def __init__(self, rag: RAG):
        """
        Keep a reference to the RAG instance used for retrieval.

        Args:
            rag: RAG instance whose `query_doc` method is called by `query`.
        """
        self.rag = rag

    def query(self, query: str) -> list:
        """
        Process a query using RAG.

        Args:
            query: The user's query

        Returns:
            Whatever `rag.query_doc(query)` returns (the retrieved documents).
        """
        retrieved_documents = self.rag.query_doc(query)
        return retrieved_documents
130
+
@@ -0,0 +1,67 @@
1
+
2
+ from abc import ABC, abstractmethod
3
+ from langchain_core.messages import BaseMessage
4
+ from langchain_deepseek import ChatDeepSeek
5
+ from openai import AuthenticationError
6
+ from pydantic import PositiveFloat, PositiveInt
7
+
8
class Conversation(ABC):
    """Abstract interface for a chat session with an LLM."""

    def __init__(self):
        super().__init__()

    @abstractmethod
    def chat(
        self,
        question: str,
        messages: "list[BaseMessage] | None" = None,
    ):
        """
        Chat with the LLM.

        Args:
            question: The user's question for this turn.
            messages: Optional earlier messages to send before the question.

        Subclasses must override this method. The `self` parameter was
        missing from the abstract declaration, which made the signature
        disagree with every concrete override.
        """
18
+
19
class DeepSeekConversation(Conversation):
    """
    Conversation backed by a `ChatDeepSeek` client.

    The client is created by `set_api_key`; until that succeeds `chatter`
    is None and `chat` returns an explanatory string instead of a message.
    """

    # Plain class-level defaults used when building the client.
    chatter: ChatDeepSeek | None = None
    model: str = "deepseek-chat"
    temperature: PositiveFloat = 0.1
    request_timeout: PositiveInt = 60
    base_url: str = "https://api.deepseek.com/v1"
    max_retries: PositiveInt = 3
    api_key: str | None = None

    def __init__(
        self,
    ):
        super().__init__()

    def set_api_key(self, key: str):
        """
        Build the DeepSeek client with `key` and verify it with a test call.

        Args:
            key: API key to authenticate with.

        Returns:
            True when the verification call succeeds; False when it fails
            with `AuthenticationError` (the client is then discarded).
            Any other exception propagates to the caller.
        """
        try:
            self.chatter = ChatDeepSeek(
                model=self.model,
                api_key=key,
                temperature=self.temperature,
                max_retries=self.max_retries,
                timeout=self.request_timeout,
                base_url=self.base_url,
            )
            # Verify the key with a minimal request; the reply is not needed.
            self.chatter.invoke(
                [("system", "Hi")]
            )
        except AuthenticationError:
            self.chatter = None
            self.api_key = None
            return False
        # Remember the verified key; the attribute was declared but never set.
        self.api_key = key
        return True

    def chat(
        self,
        question: str,
        messages: list[BaseMessage] = None
    ):
        """
        Send `question`, preceded by any earlier `messages`, to the model.

        Args:
            question: The user's question for this turn.
            messages: Optional earlier messages; the input is not mutated.

        Returns:
            The message returned by the client on success. On failure the
            error text is returned as a plain string (existing contract),
            so callers must check the type of the result.
        """
        if self.chatter is None:
            # Previously this surfaced as an opaque "'NoneType' object has no
            # attribute 'invoke'" string produced by the generic handler below.
            return "DeepSeek client is not initialized; call set_api_key() with a valid key first."

        msgs = [*(messages or []), ("user", question)]

        try:
            return self.chatter.invoke(msgs)
        except Exception as e:
            return str(e)
65
+
66
+
67
+
@@ -0,0 +1,140 @@
1
+
2
+ import sqlite3
3
+ from sqlite3 import Connection
4
+ import os
5
+ from time import strftime
6
+ from typing import Optional
7
+ import logging
8
+ from string import Template
9
+ import json
10
+
11
+ from bioguider.utils.constants import DEFAULT_TOKEN_USAGE
12
+
13
# NOTE: this rebinds the name `logging` from the stdlib module to a logger
# instance; later code in this file calls `logging.error(...)` on that logger.
logging = logging.getLogger(__name__)

SUMMARIZED_FILES_TABLE_NAME = "SummarizedFiles"

# One row per (file_path, instruction, summarize_level); the UNIQUE constraint
# is what the upsert's ON CONFLICT clause relies on.
summarized_files_create_table_query = f"""
CREATE TABLE IF NOT EXISTS {SUMMARIZED_FILES_TABLE_NAME} (
    file_path VARCHAR(512),
    instruction TEXT,
    summarize_level INTEGER,
    summarized_text TEXT,
    token_usage VARCHAR(512),
    datetime TEXT NOT NULL DEFAULT (strftime('%Y-%m-%d %H:%M:%f', 'now')),
    UNIQUE (file_path, instruction, summarize_level)
);
"""

# Insert a summary, or refresh the existing row for the same key. The update
# branch now also overwrites token_usage; before, a rewritten summary kept the
# token usage recorded for the previous text.
summarized_files_upsert_query = f"""
INSERT INTO {SUMMARIZED_FILES_TABLE_NAME}(file_path, instruction, summarize_level, summarized_text, token_usage, datetime)
VALUES (?, ?, ?, ?, ?, strftime('%Y-%m-%d %H:%M:%f', 'now'))
ON CONFLICT(file_path, instruction, summarize_level) DO UPDATE SET summarized_text=excluded.summarized_text,
token_usage=excluded.token_usage,
datetime=strftime('%Y-%m-%d %H:%M:%f', 'now');
"""

# Look up the stored summary (and its timestamp) for one key.
summarized_files_select_query = f"""
SELECT summarized_text, datetime FROM {SUMMARIZED_FILES_TABLE_NAME}
where file_path = ? and instruction = ? and summarize_level = ?;
"""
38
+
39
+ class SummarizedFilesDb:
40
+ def __init__(self, author: str, repo_name: str):
41
+ self.author = author
42
+ self.repo_name = repo_name
43
+ self.connection: Connection | None = None
44
+
45
+ def _ensure_tables(self) -> bool:
46
+ if self.connection is None:
47
+ return False
48
+ try:
49
+ cursor = self.connection.cursor()
50
+ cursor.execute(
51
+ summarized_files_create_table_query
52
+ )
53
+ self.connection.commit()
54
+ return True
55
+ except Exception as e:
56
+ logging.error(e)
57
+ return False
58
+
59
+ def _connect_to_db(self) -> bool:
60
+ if self.connection is not None:
61
+ return True
62
+ db_path = os.environ.get("DATA_FOLDER", "./data")
63
+ db_path = os.path.join(db_path, "databases")
64
+ # Ensure the local path exists
65
+ try:
66
+ os.makedirs(db_path, exist_ok=True)
67
+ except Exception as e:
68
+ logging.error(e)
69
+ return False
70
+ db_path = os.path.join(db_path, f"{self.author}_{self.repo_name}.db")
71
+ if not os.path.exists(db_path):
72
+ try:
73
+ with open(db_path, "w"):
74
+ pass
75
+ except Exception as e:
76
+ logging.error(e)
77
+ return False
78
+ self.connection = sqlite3.connect(db_path)
79
+ return True
80
+
81
+ def upsert_summarized_file(
82
+ self,
83
+ file_path: str,
84
+ instruction: str,
85
+ summarize_level: int,
86
+ summarized_text: str,
87
+ token_usage: dict | None = None
88
+ ):
89
+ token_usage = token_usage if token_usage is not None else {**DEFAULT_TOKEN_USAGE}
90
+ token_usage = json.dumps(token_usage)
91
+ res = self._connect_to_db()
92
+ assert res
93
+ res = self._ensure_tables()
94
+ assert res
95
+ try:
96
+ cursor = self.connection.cursor()
97
+ cursor.execute(
98
+ summarized_files_upsert_query,
99
+ (file_path, instruction, summarize_level, summarized_text, token_usage, )
100
+ )
101
+ self.connection.commit()
102
+ return True
103
+ except Exception as e:
104
+ logging.error(e)
105
+ return False
106
+ finally:
107
+ self.connection.close()
108
+ self.connection = None
109
+
110
+ def select_summarized_text(
111
+ self,
112
+ file_path: str,
113
+ instruction: str,
114
+ summarize_level: int,
115
+ ) -> str | None:
116
+ self._connect_to_db()
117
+ self._ensure_tables()
118
+ try:
119
+ cursor = self.connection.cursor()
120
+ cursor.execute(
121
+ summarized_files_select_query,
122
+ (file_path, instruction, summarize_level,)
123
+ )
124
+ row = cursor.fetchone()
125
+ if row is None:
126
+ return None
127
+ return row[0]
128
+ except Exception as e:
129
+ logging.error(e)
130
+ return None
131
+ finally:
132
+ self.connection.close()
133
+ self.connection = None
134
+
135
+ def get_db_file(self):
136
+ db_path = os.environ.get("DATA_FOLDER", "./data")
137
+ db_path = os.path.join(db_path, f"{self.author}_{self.repo_name}.db")
138
+ return db_path
139
+
140
+
@@ -0,0 +1,108 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ from bioguider.agents.prompt_utils import CollectionGoalItemEnum
5
+ from bioguider.utils.constants import ProjectMetadata
6
+ from bioguider.utils.gitignore_checker import GitignoreChecker
7
+
8
+ from ..agents.identification_task import IdentificationTask
9
+ from ..rag.rag import RAG
10
+ from ..utils.file_utils import parse_repo_url
11
+ from ..database.summarized_file_db import SummarizedFilesDb
12
+ from ..agents.evaluation_task import EvaluationREADMETask
13
+ from ..agents.collection_task import CollectionTask
14
+
15
class EvaluationManager:
    """
    Drive the evaluation workflow for one repository.

    Typical order: `prepare_repo`, then `identify_project`, then the
    `evaluate_*` methods, which read state set by the earlier calls.
    """

    def __init__(self, llm, step_callback):
        """
        Args:
            llm: Language model passed to every task.
            step_callback: Progress callback passed to every task.
        """
        self.rag = None
        self.llm = llm
        self.step_callback = step_callback
        self.repo_url: str | None = None
        self.project_metadata: ProjectMetadata | None = None
        # Assigned by prepare_repo(); declared here so the attribute always exists.
        self.summary_file_db: SummarizedFilesDb | None = None

    def prepare_repo(self, repo_url: str):
        """Set up the RAG retriever and the summary database for `repo_url`."""
        self.repo_url = repo_url
        self.rag = RAG()
        self.rag.initialize_db_manager()
        self.rag.prepare_retriever(repo_url_or_path=repo_url)

        author, repo_name = parse_repo_url(repo_url)
        self.summary_file_db = SummarizedFilesDb(author, repo_name)

    def identify_project(self) -> ProjectMetadata:
        """
        Identify language, project type and metadata of the prepared repo.

        Returns:
            The `ProjectMetadata` built from the identification results; it
            is also stored on `self.project_metadata`. Metadata keys that
            are absent default to an empty string.
        """
        repo_path = self.rag.repo_dir
        gitignore_path = Path(repo_path, ".gitignore")

        identification_task = IdentificationTask(
            llm=self.llm,
            step_callback=self.step_callback,
        )
        identification_task.compile(
            repo_path=repo_path,
            gitignore_path=gitignore_path,
            db=self.summary_file_db,
        )
        language = identification_task.identify_primary_language()
        project_type = identification_task.identify_project_type()
        meta_data = identification_task.identify_meta_data()

        fields = {
            key: (meta_data[key] if key in meta_data else "")
            for key in ("name", "description", "owner", "license")
        }
        self.project_metadata = ProjectMetadata(
            url=self.repo_url,
            project_type=project_type,
            primary_language=language,
            repo_name=fields["name"],
            description=fields["description"],
            owner=fields["owner"],
            license=fields["license"],
        )
        return self.project_metadata

    def evaluate_readme(self):
        """Evaluate the README files found in the repo and return the results."""
        task = EvaluationREADMETask(
            llm=self.llm,
            repo_path=self.rag.repo_dir,
            gitignore_path=Path(self.rag.repo_dir, ".gitignore"),
            meta_data=self.project_metadata,
            step_callback=self.step_callback,
        )
        readme_files = self._find_readme_files()
        results = task.evaluate(readme_files)
        return results

    def evaluate_tutorial(self):
        """
        Run the tutorial collection task.

        Returns:
            The task's `final_answer`, or None when the task produced no
            result or no `final_answer`.
        """
        task = CollectionTask(
            llm=self.llm,
            step_callback=self.step_callback,
        )
        task.compile(
            repo_path=self.rag.repo_dir,
            gitignore_path=Path(self.rag.repo_dir, ".gitignore"),
            db=self.summary_file_db,
            goal_item=CollectionGoalItemEnum.Tutorial.name,
        )
        s = task.collect()
        if s is None or 'final_answer' not in s:
            return None
        # NOTE(review): the method used to fall off the end here and return
        # None on every path, discarding the collected result. Confirm that
        # `final_answer` is the value callers want.
        return s['final_answer']

    def _find_readme_files(self) -> list[str]:
        """
        Search the repository for README files.

        A file matches when its name, lower-cased, is one of `readme.md`,
        `readme.rst`, `readme.txt` or `readme`. The search goes through
        `GitignoreChecker`, configured with the repo's `.gitignore`.
        """
        possible_readme_files = [
            "readme.md",
            "readme.rst",
            "readme.txt",
            "readme",
        ]
        repo_path = self.rag.repo_dir
        gitignore_path = Path(repo_path, ".gitignore")
        # Use the local repo_path: `self.repo_path` is never defined on this
        # class, so the previous code raised AttributeError here.
        gitignore_checker = GitignoreChecker(
            directory=repo_path, gitignore_path=gitignore_path
        )
        found_readme_files = gitignore_checker.check_files_and_folders(
            check_file_cb=lambda root_dir, relative_path: Path(relative_path).name.lower() in possible_readme_files,
        )

        return found_readme_files
107
+
108
+
File without changes
@@ -0,0 +1,117 @@
1
+ import os
2
+ from typing import List
3
+ from adalflow import GoogleGenAIClient
4
+ from adalflow.components.model_client.openai_client import OpenAIClient
5
+ from adalflow.components.model_client.azureai_client import AzureAIClient
6
+
7
+
8
# Directory patterns excluded by default, grouped by purpose.
DEFAULT_EXCLUDED_DIRS: List[str] = [
    # Virtual environments and package managers
    "./.venv/",
    "./venv/",
    "./env/",
    "./virtualenv/",
    "./node_modules/",
    "./bower_components/",
    "./jspm_packages/",
    # Version control
    "./.git/",
    "./.svn/",
    "./.hg/",
    "./.bzr/",
    # Cache and compiled files
    "./__pycache__/",
    "./.pytest_cache/",
    "./.mypy_cache/",
    "./.ruff_cache/",
    "./.coverage/",
    # Build and distribution
    "./dist/",
    "./build/",
    "./out/",
    "./target/",
    "./bin/",
    "./obj/",
    # Documentation
    "./docs/",
    "./_docs/",
    "./site-docs/",
    "./_site/",
    # IDE specific
    "./.idea/",
    "./.vscode/",
    "./.vs/",
    "./.eclipse/",
    "./.settings/",
    # Logs and temporary files
    "./logs/",
    "./log/",
    "./tmp/",
    "./temp/",
]

# No individual files are excluded by default.
DEFAULT_EXCLUDED_FILES: List[str] = []
28
+
29
# Central configuration dictionary, one section per component.
configs = {
    # Embedding model settings; `model_client` holds the client class itself,
    # not an instance.
    "embedder": {
        "batch_size": 500,
        "model_client": OpenAIClient,
        "model_kwargs": {
            "model": "text-embedding-3-small",
            "dimensions": 256,
            "encoding_format": "float",
        },
    },
    "retriever": {
        "top_k": 20,
    },
    "generator": {
        "model_client": GoogleGenAIClient,
        "model_kwargs": {
            "model": "gemini-2.5-flash-preview-04-17",
            "temperature": 0.7,
            "top_p": 0.8,
        },
    },
    "text_splitter": {
        "split_by": "word",
        "chunk_size": 350,
        "chunk_overlap": 100,
    },
    "file_filters": {
        # Same entries as DEFAULT_EXCLUDED_DIRS except that "./docs/" is not
        # listed here.
        "excluded_dirs": [
            "./.venv/", "./venv/", "./env/", "./virtualenv/",
            "./node_modules/", "./bower_components/", "./jspm_packages/",
            "./.git/", "./.svn/", "./.hg/", "./.bzr/",
            "./__pycache__/", "./.pytest_cache/", "./.mypy_cache/", "./.ruff_cache/", "./.coverage/",
            "./dist/", "./build/", "./out/", "./target/", "./bin/", "./obj/",
            "./_docs/", "./site-docs/", "./_site/",
            "./.idea/", "./.vscode/", "./.vs/", "./.eclipse/", "./.settings/",
            "./logs/", "./log/", "./tmp/", "./temp/",
        ],
        # Mix of exact file names and glob-style patterns ("*.ext").
        "excluded_files": [
            "package-lock.json", "yarn.lock", "pnpm-lock.yaml", "npm-shrinkwrap.json",
            "poetry.lock", "Pipfile.lock", "requirements.txt.lock", "Cargo.lock", "composer.lock",
            ".lock", ".DS_Store", "Thumbs.db", "desktop.ini", "*.lnk",
            ".env", ".env.*", "*.env", "*.cfg", "*.ini", ".flaskenv",
            ".gitignore", ".gitattributes", ".gitmodules", ".github", ".gitlab-ci.yml",
            ".prettierrc", ".eslintrc", ".eslintignore", ".stylelintrc", ".editorconfig",
            ".jshintrc", ".pylintrc", ".flake8", "mypy.ini", "pyproject.toml",
            "tsconfig.json", "webpack.config.js", "babel.config.js", "rollup.config.js",
            "jest.config.js", "karma.conf.js", "vite.config.js", "next.config.js",
            "*.min.js", "*.min.css", "*.bundle.js", "*.bundle.css",
            "*.map", "*.gz", "*.zip", "*.tar", "*.tgz", "*.rar",
            "*.pyc", "*.pyo", "*.pyd", "*.so", "*.dll", "*.class", "*.exe", "*.o", "*.a",
            "*.jpg", "*.jpeg", "*.png", "*.gif", "*.ico", "*.svg", "*.webp",
            "*.mp3", "*.mp4", "*.wav", "*.avi", "*.mov", "*.webm",
            "*.csv", "*.tsv", "*.xls", "*.xlsx", "*.db", "*.sqlite", "*.sqlite3",
            "*.pdf", "*.docx", "*.pptx",
        ],
    },
    "repository": {
        # Maximum repository size in MB
        "size_limit_mb": 50000,
    },
}
90
+
91
def get_embedder_config():
    """Return the ``"embedder"`` section of the module-level ``configs`` dict."""
    embedder_section = configs["embedder"]
    return embedder_section
93
+
94
def create_model_client():
    """
    Build the model client selected by the ``OPENAI_API_TYPE`` variable.

    Returns:
        An ``AzureAIClient`` configured from ``OPENAI_API_KEY``,
        ``OPENAI_API_VERSION`` and ``AZURE_OPENAI_ENDPOINT`` when
        ``OPENAI_API_TYPE`` is exactly ``"azure"``; otherwise a default
        ``OpenAIClient``.
    """
    # An unset variable yields None, which is simply "not azure".
    if os.environ.get("OPENAI_API_TYPE") == "azure":
        return AzureAIClient(
            api_key=os.environ.get("OPENAI_API_KEY"),
            api_version=os.environ.get("OPENAI_API_VERSION"),
            azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
        )
    return OpenAIClient()
104
def create_model_kwargs():
    """
    Build the embedding model keyword arguments for the active API type.

    Returns:
        A dict with ``model``, ``dimensions`` (256) and ``encoding_format``
        (``"float"``). ``model`` is ``"text-embedding-3-small"`` unless
        ``OPENAI_API_TYPE`` is exactly ``"azure"``, in which case it is the
        value of ``OPENAI_TEXT_EMBEDDING_DEPLOYMENT_NAME`` (None when unset).
    """
    model_name = "text-embedding-3-small"
    if os.environ.get("OPENAI_API_TYPE") == "azure":
        model_name = os.environ.get("OPENAI_TEXT_EMBEDDING_DEPLOYMENT_NAME")
    return {
        "model": model_name,
        "dimensions": 256,
        "encoding_format": "float",
    }