mfcli 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. mfcli/.env.example +72 -0
  2. mfcli/__init__.py +0 -0
  3. mfcli/agents/__init__.py +0 -0
  4. mfcli/agents/controller/__init__.py +0 -0
  5. mfcli/agents/controller/agent.py +19 -0
  6. mfcli/agents/controller/config.yaml +27 -0
  7. mfcli/agents/controller/tools.py +42 -0
  8. mfcli/agents/tools/general.py +118 -0
  9. mfcli/alembic/env.py +61 -0
  10. mfcli/alembic/script.py.mako +28 -0
  11. mfcli/alembic/versions/6ccc0c7c397c_added_fields_to_pdf_parts_model.py +39 -0
  12. mfcli/alembic/versions/769019ef4870_added_gemini_file_path_to_pdf_part_model.py +33 -0
  13. mfcli/alembic/versions/7a2e3a779fdc_added_functional_block_and_component_.py +54 -0
  14. mfcli/alembic/versions/7d5adb2a47a7_added_pdf_parts_model.py +41 -0
  15. mfcli/alembic/versions/7fcb7d6a5836_init.py +167 -0
  16. mfcli/alembic/versions/e0f2b5765c72_added_cascade_delete_for_models_that_.py +32 -0
  17. mfcli/alembic.ini +147 -0
  18. mfcli/cli/__init__.py +0 -0
  19. mfcli/cli/dependencies.py +59 -0
  20. mfcli/cli/main.py +192 -0
  21. mfcli/client/__init__.py +0 -0
  22. mfcli/client/chroma_db.py +184 -0
  23. mfcli/client/docling.py +44 -0
  24. mfcli/client/gemini.py +252 -0
  25. mfcli/client/llama_parse.py +38 -0
  26. mfcli/client/vector_db.py +93 -0
  27. mfcli/constants/__init__.py +0 -0
  28. mfcli/constants/base_enum.py +18 -0
  29. mfcli/constants/directory_names.py +1 -0
  30. mfcli/constants/file_types.py +189 -0
  31. mfcli/constants/gemini.py +1 -0
  32. mfcli/constants/openai.py +6 -0
  33. mfcli/constants/pipeline_run_status.py +3 -0
  34. mfcli/crud/__init__.py +0 -0
  35. mfcli/crud/file.py +42 -0
  36. mfcli/crud/functional_blocks.py +26 -0
  37. mfcli/crud/netlist.py +18 -0
  38. mfcli/crud/pipeline_run.py +17 -0
  39. mfcli/crud/project.py +99 -0
  40. mfcli/digikey/__init__.py +0 -0
  41. mfcli/digikey/digikey.py +105 -0
  42. mfcli/main.py +5 -0
  43. mfcli/mcp/__init__.py +0 -0
  44. mfcli/mcp/configs/cline_mcp_settings.json +11 -0
  45. mfcli/mcp/configs/mfcli.mcp.json +7 -0
  46. mfcli/mcp/mcp_instance.py +6 -0
  47. mfcli/mcp/server.py +37 -0
  48. mfcli/mcp/state_manager.py +51 -0
  49. mfcli/mcp/tools/__init__.py +0 -0
  50. mfcli/mcp/tools/query_knowledgebase.py +108 -0
  51. mfcli/models/__init__.py +10 -0
  52. mfcli/models/base.py +10 -0
  53. mfcli/models/bom.py +71 -0
  54. mfcli/models/datasheet.py +10 -0
  55. mfcli/models/debug_setup.py +64 -0
  56. mfcli/models/file.py +43 -0
  57. mfcli/models/file_docket.py +94 -0
  58. mfcli/models/file_metadata.py +19 -0
  59. mfcli/models/functional_blocks.py +94 -0
  60. mfcli/models/llm_response.py +5 -0
  61. mfcli/models/mcu.py +97 -0
  62. mfcli/models/mcu_errata.py +26 -0
  63. mfcli/models/netlist.py +59 -0
  64. mfcli/models/pdf_parts.py +25 -0
  65. mfcli/models/pipeline_run.py +34 -0
  66. mfcli/models/project.py +27 -0
  67. mfcli/models/project_metadata.py +15 -0
  68. mfcli/pipeline/__init__.py +0 -0
  69. mfcli/pipeline/analysis/__init__.py +0 -0
  70. mfcli/pipeline/analysis/bom_netlist_mapper.py +28 -0
  71. mfcli/pipeline/analysis/generators/__init__.py +0 -0
  72. mfcli/pipeline/analysis/generators/bom/__init__.py +0 -0
  73. mfcli/pipeline/analysis/generators/bom/bom.py +74 -0
  74. mfcli/pipeline/analysis/generators/debug_setup/__init__.py +0 -0
  75. mfcli/pipeline/analysis/generators/debug_setup/debug_setup.py +71 -0
  76. mfcli/pipeline/analysis/generators/debug_setup/instructions.py +150 -0
  77. mfcli/pipeline/analysis/generators/functional_blocks/__init__.py +0 -0
  78. mfcli/pipeline/analysis/generators/functional_blocks/functional_blocks.py +93 -0
  79. mfcli/pipeline/analysis/generators/functional_blocks/instructions.py +34 -0
  80. mfcli/pipeline/analysis/generators/functional_blocks/validator.py +94 -0
  81. mfcli/pipeline/analysis/generators/generator.py +258 -0
  82. mfcli/pipeline/analysis/generators/generator_base.py +18 -0
  83. mfcli/pipeline/analysis/generators/mcu/__init__.py +0 -0
  84. mfcli/pipeline/analysis/generators/mcu/instructions.py +156 -0
  85. mfcli/pipeline/analysis/generators/mcu/mcu.py +84 -0
  86. mfcli/pipeline/analysis/generators/mcu_errata/__init__.py +1 -0
  87. mfcli/pipeline/analysis/generators/mcu_errata/instructions.py +77 -0
  88. mfcli/pipeline/analysis/generators/mcu_errata/mcu_errata.py +95 -0
  89. mfcli/pipeline/analysis/generators/summary/__init__.py +0 -0
  90. mfcli/pipeline/analysis/generators/summary/summary.py +47 -0
  91. mfcli/pipeline/classifier.py +93 -0
  92. mfcli/pipeline/data_enricher.py +15 -0
  93. mfcli/pipeline/extractor.py +34 -0
  94. mfcli/pipeline/extractors/__init__.py +0 -0
  95. mfcli/pipeline/extractors/pdf.py +12 -0
  96. mfcli/pipeline/parser.py +120 -0
  97. mfcli/pipeline/parsers/__init__.py +0 -0
  98. mfcli/pipeline/parsers/netlist/__init__.py +0 -0
  99. mfcli/pipeline/parsers/netlist/edif.py +93 -0
  100. mfcli/pipeline/parsers/netlist/kicad_legacy_net.py +326 -0
  101. mfcli/pipeline/parsers/netlist/kicad_spice.py +135 -0
  102. mfcli/pipeline/parsers/netlist/pads.py +185 -0
  103. mfcli/pipeline/parsers/netlist/protel.py +166 -0
  104. mfcli/pipeline/parsers/netlist/protel_detector.py +29 -0
  105. mfcli/pipeline/pipeline.py +419 -0
  106. mfcli/pipeline/preprocessors/__init__.py +0 -0
  107. mfcli/pipeline/preprocessors/user_guide.py +127 -0
  108. mfcli/pipeline/run_context.py +32 -0
  109. mfcli/pipeline/schema_mapper.py +89 -0
  110. mfcli/pipeline/sub_classifier.py +115 -0
  111. mfcli/utils/__init__.py +0 -0
  112. mfcli/utils/config.py +33 -0
  113. mfcli/utils/configurator.py +324 -0
  114. mfcli/utils/data_cleaner.py +82 -0
  115. mfcli/utils/datasheet_vectorizer.py +281 -0
  116. mfcli/utils/directory_manager.py +96 -0
  117. mfcli/utils/file_upload.py +298 -0
  118. mfcli/utils/files.py +16 -0
  119. mfcli/utils/http_requests.py +54 -0
  120. mfcli/utils/kb_lister.py +89 -0
  121. mfcli/utils/kb_remover.py +173 -0
  122. mfcli/utils/logger.py +28 -0
  123. mfcli/utils/mcp_configurator.py +311 -0
  124. mfcli/utils/migrations.py +18 -0
  125. mfcli/utils/orm.py +43 -0
  126. mfcli/utils/pdf_splitter.py +63 -0
  127. mfcli/utils/query_service.py +22 -0
  128. mfcli/utils/system_check.py +306 -0
  129. mfcli/utils/tools.py +31 -0
  130. mfcli/utils/vectorizer.py +28 -0
  131. mfcli-0.2.0.dist-info/METADATA +841 -0
  132. mfcli-0.2.0.dist-info/RECORD +136 -0
  133. mfcli-0.2.0.dist-info/WHEEL +5 -0
  134. mfcli-0.2.0.dist-info/entry_points.txt +3 -0
  135. mfcli-0.2.0.dist-info/licenses/LICENSE +21 -0
  136. mfcli-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,184 @@
1
+ from typing import Mapping, List
2
+
3
+ import chromadb
4
+ import tiktoken
5
+ import unicodedata
6
+ from chromadb import SparseVector
7
+ from chromadb.utils import embedding_functions
8
+ from pydantic import BaseModel
9
+
10
+ from mfcli.constants.openai import (
11
+ OPENAI_ENCODING_MODEL,
12
+ OPENAI_MAX_ENCODING_REQUEST_TOKENS,
13
+ OPENAI_MAX_TOKENS_PER_CHUNK
14
+ )
15
+ from mfcli.crud.project import get_project_by_name
16
+ from mfcli.utils.config import get_config
17
+ from mfcli.utils.directory_manager import app_dirs
18
+ from mfcli.utils.logger import get_logger
19
+ from mfcli.utils.orm import Session
20
+
21
+ logger = get_logger(__name__)
22
+
23
+ ChunkMetadata = Mapping[str, str | int | float | bool | SparseVector | None]
24
+
25
+
26
class VectorDBChunk(BaseModel):
    """One vector-store record: id, document text, metadata and an optional embedding."""

    # Identifier of the chunk inside its collection.
    id: str
    # Text content of the chunk.
    document: str
    # Metadata mapping stored alongside the document.
    metadata: ChunkMetadata
    # Embedding vector; None when no embedding has been attached.
    embedding: list[float] | None = None
31
+
32
+
33
class ChromaClient:
    """Wrapper around one persistent ChromaDB collection that embeds with OpenAI.

    The collection is opened (or created) under ``app_dirs.chroma_db_dir`` and
    is configured with an OpenAI embedding function built from the application
    config.
    """

    def __init__(self, index_name: str):
        """Open or create the collection called ``index_name``."""
        self._index_name = index_name
        self._config = get_config()
        self._client = chromadb.PersistentClient(
            path=app_dirs.chroma_db_dir
        )
        openai_ef = embedding_functions.OpenAIEmbeddingFunction(
            api_key=self._config.openai_api_key,
            model_name=self._config.embedding_model
        )
        self._collection = self._client.get_or_create_collection(
            name=index_name,
            embedding_function=openai_ef
        )
        # Tokenizer used to measure chunk sizes against the token limits.
        self._enc = tiktoken.get_encoding(OPENAI_ENCODING_MODEL)

    def delete_collection(self):
        """Delete this client's collection from the persistent store."""
        self._client.delete_collection(self._index_name)

    @staticmethod
    def _sanitize_chunk(text: str):
        """Return ``text`` stripped, with control characters removed.

        Every character in Unicode category ``Cc`` is dropped except newline,
        tab and carriage return. This is the same rule that
        ``_validate_chunk_for_embedding`` enforces, so a sanitized chunk is
        never rejected for a control character (DEL and the C1 controls
        U+0080-U+009F are ``Cc`` too and must go as well).

        Raises:
            TypeError: if ``text`` is not a string.
        """
        if not isinstance(text, str):
            raise TypeError(f"Chunk is not a string: {type(text)}")

        cleaned = ''.join(
            ch for ch in text
            if ch in "\n\t\r" or unicodedata.category(ch) != "Cc"
        )
        return cleaned.strip()

    def _validate_chunk_for_embedding(self, text: str) -> None:
        """Raise if ``text`` cannot be embedded.

        Raises:
            TypeError: if ``text`` is not a string.
            ValueError: if ``text`` is blank, cannot be encoded as UTF-8,
                contains a control character other than newline, tab or
                carriage return, or is longer than
                ``OPENAI_MAX_TOKENS_PER_CHUNK`` tokens.
        """
        if not isinstance(text, str):
            raise TypeError(f"Chunk is not a string: {type(text)}")

        if not text.strip():
            raise ValueError("Chunk is empty or whitespace only")

        try:
            text.encode("utf-8")
        except UnicodeEncodeError as e:
            raise ValueError(f"Chunk contains invalid Unicode: {e}") from e

        # Check for illegal control characters (other than \n, \t, \r)
        for ch in text:
            if unicodedata.category(ch) == "Cc" and ch not in "\n\t\r":
                raise ValueError(f"Chunk contains control character: {repr(ch)}")

        # Check token length
        token_count = len(self._enc.encode(text))
        if token_count > OPENAI_MAX_TOKENS_PER_CHUNK:
            raise ValueError(f"Chunk too long: {token_count} tokens (> {OPENAI_MAX_TOKENS_PER_CHUNK})")

    def _batch_chunks(self, chunks: List[VectorDBChunk]) -> List[List[VectorDBChunk]]:
        """Sanitize, validate and group chunks into token-bounded batches.

        Each accepted chunk's ``document`` is replaced in place by its
        sanitized text. Chunks that fail sanitizing or validation are logged
        and skipped. A batch is closed as soon as adding the next chunk would
        push its token total above ``OPENAI_MAX_ENCODING_REQUEST_TOKENS``.

        Returns:
            The batches in input order; empty when no chunk was accepted.
        """
        batches = []
        current_batch = []
        total_tokens = 0
        failed_count = 0

        for chunk in chunks:
            try:
                text = self._sanitize_chunk(chunk.document)
                self._validate_chunk_for_embedding(text)
                chunk.document = text
            except (TypeError, ValueError) as e:
                failed_count += 1
                logger.warning(f"Chunk validation failed ({type(e).__name__}): {str(e)}")
                logger.debug(f"Failed chunk preview: {repr(chunk.document[:100])}")
                continue

            chunk_tokens = len(self._enc.encode(chunk.document))
            # Only a non-empty batch is flushed, so an oversized first chunk
            # never produces an empty batch.
            if current_batch and (total_tokens + chunk_tokens > OPENAI_MAX_ENCODING_REQUEST_TOKENS):
                batches.append(current_batch)
                current_batch = []
                total_tokens = 0
            current_batch.append(chunk)
            total_tokens += chunk_tokens

        if current_batch:
            batches.append(current_batch)

        if failed_count > 0:
            logger.warning(f"Failed to process {failed_count} out of {len(chunks)} chunks during batching")

        return batches

    def add(self, chunks: list[VectorDBChunk]):
        """Add chunks to the collection, one validated batch at a time.

        Pre-generated embeddings are forwarded only when every chunk of a
        batch carries one; otherwise the batch is added without embeddings and
        ChromaDB generates them.

        Raises:
            Exception: whatever the collection raises for a failing batch,
                re-raised after the batch details have been logged.
        """
        logger.debug(f"Adding {len(chunks)} embeddings")
        chunk_batches = self._batch_chunks(chunks)

        if not chunk_batches:
            logger.warning("No valid chunks to add after batching - all chunks failed validation")
            return

        valid_chunk_count = sum(len(batch) for batch in chunk_batches)
        logger.debug(
            f"Processed {len(chunks)} chunks into {len(chunk_batches)} batches ({valid_chunk_count} valid chunks)")

        for batch_idx, batch in enumerate(chunk_batches):
            if not batch:
                logger.warning("Skipping empty batch")
                continue
            try:
                logger.debug(f"Adding batch {batch_idx + 1}/{len(chunk_batches)} with {len(batch)} chunks")

                payload = {
                    "ids": [chunk.id for chunk in batch],
                    "documents": [chunk.document for chunk in batch],
                    "metadatas": [chunk.metadata for chunk in batch],
                }
                # Checking every chunk (not just the first) keeps a partially
                # embedded batch from sending None entries as embeddings.
                if all(chunk.embedding is not None for chunk in batch):
                    payload["embeddings"] = [chunk.embedding for chunk in batch]
                self._collection.add(**payload)
                logger.debug(f"Batch {batch_idx + 1}/{len(chunk_batches)} added successfully")
            except Exception:
                # The batch is known to be non-empty here (empty ones are skipped above).
                logger.error(f"Failed to add batch {batch_idx + 1}/{len(chunk_batches)}")
                logger.error(
                    f"Batch details: {len(batch)} chunks, first chunk length: {len(batch[0].document)}")
                logger.error(f"First chunk preview: {batch[0].document[:200]}")
                raise
        logger.debug("All embeddings added successfully")

    def query(self, text: str, n_results: int = 8) -> list[VectorDBChunk]:
        """Return the chunks the collection ranks closest to ``text``.

        Args:
            text: Query text.
            n_results: Maximum number of results requested from the collection.

        Returns:
            The matches for the single query text, without embeddings.
        """
        logger.debug(f"Querying vector DB: {text}")
        results = self._collection.query(
            query_texts=[text],
            n_results=n_results
        )
        logger.debug(f"Query results: {results}")
        # One query text was sent, so each result list holds one inner list.
        return [
            VectorDBChunk(
                id=chunk_id,
                document=results["documents"][0][i],
                metadata=results["metadatas"][0][i]
            )
            for i, chunk_id in enumerate(results["ids"][0])
        ]
180
+
181
+
182
def get_chromadb_client_for_project_name(db: Session, project_name: str) -> ChromaClient:
    """Build a ChromaClient for the collection named by the project's ``index_id``.

    The project is looked up by name through ``get_project_by_name``.
    """
    index_name = get_project_by_name(db, project_name).index_id
    return ChromaClient(index_name)
@@ -0,0 +1,44 @@
1
+ from io import BytesIO
2
+ from typing import List
3
+
4
+ import tiktoken
5
+ from docling.document_converter import DocumentConverter
6
+ from docling_core.transforms.chunker import HybridChunker
7
+ from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer
8
+ from docling_core.types.io import DocumentStream
9
+
10
+ from mfcli.constants.openai import OPENAI_ENCODING_MODEL
11
+ from mfcli.utils.config import get_config
12
+ from mfcli.utils.logger import get_logger
13
+
14
+ logger = get_logger(__name__)
15
+
16
+
17
class DoclingChunker:
    """Converts a document with Docling and splits it into contextualized text chunks."""

    def __init__(self):
        """Create the Docling converter and load the application config."""
        self._converter = DocumentConverter()
        self._config = get_config()

    def chunk(self, file_name: str, file_bytes: bytes) -> List[str]:
        """Convert the given file bytes and return its chunks as strings.

        Args:
            file_name: Name given to the in-memory document stream.
            file_bytes: Raw content of the document.

        Returns:
            One contextualized string per chunk produced by the hybrid
            chunker, each limited to the configured ``chunk_tokens``.
        """
        logger.debug(f"DoclingChunker: chunking document: {file_name}")
        stream = DocumentStream(
            name=file_name,
            stream=BytesIO(file_bytes)
        )
        doc = self._converter.convert(stream).document
        tokenizer = OpenAITokenizer(
            tokenizer=tiktoken.get_encoding(OPENAI_ENCODING_MODEL),
            max_tokens=self._config.chunk_tokens
        )
        chunker = HybridChunker(
            tokenizer=tokenizer,
            max_tokens=self._config.chunk_tokens,
            merge_peers=True
        )
        # chunker.chunk() hands back an iterator; consume it before reporting
        # completion so the log line reflects work that has actually been done.
        chunks = [chunker.contextualize(chunk) for chunk in chunker.chunk(dl_doc=doc)]
        logger.debug(f"DoclingChunker: chunking complete: {file_name}")
        return chunks
mfcli/client/gemini.py ADDED
@@ -0,0 +1,252 @@
1
+ import asyncio
2
+ import os
3
+ import traceback
4
+ from pathlib import Path
5
+ from typing import Type, Literal, List
6
+
7
+ from google import genai
8
+ from google.genai.client import AsyncClient
9
+ from google.genai.types import GenerateContentConfig, HttpRetryOptionsDict, HttpOptions, File
10
+ from pydantic import BaseModel, ValidationError
11
+ from typing_extensions import TypeVar
12
+
13
+ from mfcli.agents.tools.general import format_instructions
14
+ from mfcli.utils.config import get_config
15
+ from mfcli.utils.logger import get_logger
16
+
17
+ logger = get_logger(__name__)
18
+
19
+ T = TypeVar(name='T', bound=BaseModel)
20
+
21
+ GeminiSupportedModels = Literal['gemini-2.5-flash', 'gemini-2.5-pro', 'gemini-3-pro-preview']
22
+ DefaultGeminiModel = 'gemini-2.5-pro'
23
+
24
+
25
class GeminiFileEntity(BaseModel):
    """A local file path paired with the MIME type to use when uploading it."""

    # Location of the file on disk.
    path: Path
    # MIME type passed along in the upload config.
    mime_type: str
28
+
29
+
30
+ GeminiFileInput = GeminiFileEntity | str | Path
31
+
32
+
33
class Gemini:
    """Async Gemini client that returns responses parsed into Pydantic models."""

    def __init__(self):
        """Create the async google-genai client from the configured API key."""
        self._config = get_config()
        self._client: AsyncClient = genai.Client(api_key=self._config.google_api_key).aio

    @staticmethod
    def _get_request_config(
            timeout: int,
            instructions: str,
            response_model: Type[T]
    ) -> GenerateContentConfig:
        """Build the request config for JSON output matching ``response_model``'s schema.

        Args:
            timeout: Request timeout; multiplied by 1000 before it is handed
                to ``HttpOptions`` (seconds to milliseconds).
            instructions: System instruction for the model.
            response_model: Pydantic model whose JSON schema constrains the output.
        """
        retry_options = HttpRetryOptionsDict(
            attempts=3,
            initial_delay=1,
            max_delay=10,
            exp_base=2
        )
        http_options = HttpOptions(
            retry_options=retry_options,
            timeout=timeout * 1000
        )
        return GenerateContentConfig(
            system_instruction=instructions,
            response_mime_type="application/json",
            response_json_schema=response_model.model_json_schema(),
            http_options=http_options
        )

    @staticmethod
    def _file_access_check(file_path: str):
        """Raise ``ValueError`` if ``file_path`` does not exist or is not readable."""
        file_path_obj = Path(file_path)

        # Validate file exists and is readable
        if not file_path_obj.exists():
            raise ValueError(f"File does not exist: {file_path}")
        if not os.access(file_path_obj, os.R_OK):
            raise ValueError(f"File is not readable: {file_path}")

    async def upload(self, file: GeminiFileInput) -> File:
        """Upload a local file through the client's files API.

        A ``GeminiFileEntity`` supplies its own MIME type in the upload
        config; a plain ``str``/``Path`` is uploaded without a config.

        Raises:
            ValueError: if the file does not exist or is not readable.
        """
        config = None
        if isinstance(file, GeminiFileEntity):
            file_path = str(file.path)
            config = {"mime_type": file.mime_type}
        else:
            file_path = str(file)
        self._file_access_check(file_path)
        return await self._client.files.upload(
            file=file_path,
            config=config
        )

    async def _generate_once(
            self,
            prompt: str,
            instructions: str,
            response_model: Type[T],
            model: GeminiSupportedModels,
            files: List[File] | None = None,
            timeout: int = 60
    ) -> str:
        """Send one generation request and return the raw response text.

        Raises:
            ValueError: if the response carries no text, so callers never
                try to parse ``None`` as JSON.
        """
        contents = [prompt]
        if files:
            contents += files
        response = await self._client.models.generate_content(
            model=str(model),
            contents=contents,
            config=self._get_request_config(timeout, instructions, response_model),
        )
        text = response.text
        if text is None:
            raise ValueError("Gemini returned a response with no text content")
        return text

    async def _generate_with_retry(
            self,
            prompt: str,
            instructions: str,
            response_model: Type[T],
            model: GeminiSupportedModels,
            files: List[File] | None = None,
            timeout: int = 60
    ) -> T:
        """Generate and parse a response, retrying up to three times.

        Within each attempt, a response that fails schema validation gets one
        follow-up request asking the model to correct it. Any other failure,
        or a corrected response that is still invalid, ends the attempt; the
        next attempt starts after a growing delay.

        Raises:
            RuntimeError: when every attempt failed; chained to the last error.
        """
        attempts = 3
        backoff = 1.5
        delay = 1.0
        last_err = None

        for attempt in range(1, attempts + 1):
            try:
                # --- Normal generation ---
                raw = await self._generate_once(
                    prompt=prompt,
                    instructions=instructions,
                    response_model=response_model,
                    model=model,
                    files=files,
                    timeout=timeout
                )

                try:
                    return response_model.model_validate_json(raw)
                except ValidationError as ve:
                    # --- Second chance: ask the model to fix its output ---
                    # This is sent as a fresh request, so the original prompt
                    # is repeated to give the model the task it must answer.
                    fix_prompt = (
                        "Your previous response did not match the required JSON schema.\n\n"
                        f"Original request:\n{prompt}\n\n"
                        f"Validation error:\n{ve}\n\n"
                        f"Invalid response:\n{raw}\n\n"
                        "Please correct the response so that it validates successfully."
                    )

                    corrected_raw = await self._generate_once(
                        prompt=fix_prompt,
                        instructions=instructions,
                        response_model=response_model,
                        model=model,
                        files=files,
                        timeout=timeout
                    )

                    # Still-invalid output raises here and falls through to
                    # the handler below, which schedules the next attempt.
                    return response_model.model_validate_json(corrected_raw)

            except Exception as e:
                last_err = e
                if attempt == attempts:
                    break

                logger.debug(f"[Gemini retry] Attempt {attempt}/{attempts} failed: {e}")
                await asyncio.sleep(delay)
                delay *= backoff

        raise RuntimeError(
            f"Gemini generate_with_retry failed after {attempts} attempts"
        ) from last_err

    async def generate_and_validate_with(
            self,
            prompt: str,
            instructions: str,
            response_model: Type[T],
            validation_func,
            model: GeminiSupportedModels = DefaultGeminiModel,
            files: List[File] | None = None,
            timeout: int = 60
    ) -> T:
        """Generate a response and check it with ``validation_func``, retrying once.

        ``validation_func`` is called with the parsed response and signals an
        invalid result by raising. On the first failure the model is asked
        again with the original prompt, the validator's traceback and its
        previous output.

        Raises:
            RuntimeError: when the second response also fails validation.
        """
        original_user_prompt = prompt

        async def run_generation(p: str) -> T:
            return await self.generate(
                prompt=p,
                instructions=instructions,
                response_model=response_model,
                model=model,
                files=files,
                timeout=timeout
            )

        # --- First attempt ---
        resp: T = await run_generation(original_user_prompt)

        try:
            validation_func(resp)
            return resp
        except Exception:
            first_error = traceback.format_exc()

        # --- Retry attempt ---
        retry_prompt = format_instructions(
            f"""
            You previously generated an invalid response.
            Correct it.

            User Prompt:
            {original_user_prompt}

            Error raised by validator:
            {first_error}

            Your previous output:
            {resp}
            """
        )

        resp_retry: T = await run_generation(retry_prompt)

        try:
            validation_func(resp_retry)
            return resp_retry
        except Exception as e:
            second_error = traceback.format_exc()
            raise RuntimeError(
                f"Model failed validation twice.\n"
                f"First error:\n{first_error}\n\n"
                f"Second error:\n{second_error}\n\n"
                f"Last model output:\n{resp_retry}"
            ) from e

    async def generate(
            self,
            prompt: str,
            instructions: str,
            response_model: Type[T],
            model: GeminiSupportedModels = DefaultGeminiModel,
            files: List[File] | None = None,
            timeout: int = 60
    ) -> T:
        """Generate a response and return it parsed as ``response_model``.

        Raises:
            RuntimeError: when generation fails after all retries.
        """
        logger.debug(f"Generating for model: {response_model}")
        parsed_response = await self._generate_with_retry(
            prompt=prompt,
            instructions=instructions,
            response_model=response_model,
            model=model,
            files=files,
            timeout=timeout
        )
        logger.debug(f"Finished generating for model: {response_model}")
        return parsed_response
@@ -0,0 +1,38 @@
1
+ from llama_cloud_services import LlamaParse
2
+ from llama_index.core import Document
3
+
4
+ from mfcli.utils.config import get_config
5
+ from mfcli.utils.logger import get_logger
6
+
7
+ logger = get_logger(__name__)
8
+
9
+
10
class LlamaParseClient:
    """Thin wrapper around LlamaParse that returns a file's content as markdown text."""

    def __init__(self):
        """Configure the parser from the application config."""
        self._config = get_config()
        self._parser = LlamaParse(
            api_key=self._config.llama_cloud_api_key,
            result_type="markdown",
            use_vendor_multimodal_model=True,
            vendor_multimodal_model_name="openai-gpt-5",
            vendor_multimodal_api_key=self._config.openai_api_key,
            verbose=True,
            invalidate_cache=False,
            ignore_errors=False
        )
        self._parser.parsing_instruction = None

    def parse(self, file_name: str, file_bytes: bytes) -> str:
        """Parse the file bytes and return the concatenated text of all documents.

        Args:
            file_name: Name passed to the parser as ``extra_info["file_name"]``.
            file_bytes: Raw content of the file.

        Raises:
            Exception: any error raised by the parser, re-raised unchanged
                after it has been logged with its traceback.
        """
        logger.debug(f"Parsing file: {file_name}")
        extra_info = {"file_name": file_name}
        try:
            documents: list[Document] = self._parser.load_data(file_bytes, extra_info=extra_info)
            # join() avoids rebuilding the string once per document.
            text = "".join(document.text for document in documents)
        except Exception:
            logger.exception(f"Error parsing file: {file_name}")
            # Bare raise keeps the original traceback intact.
            raise
        logger.debug(f"Document text extracted: {file_name}")
        return text
@@ -0,0 +1,93 @@
1
+ from typing import List
2
+ from uuid import uuid4
3
+
4
+ import tiktoken
5
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
6
+ from openai import OpenAI
7
+
8
+ from mfcli.client.chroma_db import ChromaClient, ChunkMetadata, VectorDBChunk
9
+ from mfcli.utils.config import get_config
10
+ from mfcli.utils.logger import get_logger
11
+
12
+ logger = get_logger(__name__)
13
+
14
+
15
class DocumentVectorizer:
    """Splits text into chunks, embeds them with OpenAI and stores them in ChromaDB."""

    def __init__(self, chroma_db: ChromaClient):
        """Create the OpenAI client and the text splitter from the application config.

        Args:
            chroma_db: Client of the collection the vectors are written to.
        """
        self._config = get_config()
        self._client: OpenAI = OpenAI(api_key=self._config.openai_api_key)
        self._splitter = RecursiveCharacterTextSplitter(
            chunk_size=self._config.chunk_size,
            chunk_overlap=self._config.chunk_overlap,
            length_function=len
        )
        self._chroma_db = chroma_db

    def _batch_texts(self, texts: list[str]) -> list[list[str]]:
        """Group ``texts`` into batches bounded by item count and token total.

        A batch holds at most 2048 texts and is closed before a text that
        would push its token total above 300000. Input order is preserved and
        no empty batch is ever produced; a single text above the token limit
        ends up alone in its own batch.
        """
        # Per-request bounds applied to each embeddings call.
        max_request_chunks = 2048
        max_request_tokens = 300000

        encoding = tiktoken.encoding_for_model(self._config.embedding_model)

        batches = []
        batch = []
        total_tokens = 0
        for text in texts:
            tokens = len(encoding.encode(text))
            batch_is_full = (
                len(batch) >= max_request_chunks
                or total_tokens + tokens > max_request_tokens
            )
            # Flush only a non-empty batch: an over-limit first text must not
            # emit an empty batch, which would become an empty API request.
            if batch and batch_is_full:
                batches.append(batch)
                batch = []
                total_tokens = 0
            batch.append(text)
            total_tokens += tokens
        if batch:
            batches.append(batch)
        return batches

    def _get_embeddings(self, texts: list[str]) -> list[list[float]]:
        """Return one embedding per text, in input order, requested batch by batch."""
        embeddings = []
        for batch in self._batch_texts(texts):
            response = self._client.embeddings.create(
                model=self._config.embedding_model,
                input=batch
            )
            embeddings += [row.embedding for row in response.data]
        return embeddings

    def _chunk_document(self, text: str) -> list[str]:
        """Split ``text`` with the configured recursive character splitter."""
        return self._splitter.split_text(text)

    def vectorize_chunks(self, chunks: List[str], metadata: ChunkMetadata) -> list[VectorDBChunk]:
        """Embed ``chunks``, store them in ChromaDB and return the stored records.

        Every chunk gets a fresh hex UUID as id and shares the same
        ``metadata``. An empty ``chunks`` list returns ``[]`` without calling
        OpenAI or ChromaDB.

        Raises:
            ValueError: if the number of embeddings returned differs from the
                number of chunks.
        """
        if not chunks:
            logger.debug("No chunks to vectorize")
            return []

        # Generate embeddings ourselves instead of letting ChromaDB do it
        logger.debug("Generating embeddings")
        embeddings = self._get_embeddings(chunks)
        logger.debug(f"Generated {len(embeddings)} embeddings")

        # strict=True turns a count mismatch into an error instead of silently
        # dropping the chunks that have no embedding.
        vectors = [
            VectorDBChunk(
                id=uuid4().hex,
                document=chunk,
                metadata=metadata,
                embedding=embedding
            )
            for chunk, embedding in zip(chunks, embeddings, strict=True)
        ]
        logger.debug("Adding vectors")
        self._chroma_db.add(vectors)
        logger.debug("Vectors added")
        return vectors

    def vectorize(self, text: str, metadata: ChunkMetadata) -> list[VectorDBChunk]:
        """Split ``text`` into chunks, then embed and store them via ``vectorize_chunks``."""
        logger.debug("Vectorize document")
        chunks = self._chunk_document(text)
        logger.debug(f"Document split into {len(chunks)} chunks")

        return self.vectorize_chunks(chunks, metadata)
File without changes
@@ -0,0 +1,18 @@
1
+ from enum import IntEnum
2
+
3
+
4
+ class BaseEnum(IntEnum):
5
+
6
+ @classmethod
7
+ def get(cls, name: str) -> int | None:
8
+ try:
9
+ return cls[name].value
10
+ except KeyError:
11
+ return None
12
+
13
+ @classmethod
14
+ def name_from_value(cls, value: int) -> str | None:
15
+ member = cls._value2member_map_.get(value)
16
+ return member.name if member else None
17
+
18
+
@@ -0,0 +1 @@
1
+ MF_PROJECT_CONFIG_DIR_NAME = ".multifactor"