PyPI - mfcli - Versions diffs - 0.2.0__py3-none-any.whl - Mend

mfcli 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (136) hide show

mfcli/.env.example +72 -0
mfcli/__init__.py +0 -0
mfcli/agents/__init__.py +0 -0
mfcli/agents/controller/__init__.py +0 -0
mfcli/agents/controller/agent.py +19 -0
mfcli/agents/controller/config.yaml +27 -0
mfcli/agents/controller/tools.py +42 -0
mfcli/agents/tools/general.py +118 -0
mfcli/alembic/env.py +61 -0
mfcli/alembic/script.py.mako +28 -0
mfcli/alembic/versions/6ccc0c7c397c_added_fields_to_pdf_parts_model.py +39 -0
mfcli/alembic/versions/769019ef4870_added_gemini_file_path_to_pdf_part_model.py +33 -0
mfcli/alembic/versions/7a2e3a779fdc_added_functional_block_and_component_.py +54 -0
mfcli/alembic/versions/7d5adb2a47a7_added_pdf_parts_model.py +41 -0
mfcli/alembic/versions/7fcb7d6a5836_init.py +167 -0
mfcli/alembic/versions/e0f2b5765c72_added_cascade_delete_for_models_that_.py +32 -0
mfcli/alembic.ini +147 -0
mfcli/cli/__init__.py +0 -0
mfcli/cli/dependencies.py +59 -0
mfcli/cli/main.py +192 -0
mfcli/client/__init__.py +0 -0
mfcli/client/chroma_db.py +184 -0
mfcli/client/docling.py +44 -0
mfcli/client/gemini.py +252 -0
mfcli/client/llama_parse.py +38 -0
mfcli/client/vector_db.py +93 -0
mfcli/constants/__init__.py +0 -0
mfcli/constants/base_enum.py +18 -0
mfcli/constants/directory_names.py +1 -0
mfcli/constants/file_types.py +189 -0
mfcli/constants/gemini.py +1 -0
mfcli/constants/openai.py +6 -0
mfcli/constants/pipeline_run_status.py +3 -0
mfcli/crud/__init__.py +0 -0
mfcli/crud/file.py +42 -0
mfcli/crud/functional_blocks.py +26 -0
mfcli/crud/netlist.py +18 -0
mfcli/crud/pipeline_run.py +17 -0
mfcli/crud/project.py +99 -0
mfcli/digikey/__init__.py +0 -0
mfcli/digikey/digikey.py +105 -0
mfcli/main.py +5 -0
mfcli/mcp/__init__.py +0 -0
mfcli/mcp/configs/cline_mcp_settings.json +11 -0
mfcli/mcp/configs/mfcli.mcp.json +7 -0
mfcli/mcp/mcp_instance.py +6 -0
mfcli/mcp/server.py +37 -0
mfcli/mcp/state_manager.py +51 -0
mfcli/mcp/tools/__init__.py +0 -0
mfcli/mcp/tools/query_knowledgebase.py +108 -0
mfcli/models/__init__.py +10 -0
mfcli/models/base.py +10 -0
mfcli/models/bom.py +71 -0
mfcli/models/datasheet.py +10 -0
mfcli/models/debug_setup.py +64 -0
mfcli/models/file.py +43 -0
mfcli/models/file_docket.py +94 -0
mfcli/models/file_metadata.py +19 -0
mfcli/models/functional_blocks.py +94 -0
mfcli/models/llm_response.py +5 -0
mfcli/models/mcu.py +97 -0
mfcli/models/mcu_errata.py +26 -0
mfcli/models/netlist.py +59 -0
mfcli/models/pdf_parts.py +25 -0
mfcli/models/pipeline_run.py +34 -0
mfcli/models/project.py +27 -0
mfcli/models/project_metadata.py +15 -0
mfcli/pipeline/__init__.py +0 -0
mfcli/pipeline/analysis/__init__.py +0 -0
mfcli/pipeline/analysis/bom_netlist_mapper.py +28 -0
mfcli/pipeline/analysis/generators/__init__.py +0 -0
mfcli/pipeline/analysis/generators/bom/__init__.py +0 -0
mfcli/pipeline/analysis/generators/bom/bom.py +74 -0
mfcli/pipeline/analysis/generators/debug_setup/__init__.py +0 -0
mfcli/pipeline/analysis/generators/debug_setup/debug_setup.py +71 -0
mfcli/pipeline/analysis/generators/debug_setup/instructions.py +150 -0
mfcli/pipeline/analysis/generators/functional_blocks/__init__.py +0 -0
mfcli/pipeline/analysis/generators/functional_blocks/functional_blocks.py +93 -0
mfcli/pipeline/analysis/generators/functional_blocks/instructions.py +34 -0
mfcli/pipeline/analysis/generators/functional_blocks/validator.py +94 -0
mfcli/pipeline/analysis/generators/generator.py +258 -0
mfcli/pipeline/analysis/generators/generator_base.py +18 -0
mfcli/pipeline/analysis/generators/mcu/__init__.py +0 -0
mfcli/pipeline/analysis/generators/mcu/instructions.py +156 -0
mfcli/pipeline/analysis/generators/mcu/mcu.py +84 -0
mfcli/pipeline/analysis/generators/mcu_errata/__init__.py +1 -0
mfcli/pipeline/analysis/generators/mcu_errata/instructions.py +77 -0
mfcli/pipeline/analysis/generators/mcu_errata/mcu_errata.py +95 -0
mfcli/pipeline/analysis/generators/summary/__init__.py +0 -0
mfcli/pipeline/analysis/generators/summary/summary.py +47 -0
mfcli/pipeline/classifier.py +93 -0
mfcli/pipeline/data_enricher.py +15 -0
mfcli/pipeline/extractor.py +34 -0
mfcli/pipeline/extractors/__init__.py +0 -0
mfcli/pipeline/extractors/pdf.py +12 -0
mfcli/pipeline/parser.py +120 -0
mfcli/pipeline/parsers/__init__.py +0 -0
mfcli/pipeline/parsers/netlist/__init__.py +0 -0
mfcli/pipeline/parsers/netlist/edif.py +93 -0
mfcli/pipeline/parsers/netlist/kicad_legacy_net.py +326 -0
mfcli/pipeline/parsers/netlist/kicad_spice.py +135 -0
mfcli/pipeline/parsers/netlist/pads.py +185 -0
mfcli/pipeline/parsers/netlist/protel.py +166 -0
mfcli/pipeline/parsers/netlist/protel_detector.py +29 -0
mfcli/pipeline/pipeline.py +419 -0
mfcli/pipeline/preprocessors/__init__.py +0 -0
mfcli/pipeline/preprocessors/user_guide.py +127 -0
mfcli/pipeline/run_context.py +32 -0
mfcli/pipeline/schema_mapper.py +89 -0
mfcli/pipeline/sub_classifier.py +115 -0
mfcli/utils/__init__.py +0 -0
mfcli/utils/config.py +33 -0
mfcli/utils/configurator.py +324 -0
mfcli/utils/data_cleaner.py +82 -0
mfcli/utils/datasheet_vectorizer.py +281 -0
mfcli/utils/directory_manager.py +96 -0
mfcli/utils/file_upload.py +298 -0
mfcli/utils/files.py +16 -0
mfcli/utils/http_requests.py +54 -0
mfcli/utils/kb_lister.py +89 -0
mfcli/utils/kb_remover.py +173 -0
mfcli/utils/logger.py +28 -0
mfcli/utils/mcp_configurator.py +311 -0
mfcli/utils/migrations.py +18 -0
mfcli/utils/orm.py +43 -0
mfcli/utils/pdf_splitter.py +63 -0
mfcli/utils/query_service.py +22 -0
mfcli/utils/system_check.py +306 -0
mfcli/utils/tools.py +31 -0
mfcli/utils/vectorizer.py +28 -0
mfcli-0.2.0.dist-info/METADATA +841 -0
mfcli-0.2.0.dist-info/RECORD +136 -0
mfcli-0.2.0.dist-info/WHEEL +5 -0
mfcli-0.2.0.dist-info/entry_points.txt +3 -0
mfcli-0.2.0.dist-info/licenses/LICENSE +21 -0
mfcli-0.2.0.dist-info/top_level.txt +1 -0

mfcli/client/chroma_db.py ADDED Viewed

@@ -0,0 +1,184 @@
+from typing import Mapping, List
+import chromadb
+import tiktoken
+import unicodedata
+from chromadb import SparseVector
+from chromadb.utils import embedding_functions
+from pydantic import BaseModel
+from mfcli.constants.openai import (
+    OPENAI_ENCODING_MODEL,
+    OPENAI_MAX_ENCODING_REQUEST_TOKENS,
+    OPENAI_MAX_TOKENS_PER_CHUNK
+)
+from mfcli.crud.project import get_project_by_name
+from mfcli.utils.config import get_config
+from mfcli.utils.directory_manager import app_dirs
+from mfcli.utils.logger import get_logger
+from mfcli.utils.orm import Session
+# Module-level logger named after this module.
+logger = get_logger(__name__)
+# Metadata attached to a stored chunk: string keys mapped to a scalar, a SparseVector, or None.
+ChunkMetadata = Mapping[str, str | int | float | bool | SparseVector | None]
+class VectorDBChunk(BaseModel):
+    """One document chunk as written to, or read back from, the vector DB collection."""
+    # Record id handed to the collection.
+    id: str
+    # Chunk text.
+    document: str
+    # Metadata stored alongside the chunk.
+    metadata: ChunkMetadata
+    # Pre-computed embedding; when None, ChromaClient.add sends the document without embeddings.
+    embedding: list[float] | None = None
+class ChromaClient:
+    """Wrapper around one persistent ChromaDB collection.
+    The collection lives under ``app_dirs.chroma_db_dir`` and is created with an OpenAI
+    embedding function; a tiktoken encoder is kept for token counting.
+    """
+    def __init__(self, index_name: str):
+        """Open the collection called ``index_name``, creating it if it does not exist."""
+        self._index_name = index_name
+        self._config = get_config()
+        self._client = chromadb.PersistentClient(
+            path=app_dirs.chroma_db_dir
+        )
+        openai_ef = embedding_functions.OpenAIEmbeddingFunction(
+            api_key=self._config.openai_api_key,
+            model_name=self._config.embedding_model
+        )
+        self._collection = self._client.get_or_create_collection(
+            name=index_name,
+            embedding_function=openai_ef
+        )
+        self._enc = tiktoken.get_encoding(OPENAI_ENCODING_MODEL)
+    def delete_collection(self):
+        """Delete this client's collection from the ChromaDB store."""
+        self._client.delete_collection(self._index_name)
+    @staticmethod
+    def _sanitize_chunk(text: str) -> str:
+        """Return ``text`` without control characters and without surrounding whitespace.
+        Newline, tab and carriage return are kept. Every other character of Unicode
+        category "Cc" is removed, which includes DEL (0x7F) and the C1 range (0x80-0x9F)
+        that ``_validate_chunk_for_embedding`` would otherwise reject.
+        Raises:
+            TypeError: if ``text`` is not a string.
+        """
+        if not isinstance(text, str):
+            raise TypeError(f"Chunk is not a string: {type(text)}")
+        # Use the same "Cc" test as the validator so a sanitized chunk never fails it.
+        text = ''.join(
+            ch for ch in text
+            if ch in "\n\t\r" or unicodedata.category(ch) != "Cc"
+        )
+        return text.strip()
+    def _validate_chunk_for_embedding(self, text: str) -> None:
+        """
+        Raises an error if the chunk cannot be embedded.
+        Raises:
+            TypeError: if ``text`` is not a string.
+            ValueError: if ``text`` is blank, is not UTF-8 encodable, contains a control
+                character other than newline/tab/carriage return, or is longer than
+                ``OPENAI_MAX_TOKENS_PER_CHUNK`` tokens.
+        """
+        if not isinstance(text, str):
+            raise TypeError(f"Chunk is not a string: {type(text)}")
+        if not text.strip():
+            raise ValueError("Chunk is empty or whitespace only")
+        try:
+            text.encode("utf-8")
+        except UnicodeEncodeError as e:
+            raise ValueError(f"Chunk contains invalid Unicode: {e}") from e
+        # Check for illegal control characters (other than \n, \t, \r)
+        for ch in text:
+            if unicodedata.category(ch) == "Cc" and ch not in "\n\t\r":
+                raise ValueError(f"Chunk contains control character: {repr(ch)}")
+        # Check token length
+        token_count = len(self._enc.encode(text))
+        if token_count > OPENAI_MAX_TOKENS_PER_CHUNK:
+            raise ValueError(f"Chunk too long: {token_count} tokens (> {OPENAI_MAX_TOKENS_PER_CHUNK})")
+    def _batch_chunks(self, chunks: List[VectorDBChunk]) -> List[List[VectorDBChunk]]:
+        """Sanitize and validate ``chunks`` and group the valid ones into batches.
+        A batch is closed before its token total would exceed
+        ``OPENAI_MAX_ENCODING_REQUEST_TOKENS``. Chunks that fail validation are logged
+        and skipped. Each valid chunk's ``document`` is replaced by its sanitized text.
+        """
+        batches = []
+        current_batch = []
+        total_tokens = 0
+        failed_count = 0
+        for chunk in chunks:
+            try:
+                text = self._sanitize_chunk(chunk.document)
+                self._validate_chunk_for_embedding(text)
+                chunk.document = text
+            except (TypeError, ValueError) as e:
+                failed_count += 1
+                logger.warning(f"Chunk validation failed ({type(e).__name__}): {str(e)}")
+                # Slicing already copes with documents shorter than 100 characters.
+                logger.debug(f"Failed chunk preview: {repr(chunk.document[:100])}")
+                continue
+            chunk_tokens = len(self._enc.encode(chunk.document))
+            # Only close a batch that holds something, so no empty batch is ever emitted.
+            if current_batch and (total_tokens + chunk_tokens > OPENAI_MAX_ENCODING_REQUEST_TOKENS):
+                batches.append(current_batch)
+                current_batch = []
+                total_tokens = 0
+            current_batch.append(chunk)
+            total_tokens += chunk_tokens
+        if current_batch:
+            batches.append(current_batch)
+        if failed_count > 0:
+            logger.warning(f"Failed to process {failed_count} out of {len(chunks)} chunks during batching")
+        return batches
+    def add(self, chunks: list[VectorDBChunk]):
+        """Add ``chunks`` to the collection batch by batch.
+        Invalid chunks are dropped by ``_batch_chunks``. If nothing valid remains, a
+        warning is logged and nothing is written. Any error raised while adding a batch
+        is logged with batch details and re-raised.
+        """
+        logger.debug(f"Adding {len(chunks)} embeddings")
+        chunk_batches = self._batch_chunks(chunks)
+        if not chunk_batches:
+            logger.warning("No valid chunks to add after batching - all chunks failed validation")
+            return
+        valid_chunk_count = sum(len(batch) for batch in chunk_batches)
+        logger.debug(
+            f"Processed {len(chunks)} chunks into {len(chunk_batches)} batches ({valid_chunk_count} valid chunks)")
+        for batch_idx, batch in enumerate(chunk_batches):
+            if not batch:
+                logger.warning("Skipping empty batch")
+                continue
+            try:
+                logger.debug(f"Adding batch {batch_idx + 1}/{len(chunk_batches)} with {len(batch)} chunks")
+                payload = {
+                    "ids": [chunk.id for chunk in batch],
+                    "documents": [chunk.document for chunk in batch],
+                    "metadatas": [chunk.metadata for chunk in batch],
+                }
+                # If embeddings are pre-generated, use them; otherwise let ChromaDB generate them.
+                # The first chunk decides for the whole batch.
+                if batch[0].embedding is not None:
+                    payload["embeddings"] = [chunk.embedding for chunk in batch]
+                self._collection.add(**payload)
+                logger.debug(f"Batch {batch_idx + 1}/{len(chunk_batches)} added successfully")
+            except Exception:
+                # The batch is known to be non-empty here, so batch[0] is safe to inspect.
+                logger.error(f"Failed to add batch {batch_idx + 1}/{len(chunk_batches)}")
+                logger.error(
+                    f"Batch details: {len(batch)} chunks, first chunk length: {len(batch[0].document)}")
+                logger.error(f"First chunk preview: {batch[0].document[:200]}")
+                raise
+        logger.debug("All embeddings added successfully")
+    def query(self, text: str, n_results: int = 8) -> list[VectorDBChunk]:
+        """Return the chunks the collection reports for the query ``text``.
+        Args:
+            text: query text passed to the collection.
+            n_results: maximum number of results requested (defaults to 8).
+        Returns:
+            One ``VectorDBChunk`` per returned id, without embeddings.
+        """
+        logger.debug(f"Querying vector DB: {text}")
+        results = self._collection.query(
+            query_texts=[text],
+            n_results=n_results
+        )
+        logger.debug(f"Query results: {results}")
+        return [
+            VectorDBChunk(
+                id=chunk_id,
+                document=results["documents"][0][i],
+                metadata=results["metadatas"][0][i]
+            )
+            for i, chunk_id in enumerate(results["ids"][0])
+        ]
+def get_chromadb_client_for_project_name(db: Session, project_name: str) -> ChromaClient:
+    """Build a ChromaClient for the collection named by the project's ``index_id``."""
+    index_name = get_project_by_name(db, project_name).index_id
+    return ChromaClient(index_name)

mfcli/client/docling.py ADDED Viewed

@@ -0,0 +1,44 @@
+from io import BytesIO
+from typing import List
+import tiktoken
+from docling.document_converter import DocumentConverter
+from docling_core.transforms.chunker import HybridChunker
+from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer
+from docling_core.types.io import DocumentStream
+from mfcli.constants.openai import OPENAI_ENCODING_MODEL
+from mfcli.utils.config import get_config
+from mfcli.utils.logger import get_logger
+logger = get_logger(__name__)
+class DoclingChunker:
+    """Converts a document with Docling and splits it into contextualized text chunks."""
+    def __init__(self):
+        """Create the Docling converter and load the application config."""
+        self._converter = DocumentConverter()
+        self._config = get_config()
+    def chunk(self, file_name: str, file_bytes: bytes) -> List[str]:
+        """Convert the document and return its chunks as strings.
+        Args:
+            file_name: name given to the in-memory document stream (also used in log lines).
+            file_bytes: raw document content.
+        Returns:
+            The result of ``chunker.contextualize`` for every chunk, in chunker order.
+        """
+        logger.debug(f"DoclingChunker: chunking document: {file_name}")
+        stream = DocumentStream(
+            name=file_name,
+            stream=BytesIO(file_bytes)
+        )
+        doc = self._converter.convert(stream).document
+        tokenizer = OpenAITokenizer(
+            tokenizer=tiktoken.get_encoding(OPENAI_ENCODING_MODEL),
+            max_tokens=self._config.chunk_tokens
+        )
+        chunker = HybridChunker(
+            tokenizer=tokenizer,
+            max_tokens=self._config.chunk_tokens,
+            merge_peers=True
+        )
+        # Consume the chunk iterator before reporting completion, so the log line is
+        # only written once the chunks have actually been produced.
+        chunks = [chunker.contextualize(chunk) for chunk in chunker.chunk(dl_doc=doc)]
+        logger.debug(f"DoclingChunker: chunking complete: {file_name}")
+        return chunks

mfcli/client/gemini.py ADDED Viewed

@@ -0,0 +1,252 @@
+import asyncio
+import os
+import traceback
+from pathlib import Path
+from typing import Type, Literal, List
+from google import genai
+from google.genai.client import AsyncClient
+from google.genai.types import GenerateContentConfig, HttpRetryOptionsDict, HttpOptions, File
+from pydantic import BaseModel, ValidationError
+from typing_extensions import TypeVar
+from mfcli.agents.tools.general import format_instructions
+from mfcli.utils.config import get_config
+from mfcli.utils.logger import get_logger
+# Module-level logger named after this module.
+logger = get_logger(__name__)
+# Type variable for the pydantic model a generation call is asked to return.
+T = TypeVar(name='T', bound=BaseModel)
+# Model identifiers accepted by the generation methods in this module.
+GeminiSupportedModels = Literal['gemini-2.5-flash', 'gemini-2.5-pro', 'gemini-3-pro-preview']
+# Model used when a caller does not pass one.
+DefaultGeminiModel = 'gemini-2.5-pro'
+class GeminiFileEntity(BaseModel):
+    """A file to upload together with an explicit MIME type."""
+    # Location of the file.
+    path: Path
+    # MIME type forwarded in the upload config.
+    mime_type: str
+# Accepted upload inputs: an entity carrying a MIME type, or a bare path (str or Path).
+GeminiFileInput = GeminiFileEntity | str | Path
+class Gemini:
+    """Async Gemini client that returns responses parsed into pydantic models."""
+    def __init__(self):
+        """Load the app config and create the async client from the configured Google API key."""
+        self._config = get_config()
+        self._client: AsyncClient = genai.Client(api_key=self._config.google_api_key).aio
+    @staticmethod
+    def _get_request_config(
+            timeout: int,
+            instructions: str,
+            response_model: Type[T]
+    ) -> GenerateContentConfig:
+        """Build the per-request config: system instruction, JSON output constrained to
+        ``response_model``'s JSON schema, and HTTP retry/timeout options."""
+        retry_options = HttpRetryOptionsDict(
+            attempts=3,
+            initial_delay=1,
+            max_delay=10,
+            exp_base=2
+        )
+        http_options = HttpOptions(
+            retry_options=retry_options,
+            # NOTE(review): presumably converts seconds to milliseconds - confirm against HttpOptions.
+            timeout=timeout * 1000
+        )
+        return GenerateContentConfig(
+            system_instruction=instructions,
+            response_mime_type="application/json",
+            response_json_schema=response_model.model_json_schema(),
+            http_options=http_options
+        )
+    @staticmethod
+    def _file_access_check(file_path: str):
+        """Raise ValueError if ``file_path`` does not exist or is not readable."""
+        file_path_obj = Path(file_path)
+        # Validate file exists and is readable
+        if not file_path_obj.exists():
+            raise ValueError(f"File does not exist: {file_path}")
+        if not os.access(file_path_obj, os.R_OK):
+            raise ValueError(f"File is not readable: {file_path}")
+    async def upload(self, file: GeminiFileInput) -> File:
+        """Upload a file through the client's files API and return the resulting ``File``.
+        A ``GeminiFileEntity`` contributes its MIME type as upload config; a plain
+        path is uploaded with no config. Raises ValueError if the path does not exist
+        or is not readable.
+        """
+        config = None
+        if isinstance(file, GeminiFileEntity):
+            file_path = str(file.path)
+            config = {"mime_type": file.mime_type}
+        else:
+            file_path = str(file)
+        self._file_access_check(file_path)
+        return await self._client.files.upload(
+            file=file_path,
+            config=config
+        )
+    async def _generate_once(
+            self,
+            prompt: str,
+            instructions: str,
+            response_model: Type[T],
+            model: GeminiSupportedModels,
+            files: List[File] | None = None,
+            timeout: int = 60
+    ) -> str:
+        """Send one generate_content request and return the response text.
+        The request contents are the prompt followed by any ``files``.
+        """
+        contents = [prompt]
+        if files:
+            contents += files
+        response = await self._client.models.generate_content(
+            model=str(model),
+            contents=contents,
+            config=self._get_request_config(timeout, instructions, response_model),
+        )
+        return response.text
+    async def _generate_with_retry(
+            self,
+            prompt: str,
+            instructions: str,
+            response_model: Type[T],
+            model: GeminiSupportedModels,
+            files: List[File] | None = None,
+            timeout: int = 60
+    ) -> T:
+        """Generate and parse a response, retrying up to three times.
+        Each attempt generates once and parses the text into ``response_model``. If
+        parsing raises ValidationError, the model is asked once to correct its output and
+        the corrected text is parsed. Any exception in an attempt is recorded; between
+        attempts the coroutine sleeps, starting at 1.0s and multiplying by 1.5 each time.
+        Raises:
+            RuntimeError: after the last attempt fails, chained from the last exception.
+        """
+        attempts = 3
+        backoff = 1.5
+        delay = 1.0
+        last_err = None
+        for attempt in range(1, attempts + 1):
+            try:
+                # --- FIRST ATTEMPT (normal generation) ---
+                raw = await self._generate_once(
+                    prompt=prompt,
+                    instructions=instructions,
+                    response_model=response_model,
+                    model=model,
+                    files=files,
+                    timeout=timeout
+                )
+                try:
+                    # Try to parse normally
+                    return response_model.model_validate_json(raw)
+                except ValidationError as ve:
+                    # --- SECOND CHANCE: RE-ASK THE MODEL TO FIX ITS OUTPUT ---
+                    fix_prompt = (
+                        "Your previous response did not match the required JSON schema.\n\n"
+                        f"Validation error:\n{ve}\n\n"
+                        f"Invalid response:\n{raw}\n\n"
+                        "Please correct the response so that it validates successfully."
+                    )
+                    corrected_raw = await self._generate_once(
+                        prompt=fix_prompt,
+                        instructions=instructions,
+                        response_model=response_model,
+                        model=model,
+                        files=files,
+                        timeout=timeout
+                    )
+                    # Parse corrected output
+                    return response_model.model_validate_json(corrected_raw)
+            except Exception as e:
+                # Any failure of this attempt lands here: request/SDK errors as well as a
+                # corrected response that still fails validation.
+                last_err = e
+                if attempt == attempts:
+                    break
+                logger.debug(f"[Gemini retry] Attempt {attempt}/{attempts} failed: {e}")
+                await asyncio.sleep(delay)
+                delay *= backoff
+        raise RuntimeError(
+            f"Gemini generate_with_retry failed after {attempts} attempts"
+        ) from last_err
+    async def generate_and_validate_with(
+            self,
+            prompt: str,
+            instructions: str,
+            response_model: Type[T],
+            validation_func,
+            model: GeminiSupportedModels = DefaultGeminiModel,
+            files: List[File] | None = None,
+            timeout: int = 60
+    ) -> T:
+        """Generate a response and check it with ``validation_func``, retrying once.
+        ``validation_func`` is called with the parsed response and signals failure by
+        raising. On failure a second generation is requested with a prompt that embeds
+        the original prompt, the validator traceback and the previous output.
+        Raises:
+            RuntimeError: if the second response also fails validation; the message
+                contains both tracebacks and the last model output.
+        """
+        original_user_prompt = prompt
+        # Helper so both attempts share every argument except the prompt.
+        async def run_generation(p: str) -> T:
+            return await self.generate(
+                prompt=p,
+                instructions=instructions,
+                response_model=response_model,
+                model=model,
+                files=files,
+                timeout=timeout
+            )
+        # --- First attempt ---
+        resp: T = await run_generation(original_user_prompt)
+        try:
+            validation_func(resp)
+            return resp
+        except Exception:
+            # Keep the full traceback text; it is fed back to the model below.
+            first_error = traceback.format_exc()
+        # --- Retry attempt ---
+        # NOTE(review): format_instructions is defined elsewhere; presumably it normalizes
+        # the indented multi-line text - confirm.
+        retry_prompt = format_instructions(
+            f"""
+            You previously generated an invalid response.
+            Correct it.
+            User Prompt:
+            {original_user_prompt}
+            Error raised by validator:
+            {first_error}
+            Your previous output:
+            {resp}
+            """
+        )
+        resp_retry: T = await run_generation(retry_prompt)
+        try:
+            validation_func(resp_retry)
+            return resp_retry
+        except Exception as e:
+            second_error = traceback.format_exc()
+            raise RuntimeError(
+                f"Model failed validation twice.\n"
+                f"First error:\n{first_error}\n\n"
+                f"Second error:\n{second_error}\n\n"
+                f"Last model output:\n{resp_retry}"
+            ) from e
+    async def generate(
+            self,
+            prompt: str,
+            instructions: str,
+            response_model: Type[T],
+            model: GeminiSupportedModels = DefaultGeminiModel,
+            files: List[File] | None = None,
+            timeout: int = 60
+    ) -> T:
+        """Generate a response parsed into ``response_model``.
+        Delegates to ``_generate_with_retry`` and logs before and after the call.
+        """
+        logger.debug(f"Generating for model: {response_model}")
+        parsed_response = await self._generate_with_retry(
+            prompt=prompt,
+            instructions=instructions,
+            response_model=response_model,
+            model=model,
+            files=files,
+            timeout=timeout
+        )
+        logger.debug(f"Finished generating for model: {response_model}")
+        return parsed_response

mfcli/client/llama_parse.py ADDED Viewed

@@ -0,0 +1,38 @@
+from llama_cloud_services import LlamaParse
+from llama_index.core import Document
+from mfcli.utils.config import get_config
+from mfcli.utils.logger import get_logger
+logger = get_logger(__name__)
+class LlamaParseClient:
+    """Wraps a LlamaParse parser configured for markdown output through a vendor multimodal model."""
+    def __init__(self):
+        """Create the parser from the configured LlamaCloud and OpenAI API keys."""
+        self._config = get_config()
+        self._parser = LlamaParse(
+            api_key=self._config.llama_cloud_api_key,
+            result_type="markdown",
+            use_vendor_multimodal_model=True,
+            vendor_multimodal_model_name="openai-gpt-5",
+            vendor_multimodal_api_key=self._config.openai_api_key,
+            verbose=True,
+            invalidate_cache=False,
+            ignore_errors=False
+        )
+        self._parser.parsing_instruction = None
+    def parse(self, file_name: str, file_bytes: bytes) -> str:
+        """Parse ``file_bytes`` and return the concatenated text of the resulting documents.
+        Args:
+            file_name: passed to the parser as ``extra_info["file_name"]`` and used in log lines.
+            file_bytes: raw file content.
+        Raises:
+            Exception: any error raised while parsing is logged and re-raised unchanged.
+        """
+        logger.debug(f"Parsing file: {file_name}")
+        extra_info = {"file_name": file_name}
+        try:
+            documents: list[Document] = self._parser.load_data(file_bytes, extra_info=extra_info)
+            # A single join avoids repeated string concatenation.
+            text = "".join(document.text for document in documents)
+            logger.debug(f"Document text extracted: {file_name}")
+            return text
+        except Exception as e:
+            logger.error(f"Error parsing file: {file_name}")
+            logger.exception(e)
+            # Bare raise re-raises the active exception with its original traceback.
+            raise

mfcli/client/vector_db.py ADDED Viewed

@@ -0,0 +1,93 @@
+from typing import List
+from uuid import uuid4
+import tiktoken
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from openai import OpenAI
+from mfcli.client.chroma_db import ChromaClient, ChunkMetadata, VectorDBChunk
+from mfcli.utils.config import get_config
+from mfcli.utils.logger import get_logger
+logger = get_logger(__name__)
+class DocumentVectorizer:
+    """Splits text into chunks, embeds them with OpenAI and stores them through a ChromaClient."""
+    def __init__(self, chroma_db: ChromaClient):
+        """Create the OpenAI client and a character-based splitter from the app config.
+        Args:
+            chroma_db: client that receives the embedded chunks.
+        """
+        self._config = get_config()
+        self._client: OpenAI = OpenAI(api_key=self._config.openai_api_key)
+        self._splitter = RecursiveCharacterTextSplitter(
+            chunk_size=self._config.chunk_size,
+            chunk_overlap=self._config.chunk_overlap,
+            length_function=len
+        )
+        self._chroma_db = chroma_db
+    def _batch_texts(self, texts: list[str]) -> list[list[str]]:
+        """Group ``texts`` into request-sized batches, preserving order.
+        A batch holds at most 2048 texts and is closed before its token total would
+        exceed 300000. A single text above the token limit still gets a batch of its
+        own; no empty batch is ever produced.
+        """
+        max_request_chunks = 2048
+        max_request_tokens = 300000
+        encoding = tiktoken.encoding_for_model(self._config.embedding_model)
+        batches = []
+        for start in range(0, len(texts), max_request_chunks):
+            batch = []
+            total_tokens = 0
+            for text in texts[start:start + max_request_chunks]:
+                tokens = len(encoding.encode(text))
+                # Close the current batch only when it already holds something; otherwise an
+                # oversized first text would push an empty batch into the request list.
+                if batch and total_tokens + tokens > max_request_tokens:
+                    batches.append(batch)
+                    batch = []
+                    total_tokens = 0
+                batch.append(text)
+                total_tokens += tokens
+            if batch:
+                batches.append(batch)
+        return batches
+    def _get_embeddings(self, texts: list[str]) -> list[list[float]]:
+        """Return one embedding per text, requesting them batch by batch in input order."""
+        batches = self._batch_texts(texts)
+        embeddings = []
+        for batch in batches:
+            response = self._client.embeddings.create(
+                model=self._config.embedding_model,
+                input=batch
+            )
+            embeddings += [row.embedding for row in response.data]
+        return embeddings
+    def _chunk_document(self, text: str) -> list[str]:
+        """Split ``text`` with the configured recursive character splitter."""
+        return self._splitter.split_text(text)
+    def vectorize_chunks(self, chunks: List[str], metadata: ChunkMetadata) -> list[VectorDBChunk]:
+        """Embed ``chunks``, store them in the vector DB and return the stored records.
+        Every record gets a fresh UUID hex id and the same ``metadata`` mapping.
+        """
+        # Generate embeddings ourselves instead of letting ChromaDB do it
+        logger.debug("Generating embeddings")
+        embeddings = self._get_embeddings(chunks)
+        logger.debug(f"Generated {len(embeddings)} embeddings")
+        vectors = [
+            VectorDBChunk(
+                id=uuid4().hex,
+                document=chunk,
+                metadata=metadata,
+                embedding=embedding
+            )
+            for chunk, embedding in zip(chunks, embeddings)
+        ]
+        logger.debug("Adding vectors")
+        self._chroma_db.add(vectors)
+        logger.debug("Vectors added")
+        return vectors
+    def vectorize(self, text: str, metadata: ChunkMetadata) -> list[VectorDBChunk]:
+        """Split ``text`` into chunks, then embed and store them via ``vectorize_chunks``."""
+        logger.debug("Vectorize document")
+        chunks = self._chunk_document(text)
+        logger.debug(f"Document split into {len(chunks)} chunks")
+        return self.vectorize_chunks(chunks, metadata)

mfcli/constants/__init__.py ADDED Viewed

File without changes

mfcli/constants/base_enum.py ADDED Viewed

@@ -0,0 +1,18 @@
+from enum import IntEnum
+class BaseEnum(IntEnum):
+    """IntEnum base whose lookups return None instead of raising."""
+    @classmethod
+    def get(cls, name: str) -> int | None:
+        """Return the integer value of the member called ``name``, or None if there is none."""
+        try:
+            return cls[name].value
+        except KeyError:
+            return None
+    @classmethod
+    def name_from_value(cls, value: int) -> str | None:
+        """Return the name of the member whose value is ``value``, or None if there is none."""
+        # Look the member up by value instead of testing its truthiness: an IntEnum
+        # member whose value is 0 is falsy and would be reported as missing.
+        try:
+            return cls(value).name
+        except ValueError:
+            return None

mfcli/constants/directory_names.py ADDED Viewed

@@ -0,0 +1 @@
+MF_PROJECT_CONFIG_DIR_NAME = ".multifactor"