mfcli 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mfcli/.env.example +72 -0
- mfcli/__init__.py +0 -0
- mfcli/agents/__init__.py +0 -0
- mfcli/agents/controller/__init__.py +0 -0
- mfcli/agents/controller/agent.py +19 -0
- mfcli/agents/controller/config.yaml +27 -0
- mfcli/agents/controller/tools.py +42 -0
- mfcli/agents/tools/general.py +118 -0
- mfcli/alembic/env.py +61 -0
- mfcli/alembic/script.py.mako +28 -0
- mfcli/alembic/versions/6ccc0c7c397c_added_fields_to_pdf_parts_model.py +39 -0
- mfcli/alembic/versions/769019ef4870_added_gemini_file_path_to_pdf_part_model.py +33 -0
- mfcli/alembic/versions/7a2e3a779fdc_added_functional_block_and_component_.py +54 -0
- mfcli/alembic/versions/7d5adb2a47a7_added_pdf_parts_model.py +41 -0
- mfcli/alembic/versions/7fcb7d6a5836_init.py +167 -0
- mfcli/alembic/versions/e0f2b5765c72_added_cascade_delete_for_models_that_.py +32 -0
- mfcli/alembic.ini +147 -0
- mfcli/cli/__init__.py +0 -0
- mfcli/cli/dependencies.py +59 -0
- mfcli/cli/main.py +192 -0
- mfcli/client/__init__.py +0 -0
- mfcli/client/chroma_db.py +184 -0
- mfcli/client/docling.py +44 -0
- mfcli/client/gemini.py +252 -0
- mfcli/client/llama_parse.py +38 -0
- mfcli/client/vector_db.py +93 -0
- mfcli/constants/__init__.py +0 -0
- mfcli/constants/base_enum.py +18 -0
- mfcli/constants/directory_names.py +1 -0
- mfcli/constants/file_types.py +189 -0
- mfcli/constants/gemini.py +1 -0
- mfcli/constants/openai.py +6 -0
- mfcli/constants/pipeline_run_status.py +3 -0
- mfcli/crud/__init__.py +0 -0
- mfcli/crud/file.py +42 -0
- mfcli/crud/functional_blocks.py +26 -0
- mfcli/crud/netlist.py +18 -0
- mfcli/crud/pipeline_run.py +17 -0
- mfcli/crud/project.py +99 -0
- mfcli/digikey/__init__.py +0 -0
- mfcli/digikey/digikey.py +105 -0
- mfcli/main.py +5 -0
- mfcli/mcp/__init__.py +0 -0
- mfcli/mcp/configs/cline_mcp_settings.json +11 -0
- mfcli/mcp/configs/mfcli.mcp.json +7 -0
- mfcli/mcp/mcp_instance.py +6 -0
- mfcli/mcp/server.py +37 -0
- mfcli/mcp/state_manager.py +51 -0
- mfcli/mcp/tools/__init__.py +0 -0
- mfcli/mcp/tools/query_knowledgebase.py +108 -0
- mfcli/models/__init__.py +10 -0
- mfcli/models/base.py +10 -0
- mfcli/models/bom.py +71 -0
- mfcli/models/datasheet.py +10 -0
- mfcli/models/debug_setup.py +64 -0
- mfcli/models/file.py +43 -0
- mfcli/models/file_docket.py +94 -0
- mfcli/models/file_metadata.py +19 -0
- mfcli/models/functional_blocks.py +94 -0
- mfcli/models/llm_response.py +5 -0
- mfcli/models/mcu.py +97 -0
- mfcli/models/mcu_errata.py +26 -0
- mfcli/models/netlist.py +59 -0
- mfcli/models/pdf_parts.py +25 -0
- mfcli/models/pipeline_run.py +34 -0
- mfcli/models/project.py +27 -0
- mfcli/models/project_metadata.py +15 -0
- mfcli/pipeline/__init__.py +0 -0
- mfcli/pipeline/analysis/__init__.py +0 -0
- mfcli/pipeline/analysis/bom_netlist_mapper.py +28 -0
- mfcli/pipeline/analysis/generators/__init__.py +0 -0
- mfcli/pipeline/analysis/generators/bom/__init__.py +0 -0
- mfcli/pipeline/analysis/generators/bom/bom.py +74 -0
- mfcli/pipeline/analysis/generators/debug_setup/__init__.py +0 -0
- mfcli/pipeline/analysis/generators/debug_setup/debug_setup.py +71 -0
- mfcli/pipeline/analysis/generators/debug_setup/instructions.py +150 -0
- mfcli/pipeline/analysis/generators/functional_blocks/__init__.py +0 -0
- mfcli/pipeline/analysis/generators/functional_blocks/functional_blocks.py +93 -0
- mfcli/pipeline/analysis/generators/functional_blocks/instructions.py +34 -0
- mfcli/pipeline/analysis/generators/functional_blocks/validator.py +94 -0
- mfcli/pipeline/analysis/generators/generator.py +258 -0
- mfcli/pipeline/analysis/generators/generator_base.py +18 -0
- mfcli/pipeline/analysis/generators/mcu/__init__.py +0 -0
- mfcli/pipeline/analysis/generators/mcu/instructions.py +156 -0
- mfcli/pipeline/analysis/generators/mcu/mcu.py +84 -0
- mfcli/pipeline/analysis/generators/mcu_errata/__init__.py +1 -0
- mfcli/pipeline/analysis/generators/mcu_errata/instructions.py +77 -0
- mfcli/pipeline/analysis/generators/mcu_errata/mcu_errata.py +95 -0
- mfcli/pipeline/analysis/generators/summary/__init__.py +0 -0
- mfcli/pipeline/analysis/generators/summary/summary.py +47 -0
- mfcli/pipeline/classifier.py +93 -0
- mfcli/pipeline/data_enricher.py +15 -0
- mfcli/pipeline/extractor.py +34 -0
- mfcli/pipeline/extractors/__init__.py +0 -0
- mfcli/pipeline/extractors/pdf.py +12 -0
- mfcli/pipeline/parser.py +120 -0
- mfcli/pipeline/parsers/__init__.py +0 -0
- mfcli/pipeline/parsers/netlist/__init__.py +0 -0
- mfcli/pipeline/parsers/netlist/edif.py +93 -0
- mfcli/pipeline/parsers/netlist/kicad_legacy_net.py +326 -0
- mfcli/pipeline/parsers/netlist/kicad_spice.py +135 -0
- mfcli/pipeline/parsers/netlist/pads.py +185 -0
- mfcli/pipeline/parsers/netlist/protel.py +166 -0
- mfcli/pipeline/parsers/netlist/protel_detector.py +29 -0
- mfcli/pipeline/pipeline.py +419 -0
- mfcli/pipeline/preprocessors/__init__.py +0 -0
- mfcli/pipeline/preprocessors/user_guide.py +127 -0
- mfcli/pipeline/run_context.py +32 -0
- mfcli/pipeline/schema_mapper.py +89 -0
- mfcli/pipeline/sub_classifier.py +115 -0
- mfcli/utils/__init__.py +0 -0
- mfcli/utils/config.py +33 -0
- mfcli/utils/configurator.py +324 -0
- mfcli/utils/data_cleaner.py +82 -0
- mfcli/utils/datasheet_vectorizer.py +281 -0
- mfcli/utils/directory_manager.py +96 -0
- mfcli/utils/file_upload.py +298 -0
- mfcli/utils/files.py +16 -0
- mfcli/utils/http_requests.py +54 -0
- mfcli/utils/kb_lister.py +89 -0
- mfcli/utils/kb_remover.py +173 -0
- mfcli/utils/logger.py +28 -0
- mfcli/utils/mcp_configurator.py +311 -0
- mfcli/utils/migrations.py +18 -0
- mfcli/utils/orm.py +43 -0
- mfcli/utils/pdf_splitter.py +63 -0
- mfcli/utils/query_service.py +22 -0
- mfcli/utils/system_check.py +306 -0
- mfcli/utils/tools.py +31 -0
- mfcli/utils/vectorizer.py +28 -0
- mfcli-0.2.0.dist-info/METADATA +841 -0
- mfcli-0.2.0.dist-info/RECORD +136 -0
- mfcli-0.2.0.dist-info/WHEEL +5 -0
- mfcli-0.2.0.dist-info/entry_points.txt +3 -0
- mfcli-0.2.0.dist-info/licenses/LICENSE +21 -0
- mfcli-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
from google.genai.types import File as GeminiFile
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
|
+
|
|
8
|
+
from mfcli.agents.tools.general import format_instructions
|
|
9
|
+
from mfcli.models.file import File
|
|
10
|
+
from mfcli.models.pdf_parts import PDFPart
|
|
11
|
+
from mfcli.pipeline.run_context import PipelineRunContext
|
|
12
|
+
from mfcli.utils.directory_manager import app_dirs
|
|
13
|
+
from mfcli.utils.pdf_splitter import PDFSplitter
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class TOCSection(BaseModel):
    # One table-of-contents entry as returned by the LLM.
    # Documented with comments rather than a docstring: pydantic copies a class
    # docstring into the model's JSON schema, and this model is nested in the
    # `TOC` response model handed to Gemini.
    title: str = Field(
        ...,
        description="Section title",
    )
    section_no: int = Field(
        ...,
        description="Section number",
    )
    # Page range of the section; later forwarded to the PDF splitter.
    start_page: int = Field(
        ...,
        description="Starting page",
    )
    end_page: int = Field(
        ...,
        description="End page",
    )
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class TOC(BaseModel):
    # Response model for the table-of-contents extraction call: a list of
    # `TOCSection` entries. Comments are used instead of a docstring so the
    # JSON schema generated from this model stays unchanged.
    sections: List[TOCSection] = Field(
        ...,
        description="Table of Contents sections",
    )
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Instructions passed to Gemini (as `instructions=`) when extracting the table
# of contents from the head of a user-guide PDF. They ask for top-level content
# sections only, each with a title, section number and start/end page.
# The text is a runtime string: edit it only when a prompt change is intended.
user_guide_preprocessor_instructions = format_instructions(
    """
    You will receive the start of a PDF for hardware engineering user guide.
    Your task is to extract all the Table of Contents sections from the PDF.
    You will respond with the section title (no numbers in the title).
    You will respond separately with the section number (section_no).
    You will also respond with the start page (start_page) and (end_page) of this section.
    You MUST respond with all relevant top-level sections in the PDF.

    Here are examples of relevant top-level sections:

    1. Architecture
    2. PMCU
    3. CPU

    Here are examples of sections which are NOT relevant (content) sections (do not include these):

    Read This First
    About This Manual
    Glossary
    Related Documentation
    Support Resources

    Here are examples of sections which are NOT top-level (do not include these):

    1.1 Architecture Overview
    1.2 Bus Organization
    1.3 Platform Memory Map

    ONLY include content sections and top-level sections.
    """
)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class UserGuidePreprocessor:
    """Split a user-guide PDF into one uploaded part per table-of-contents section.

    Gemini extracts the table of contents from ``pdf_head``. Each section's
    page range is cut out with the splitter, uploaded to Gemini and described
    by a ``PDFPart`` object. Nothing is written to the database here.
    """

    def __init__(
            self,
            context: PipelineRunContext,
            file: File,
            pdf_head: GeminiFile,
            content: bytes,
            splitter: PDFSplitter
    ):
        # Dependencies are only stored; no work happens at construction time.
        self._splitter = splitter
        self._content = content
        self._pdf_head = pdf_head
        self._file = file
        self._context = context

    async def _generate_toc(self) -> TOC:
        """Ask Gemini for the table of contents of the uploaded PDF head."""
        toc = await self._context.gemini.generate(
            prompt="Generate the Table of Content sections for this PDF",
            instructions=user_guide_preprocessor_instructions,
            response_model=TOC,
            files=[self._pdf_head]
        )
        return toc

    async def _create_pdf_part(self, section: TOCSection, pdf_part_path: Path) -> PDFPart:
        """Upload one extracted PDF part and build the ``PDFPart`` describing it.

        The uploaded Gemini file is also stored in the run's
        ``gemini_file_cache`` under its ``name``.
        """
        uploaded = await self._context.gemini.upload(pdf_part_path)
        gemini_name = uploaded.name
        self._context.gemini_file_cache[gemini_name] = uploaded
        return PDFPart(
            path=str(pdf_part_path),
            file_id=self._file.id,
            gemini_file_id=gemini_name,
            start_page=section.start_page,
            end_page=section.end_page,
            title=section.title,
            section_no=section.section_no
        )

    async def preprocess(self) -> List[PDFPart]:
        """Extract every TOC section to its own PDF and upload the parts concurrently.

        Returns:
            One ``PDFPart`` per section, in the order the sections were returned.
        """
        toc = await self._generate_toc()
        # Page ranges are extracted eagerly, one by one, while the list is
        # built; only the uploads run concurrently under ``asyncio.gather``.
        pending_uploads = [
            self._create_pdf_part(
                section,
                self._splitter.extract_range(
                    start_page=section.start_page,
                    end_page=section.end_page,
                    output_folder=app_dirs.pdf_parts_dir
                ),
            )
            for section in toc.sections
        ]
        pdf_parts: List[PDFPart] = await asyncio.gather(*pending_uploads)
        return pdf_parts
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
async def preprocess_user_guide(
        context: PipelineRunContext,
        file: File,
        pdf_head: GeminiFile,
        content: bytes,
        splitter: PDFSplitter
) -> None:
    """Split a user guide into PDF parts and persist them.

    Runs ``UserGuidePreprocessor.preprocess`` with the given arguments, then
    adds the resulting ``PDFPart`` objects to ``context.db`` and commits.
    """
    pdf_parts = await UserGuidePreprocessor(
        context=context,
        file=file,
        pdf_head=pdf_head,
        content=content,
        splitter=splitter
    ).preprocess()
    session = context.db
    session.add_all(pdf_parts)
    session.commit()
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
|
|
3
|
+
from google.genai.types import File as GeminiFile
|
|
4
|
+
from mfcli.utils.query_service import QueryService
|
|
5
|
+
|
|
6
|
+
from mfcli.client.gemini import Gemini
|
|
7
|
+
from mfcli.models.file_docket import FileDocket
|
|
8
|
+
from mfcli.models.pipeline_run import PipelineRun
|
|
9
|
+
from mfcli.models.project_metadata import ProjectConfig
|
|
10
|
+
from mfcli.utils.datasheet_vectorizer import DatasheetVectorizer
|
|
11
|
+
from mfcli.utils.orm import Session
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class PipelineRunContext:
    """Bundle of the shared objects a single pipeline run works with.

    Every constructor argument is stored as a public attribute of the same
    name, except ``pipeline_run``, which is exposed as ``run``. A
    ``QueryService`` bound to the same database session is created as
    ``query_service``.
    """

    def __init__(
            self,
            db: Session,
            pipeline_run: PipelineRun,
            gemini: Gemini,
            gemini_file_cache: Dict[str, GeminiFile],
            docket: FileDocket,
            config: ProjectConfig,
            vectorizer: DatasheetVectorizer
    ):
        # Database session and the service built on top of it.
        self.db = db
        self.query_service = QueryService(db)

        # Record of the current run.
        self.run = pipeline_run

        # Gemini client and the cache of Gemini files (string keys).
        self.gemini = gemini
        self.gemini_file_cache = gemini_file_cache

        # Remaining per-run collaborators.
        self.docket = docket
        self.config = config
        self.vectorizer = vectorizer
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
from typing import Type, Optional
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
from sqlmodel import SQLModel
|
|
5
|
+
|
|
6
|
+
from mfcli.agents.tools.general import format_instructions
|
|
7
|
+
from mfcli.client.gemini import Gemini
|
|
8
|
+
from mfcli.constants.file_types import FileSubtypes
|
|
9
|
+
from mfcli.models.bom import BOM, BOMSchema
|
|
10
|
+
from mfcli.models.netlist import Netlist
|
|
11
|
+
from mfcli.utils.logger import get_logger
|
|
12
|
+
|
|
13
|
+
# Module-level logger named after this module.
logger = get_logger(__name__)

# File subtypes that need an LLM field mapping, keyed to the pydantic schema
# describing their backend fields. `map_schema` returns early for any subtype
# that is not listed here.
SubtypeSchemas: dict[FileSubtypes, Type[BaseModel]] = {
    FileSubtypes.BOM: BOMSchema
}
# SQLModel class associated with each supported file subtype; all four netlist
# formats listed share the `Netlist` model.
SubtypeModels: dict[FileSubtypes, Type[SQLModel]] = {
    FileSubtypes.BOM: BOM,
    FileSubtypes.KICAD_LEGACY_NET: Netlist,
    FileSubtypes.PADS_PCB_ASCII: Netlist,
    FileSubtypes.KICAD_SPICE: Netlist,
    FileSubtypes.PROTEL_ALTIUM: Netlist
}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class SchemaMapping(BaseModel):
    # A single mapping from a field seen in the input file to a backend field.
    # Comments are used instead of a docstring so the JSON schema generated
    # from this model stays unchanged.
    input_field: Optional[str] = Field(
        default=None,
        description="A field found in the sample file",
    )
    mapped_field: str = Field(
        ...,
        description="The field found in the backend to be mapped to",
    )
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class SchemaMappings(BaseModel):
    # Response model for `map_schema`: the list of field mappings, empty by
    # default. Comments are used instead of a docstring so the JSON schema
    # generated from this model stays unchanged.
    fields: list[SchemaMapping] = Field(
        default_factory=list,
        description="List of schema mappings",
    )
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Instructions for the schema-mapping LLM call. `map_schema` embeds this text
# in the prompt and also passes it as `instructions=`. It describes the JSON
# shape of `SchemaMappings` by hand, so keep it in sync with that model.
# The text is a runtime string: edit it only when a prompt change is intended.
schema_mapper_instructions = format_instructions(
    """
    You are responsible for mapping fields found in a file to a backend schema.
    You will be given the schema for a file subtype, like Bill of Materials (BOM).
    You will map whatever fields you see in the file, to fields in the backend.
    For example, a column "Designator" found in a BOM file would map to "reference" in the backend.

    You must respond **only** with valid JSON that exactly matches the `SchemaMappings` model:

    - The top-level object must have a key `"fields"` containing a list.
    - Each item in the list must be an object with:
        - `"input_field"` (optional string) — the field name from the file header.
        - `"mapped_field"` (required string) — the corresponding backend field name.

    Do **not** include any markdown, text, or code fences. Respond **only** with JSON.

    Example of valid response:

    {
        "fields": [
            {"input_field": "RefDes", "mapped_field": "reference"},
            {"input_field": "Value", "mapped_field": "value"},
            {"input_field": "Qty", "mapped_field": "quantity"},
            {"input_field": "Description", "mapped_field": "description"}
        ]
    }
    """
)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
async def map_schema(gemini: Gemini, subtype: int, text: str) -> SchemaMappings | None:
    """Ask Gemini to map the fields found in a file onto the backend schema.

    Args:
        gemini: Client whose ``generate`` coroutine performs the LLM call.
        subtype: File subtype used to look up the backend schema in
            ``SubtypeSchemas``.
        text: File contents, embedded verbatim in the prompt.

    Returns:
        The result of ``gemini.generate`` (requested as ``SchemaMappings``),
        or ``None`` when the subtype has no entry in ``SubtypeSchemas``.
    """
    # Single lookup instead of `.get()` followed by `[...]`.
    schema_model = SubtypeSchemas.get(subtype)
    if schema_model is None:
        logger.debug(f"No subtype mapping required for subtype: {subtype}")
        return None

    schema = str(schema_model.model_json_schema())
    # NOTE(review): the instructions are embedded in the prompt and also sent
    # as `instructions=` below - confirm the duplication is intentional.
    prompt = format_instructions(
        f"""
        {schema_mapper_instructions}

        Here is the backend schema for this filetype:

        {schema}

        Here is the file contents:

        {text}

        """
    )
    return await gemini.generate(
        prompt=prompt,
        instructions=schema_mapper_instructions,
        response_model=SchemaMappings
    )
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
from typing import Literal
|
|
2
|
+
|
|
3
|
+
from google.genai.types import File as GeminiFile
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
from mfcli.agents.tools.general import format_instructions
|
|
7
|
+
from mfcli.client.gemini import Gemini
|
|
8
|
+
from mfcli.constants.file_types import (
|
|
9
|
+
FileTypes,
|
|
10
|
+
FileSubtypes,
|
|
11
|
+
FILE_SUBTYPE_UNKNOWN,
|
|
12
|
+
PDFSubtypeDescriptions,
|
|
13
|
+
OtherFileTypeDescriptions,
|
|
14
|
+
PDFFileSubtypeNames,
|
|
15
|
+
OtherFileSubtypeNames
|
|
16
|
+
)
|
|
17
|
+
from mfcli.models.file import File
|
|
18
|
+
from mfcli.pipeline.parsers.netlist.kicad_legacy_net import is_kicad_legacy_netlist
|
|
19
|
+
from mfcli.pipeline.parsers.netlist.protel_detector import is_protel_netlist
|
|
20
|
+
from mfcli.utils.files import is_text_mime_type
|
|
21
|
+
from mfcli.utils.logger import get_logger
|
|
22
|
+
|
|
23
|
+
# Instruction template for the sub-type classification call. The `{}`
# placeholder is filled with `str.format` in `FileSubtypeAnalyzer._get_subtype`,
# using the mapping of sub-type descriptions relevant to the file.
# NOTE(review): the text promises "the first 50 lines", but `analyze_file`
# sends the first 500 characters and `analyze_pdf` sends an uploaded file -
# confirm which wording is intended.
sub_classifier_instructions = format_instructions(
    """
    You are the sub-classifier agent for an engineering document processing pipeline.
    You will receive the first 50 lines of text from a file.
    You will examine the content of the file, and determine the sub-type of this file.
    You will be given the sub-types and sub-type descriptions.
    If you are not able to determine the sub-type you will respond with "UNKNOWN".

    Here are the valid sub-types and a description of each:

    {}

    """
)

# Module-level logger named after this module.
logger = get_logger(__name__)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class PDFSubtypeClassifierResponse(BaseModel):
    # Response model used when classifying PDF files (`file_class == 'pdf'`);
    # the answer is typed as `PDFFileSubtypeNames`.
    type: PDFFileSubtypeNames
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class OtherFileSubtypeClassifierResponse(BaseModel):
    # Response model used when classifying non-PDF files
    # (`file_class == 'other'`); the answer is typed as `OtherFileSubtypeNames`.
    type: OtherFileSubtypeNames
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# Selects which classifier response model is used: 'pdf' or 'other'.
FileClass = Literal['pdf', 'other']
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class FileSubtypeAnalyzer:
    """Determine a file's sub-type and store it on ``file.sub_type``.

    Netlist files are first checked with the KiCad-legacy and Protel
    detectors. Everything else, and every PDF, is classified by Gemini.
    """

    def __init__(self, gemini: Gemini):
        self._gemini = gemini

    async def _get_subtype_from_gemini(
            self,
            prompt: str,
            instructions: str,
            gemini_file: GeminiFile | None = None,
            file_class: FileClass = 'other'
    ) -> str:
        """Call Gemini and return the ``type`` field of its structured answer.

        Args:
            prompt: Prompt text sent to the model.
            instructions: Instructions sent alongside the prompt.
            gemini_file: Optional uploaded file to attach to the request.
            file_class: Chooses the response model - 'other' or 'pdf'.
        """
        model = OtherFileSubtypeClassifierResponse if file_class == 'other' else PDFSubtypeClassifierResponse
        # `generate` receives None rather than an empty list when no file is attached.
        files = [gemini_file] if gemini_file else None
        response = await self._gemini.generate(
            prompt=prompt,
            instructions=instructions,
            response_model=model,
            files=files
        )
        return response.type

    async def _get_subtype(
            self,
            prompt: str,
            file: File,
            gemini_file: GeminiFile | None = None,
            file_class: FileClass = 'other'
    ) -> None:
        """Classify *file* with Gemini and assign the result to ``file.sub_type``.

        Raises:
            RuntimeError: If the model answers ``FILE_SUBTYPE_UNKNOWN`` or a
                sub-type that is not in the relevant description mapping.
        """
        logger.debug(f"Fetching subtype for file: {file.name}")
        # The candidate sub-types depend on the file's own type, not on `file_class`.
        relevant_subtype_descriptions = PDFSubtypeDescriptions if file.type == FileTypes.PDF else OtherFileTypeDescriptions
        logger.debug(f"Relevant subtypes: {relevant_subtype_descriptions.keys()}")
        instructions = sub_classifier_instructions.format(relevant_subtype_descriptions)
        subtype = await self._get_subtype_from_gemini(prompt, instructions, gemini_file, file_class)
        logger.debug(f"Subtype discovered: {subtype}")
        if subtype == FILE_SUBTYPE_UNKNOWN:
            raise RuntimeError(f"Could not determine the file subtype for file: {file.name}")
        if subtype not in relevant_subtype_descriptions:
            raise RuntimeError(f"LLM responded with invalid subtype: {subtype}")
        file.sub_type = FileSubtypes.get(subtype)

    async def analyze_pdf(
            self,
            file: File,
            gemini_file: GeminiFile
    ) -> None:
        """Classify a PDF by sending the uploaded Gemini file to the model."""
        prompt = "Determine this PDF file subtype"
        await self._get_subtype(prompt, file, gemini_file, 'pdf')

    async def analyze_file(
            self,
            file: File,
            text: str
    ) -> None:
        """Classify a non-PDF file from its text content.

        Netlist files with a text MIME type are matched against the
        KiCad-legacy and Protel detectors. If no sub-type was set, the first
        500 characters of *text* are sent to Gemini.
        """
        # Handle text MIME types
        if text and is_text_mime_type(file.mime_type):
            if file.type == FileTypes.NET:
                if is_kicad_legacy_netlist(text):
                    file.sub_type = FileSubtypes.KICAD_LEGACY_NET.value
                elif is_protel_netlist(text):
                    file.sub_type = FileSubtypes.PROTEL_ALTIUM.value

        # If subtype cannot be parsed, use LLM to determine subtype
        if not file.sub_type:
            await self._get_subtype(text[0:500], file)
|
mfcli/utils/__init__.py
ADDED
|
File without changes
|
mfcli/utils/config.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from functools import lru_cache
|
|
2
|
+
|
|
3
|
+
from pydantic import Field
|
|
4
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
5
|
+
|
|
6
|
+
from mfcli.constants.openai import OPENAI_DEFAULT_EMBEDDING_MODEL, OPENAI_DEFAULT_EMBEDDING_DIMENSIONS
|
|
7
|
+
from mfcli.utils.directory_manager import app_dirs
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Settings(BaseSettings):
    # Application settings, read by pydantic-settings from the environment and
    # from the env file at `app_dirs.env_file_path`.

    # Credentials: no defaults, so they must be supplied.
    digikey_client_id: str
    digikey_client_secret: str
    openai_api_key: str
    google_api_key: str

    # Optional tuning values with their defaults.
    log_level: str = "INFO"
    use_docling: bool = True
    chunk_tokens: int = 2000
    chunk_size: int = 500
    chunk_overlap: int = 50
    embedding_model: str = OPENAI_DEFAULT_EMBEDDING_MODEL
    embedding_dimensions: int = OPENAI_DEFAULT_EMBEDDING_DIMENSIONS

    # Variable names are matched case-insensitively and unknown keys are
    # allowed (`extra="allow"`).
    model_config = SettingsConfigDict(
        case_sensitive=False,
        env_file=str(app_dirs.env_file_path),
        env_file_encoding="utf-8",
        extra="allow",
    )
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@lru_cache
def get_config() -> Settings:
    """Return the shared ``Settings`` instance.

    ``lru_cache`` memoises the first result, so the settings are built once
    and every later call returns the same object.
    """
    settings = Settings()
    return settings
|