mfcli 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mfcli/.env.example +72 -0
- mfcli/__init__.py +0 -0
- mfcli/agents/__init__.py +0 -0
- mfcli/agents/controller/__init__.py +0 -0
- mfcli/agents/controller/agent.py +19 -0
- mfcli/agents/controller/config.yaml +27 -0
- mfcli/agents/controller/tools.py +42 -0
- mfcli/agents/tools/general.py +118 -0
- mfcli/alembic/env.py +61 -0
- mfcli/alembic/script.py.mako +28 -0
- mfcli/alembic/versions/6ccc0c7c397c_added_fields_to_pdf_parts_model.py +39 -0
- mfcli/alembic/versions/769019ef4870_added_gemini_file_path_to_pdf_part_model.py +33 -0
- mfcli/alembic/versions/7a2e3a779fdc_added_functional_block_and_component_.py +54 -0
- mfcli/alembic/versions/7d5adb2a47a7_added_pdf_parts_model.py +41 -0
- mfcli/alembic/versions/7fcb7d6a5836_init.py +167 -0
- mfcli/alembic/versions/e0f2b5765c72_added_cascade_delete_for_models_that_.py +32 -0
- mfcli/alembic.ini +147 -0
- mfcli/cli/__init__.py +0 -0
- mfcli/cli/dependencies.py +59 -0
- mfcli/cli/main.py +192 -0
- mfcli/client/__init__.py +0 -0
- mfcli/client/chroma_db.py +184 -0
- mfcli/client/docling.py +44 -0
- mfcli/client/gemini.py +252 -0
- mfcli/client/llama_parse.py +38 -0
- mfcli/client/vector_db.py +93 -0
- mfcli/constants/__init__.py +0 -0
- mfcli/constants/base_enum.py +18 -0
- mfcli/constants/directory_names.py +1 -0
- mfcli/constants/file_types.py +189 -0
- mfcli/constants/gemini.py +1 -0
- mfcli/constants/openai.py +6 -0
- mfcli/constants/pipeline_run_status.py +3 -0
- mfcli/crud/__init__.py +0 -0
- mfcli/crud/file.py +42 -0
- mfcli/crud/functional_blocks.py +26 -0
- mfcli/crud/netlist.py +18 -0
- mfcli/crud/pipeline_run.py +17 -0
- mfcli/crud/project.py +99 -0
- mfcli/digikey/__init__.py +0 -0
- mfcli/digikey/digikey.py +105 -0
- mfcli/main.py +5 -0
- mfcli/mcp/__init__.py +0 -0
- mfcli/mcp/configs/cline_mcp_settings.json +11 -0
- mfcli/mcp/configs/mfcli.mcp.json +7 -0
- mfcli/mcp/mcp_instance.py +6 -0
- mfcli/mcp/server.py +37 -0
- mfcli/mcp/state_manager.py +51 -0
- mfcli/mcp/tools/__init__.py +0 -0
- mfcli/mcp/tools/query_knowledgebase.py +108 -0
- mfcli/models/__init__.py +10 -0
- mfcli/models/base.py +10 -0
- mfcli/models/bom.py +71 -0
- mfcli/models/datasheet.py +10 -0
- mfcli/models/debug_setup.py +64 -0
- mfcli/models/file.py +43 -0
- mfcli/models/file_docket.py +94 -0
- mfcli/models/file_metadata.py +19 -0
- mfcli/models/functional_blocks.py +94 -0
- mfcli/models/llm_response.py +5 -0
- mfcli/models/mcu.py +97 -0
- mfcli/models/mcu_errata.py +26 -0
- mfcli/models/netlist.py +59 -0
- mfcli/models/pdf_parts.py +25 -0
- mfcli/models/pipeline_run.py +34 -0
- mfcli/models/project.py +27 -0
- mfcli/models/project_metadata.py +15 -0
- mfcli/pipeline/__init__.py +0 -0
- mfcli/pipeline/analysis/__init__.py +0 -0
- mfcli/pipeline/analysis/bom_netlist_mapper.py +28 -0
- mfcli/pipeline/analysis/generators/__init__.py +0 -0
- mfcli/pipeline/analysis/generators/bom/__init__.py +0 -0
- mfcli/pipeline/analysis/generators/bom/bom.py +74 -0
- mfcli/pipeline/analysis/generators/debug_setup/__init__.py +0 -0
- mfcli/pipeline/analysis/generators/debug_setup/debug_setup.py +71 -0
- mfcli/pipeline/analysis/generators/debug_setup/instructions.py +150 -0
- mfcli/pipeline/analysis/generators/functional_blocks/__init__.py +0 -0
- mfcli/pipeline/analysis/generators/functional_blocks/functional_blocks.py +93 -0
- mfcli/pipeline/analysis/generators/functional_blocks/instructions.py +34 -0
- mfcli/pipeline/analysis/generators/functional_blocks/validator.py +94 -0
- mfcli/pipeline/analysis/generators/generator.py +258 -0
- mfcli/pipeline/analysis/generators/generator_base.py +18 -0
- mfcli/pipeline/analysis/generators/mcu/__init__.py +0 -0
- mfcli/pipeline/analysis/generators/mcu/instructions.py +156 -0
- mfcli/pipeline/analysis/generators/mcu/mcu.py +84 -0
- mfcli/pipeline/analysis/generators/mcu_errata/__init__.py +1 -0
- mfcli/pipeline/analysis/generators/mcu_errata/instructions.py +77 -0
- mfcli/pipeline/analysis/generators/mcu_errata/mcu_errata.py +95 -0
- mfcli/pipeline/analysis/generators/summary/__init__.py +0 -0
- mfcli/pipeline/analysis/generators/summary/summary.py +47 -0
- mfcli/pipeline/classifier.py +93 -0
- mfcli/pipeline/data_enricher.py +15 -0
- mfcli/pipeline/extractor.py +34 -0
- mfcli/pipeline/extractors/__init__.py +0 -0
- mfcli/pipeline/extractors/pdf.py +12 -0
- mfcli/pipeline/parser.py +120 -0
- mfcli/pipeline/parsers/__init__.py +0 -0
- mfcli/pipeline/parsers/netlist/__init__.py +0 -0
- mfcli/pipeline/parsers/netlist/edif.py +93 -0
- mfcli/pipeline/parsers/netlist/kicad_legacy_net.py +326 -0
- mfcli/pipeline/parsers/netlist/kicad_spice.py +135 -0
- mfcli/pipeline/parsers/netlist/pads.py +185 -0
- mfcli/pipeline/parsers/netlist/protel.py +166 -0
- mfcli/pipeline/parsers/netlist/protel_detector.py +29 -0
- mfcli/pipeline/pipeline.py +419 -0
- mfcli/pipeline/preprocessors/__init__.py +0 -0
- mfcli/pipeline/preprocessors/user_guide.py +127 -0
- mfcli/pipeline/run_context.py +32 -0
- mfcli/pipeline/schema_mapper.py +89 -0
- mfcli/pipeline/sub_classifier.py +115 -0
- mfcli/utils/__init__.py +0 -0
- mfcli/utils/config.py +33 -0
- mfcli/utils/configurator.py +324 -0
- mfcli/utils/data_cleaner.py +82 -0
- mfcli/utils/datasheet_vectorizer.py +281 -0
- mfcli/utils/directory_manager.py +96 -0
- mfcli/utils/file_upload.py +298 -0
- mfcli/utils/files.py +16 -0
- mfcli/utils/http_requests.py +54 -0
- mfcli/utils/kb_lister.py +89 -0
- mfcli/utils/kb_remover.py +173 -0
- mfcli/utils/logger.py +28 -0
- mfcli/utils/mcp_configurator.py +311 -0
- mfcli/utils/migrations.py +18 -0
- mfcli/utils/orm.py +43 -0
- mfcli/utils/pdf_splitter.py +63 -0
- mfcli/utils/query_service.py +22 -0
- mfcli/utils/system_check.py +306 -0
- mfcli/utils/tools.py +31 -0
- mfcli/utils/vectorizer.py +28 -0
- mfcli-0.2.0.dist-info/METADATA +841 -0
- mfcli-0.2.0.dist-info/RECORD +136 -0
- mfcli-0.2.0.dist-info/WHEEL +5 -0
- mfcli-0.2.0.dist-info/entry_points.txt +3 -0
- mfcli-0.2.0.dist-info/licenses/LICENSE +21 -0
- mfcli-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# Prompt templates for the MCU errata cheat-sheet generator.
# The text inside each triple-quoted string is runtime data: it is handed to
# the model as instructions, so wording changes here change model input.

# Shared role preamble interpolated at the top of every task prompt below.
errata_extraction_base_instructions = (
"""
You are the MCU Errata Cheat Sheet Generator agent. Your role is to analyze MCU errata
documents (PDF files) and extract ONLY firmware-relevant issues - silicon bugs that can be
addressed or worked around in firmware code. Exclude hardware-only issues that cannot be
fixed in software. You will be given an MCU Errata file to analyze.
"""
)

# Step 1: list every errata ID found in the document.
extract_errata_ids_instruction = (
f"""
{errata_extraction_base_instructions}

Your job is to extract all the Errata IDs. Return a list of official IDs from document (e.g., "I2C_01", "ADV0123")
"""
)

# Step 2: extract the full record for one errata ID (the ID is supplied in
# the user prompt, not here).
extract_errata_instructions = (
f"""
{errata_extraction_base_instructions}

You will be given the errata ID to extract, and you will extract this info:

a. IDENTIFICATION:
- errata_id: Official ID from document (e.g., "I2C_01", "ADV0123")
- title: Brief descriptive title
- affected_modules: List of affected peripherals/modules
Examples: ["I2C", "SPI", "UART", "ADC", "Timer", "DMA", "RTC"]
b. SEVERITY CLASSIFICATION:
- Critical: Can cause data corruption, system hang, or major malfunction
- Major: Significant functional impact, workaround is complex
- Minor: Minor inconvenience, easy workaround
c. DETAILED INFORMATION:
- description: Clear explanation of the bug
- conditions: When/how the bug occurs
* Specific register values
* Timing conditions
* Operating modes
* Environmental conditions (temperature, voltage)
d. FIRMWARE WORKAROUND:
- firmware_workaround: Specific code-level workaround
Examples:
* "Add 10us delay after setting register X"
* "Avoid using bits [7:5] in CONFIG register"
* "Initialize peripheral in specific order: Step 1, Step 2, Step 3"
* "Use polling instead of interrupts for this peripheral"
* "Apply calibration value from factory settings"
- Be SPECIFIC - provide actual steps/code guidance
e. IMPACT:
- impact: How this affects firmware operation
Examples:
* "May cause I2C communication failures"
* "Incorrect ADC readings below 10% of range"
* "System hang if DMA used with this peripheral"
f. SILICON REVISIONS:
- affected_revisions: Which chip revisions have this bug
Examples: ["Rev A", "Rev B"], ["All revisions"], ["Rev 1.0 - 1.2"]
"""
)

# Step 3: document-level facts (document name, MCU name, recommendations).
errata_document_summary_instructions = (
f"""
{errata_extraction_base_instructions}

You will extract three items from this document:

1. Errata document name, for example Silicon Errata Rev 1.2 - March 2024.

2. MCU name, for example MSPM0L1306

3. Top-level firmware recommendations.
Examples:
* "Always use polling mode for I2C on Rev A silicon"
* "Add delays in ADC initialization sequence"
* "Avoid simultaneous use of Timer3 and DMA Channel 2"
"""
)
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from asyncio import Semaphore
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from typing import Dict, List
|
|
5
|
+
|
|
6
|
+
from google.genai.types import File as GeminiFile
|
|
7
|
+
|
|
8
|
+
from mfcli.models.file import File
|
|
9
|
+
from mfcli.models.mcu_errata import ErrataIDs, ErrataItem, ErrataTopLevelSummary
|
|
10
|
+
from mfcli.pipeline.analysis.generators.generator_base import GeneratorBase
|
|
11
|
+
from mfcli.pipeline.analysis.generators.mcu_errata.instructions import (
|
|
12
|
+
extract_errata_ids_instruction,
|
|
13
|
+
extract_errata_instructions,
|
|
14
|
+
errata_document_summary_instructions
|
|
15
|
+
)
|
|
16
|
+
from mfcli.pipeline.run_context import PipelineRunContext
|
|
17
|
+
from mfcli.utils.logger import get_logger
|
|
18
|
+
|
|
19
|
+
logger = get_logger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ErrataCheatSheetGenerator(GeneratorBase):
    """Build an errata cheat sheet from an uploaded MCU errata document.

    The work is split into three kinds of model call: one to list the errata
    IDs, one per ID to extract its details, and one for document-level
    facts. Results are grouped by severity into a single dict.
    """

    def __init__(self, context: PipelineRunContext, db_file: File, uploads: List[GeminiFile]):
        super().__init__(context, db_file, uploads)

    async def _extract_errata_ids(self) -> list[str]:
        """Ask the model for every errata ID present in the uploaded files."""
        prompt = "Use the Errata file to extract errata_ids"
        errata_ids: ErrataIDs = await self._context.gemini.generate(
            prompt=prompt,
            instructions=extract_errata_ids_instruction,
            response_model=ErrataIDs,
            files=self._uploads
        )
        return errata_ids.ids

    async def _extract_errata(self, errata_id: str, sem: Semaphore) -> ErrataItem:
        """Extract one erratum; ``sem`` bounds how many requests run at once."""
        prompt = f"Extract errata info from ID: {errata_id}"
        async with sem:
            return await self._context.gemini.generate(
                prompt=prompt,
                instructions=extract_errata_instructions,
                response_model=ErrataItem,
                files=self._uploads
            )

    async def _generate_top_level_summary(self) -> ErrataTopLevelSummary:
        """Ask the model for the document name, MCU name and recommendations."""
        prompt = "Generate top-level document summary"
        return await self._context.gemini.generate(
            prompt=prompt,
            instructions=errata_document_summary_instructions,
            response_model=ErrataTopLevelSummary,
            files=self._uploads
        )

    async def _create_summary(self, errata: list[ErrataItem]) -> Dict:
        """Group errata by severity and count how many touch each module.

        Anything whose severity is neither 'Critical' nor 'Major' is filed
        under minor issues.
        """
        critical = []
        major = []
        minor = []
        module_issue_counts = defaultdict(int)
        for erratum in errata:
            if erratum.severity == 'Critical':
                critical.append(erratum.model_dump())
            elif erratum.severity == 'Major':
                major.append(erratum.model_dump())
            else:
                minor.append(erratum.model_dump())
            for module in erratum.affected_modules:
                module_issue_counts[module] += 1
        summary = await self._generate_top_level_summary()
        return {
            "errata_cheat_sheet": {
                "mcu_name": summary.mcu_name,
                "errata_document": summary.errata_document,
                "total_issues": len(errata),
                "critical_issues": critical,
                "major_issues": major,
                "minor_issues": minor,
                # Hand back a plain dict so later lookups of unknown modules
                # cannot silently insert zero-count entries.
                "summary_by_module": dict(module_issue_counts),
                "key_recommendations": summary.recommendations
            }
        }

    async def generate(self) -> Dict:
        """Run the full extraction and return the cheat-sheet dict.

        A failure on one errata ID is logged and that ID is skipped; the
        remaining errata are still summarised.
        """
        errata_ids = await self._extract_errata_ids()
        sem = asyncio.Semaphore(5)
        tasks = [self._extract_errata(errata_id, sem) for errata_id in errata_ids]
        results: list[ErrataItem | BaseException] = await asyncio.gather(*tasks, return_exceptions=True)
        errata: list[ErrataItem] = []
        # gather() returns results in task order, so they line up with the IDs.
        for errata_id, result in zip(errata_ids, results):
            # BaseException also covers a cancelled task, which gather()
            # returns as a value when return_exceptions=True.
            if isinstance(result, BaseException):
                # We are not inside an except block here, so the traceback has
                # to be passed explicitly through exc_info.
                logger.error(f"Error extracting errata {errata_id}", exc_info=result)
                continue
            errata.append(result)
        return await self._create_summary(errata)
|
|
File without changes
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from typing import List, Dict
|
|
2
|
+
|
|
3
|
+
from google.genai.types import File as GeminiFile
|
|
4
|
+
|
|
5
|
+
from mfcli.models.file import File
|
|
6
|
+
from mfcli.models.llm_response import LLMResponse
|
|
7
|
+
from mfcli.models.pdf_parts import PDFPart
|
|
8
|
+
from mfcli.pipeline.analysis.generators.generator_base import GeneratorBase
|
|
9
|
+
from mfcli.pipeline.run_context import PipelineRunContext
|
|
10
|
+
|
|
11
|
+
# Instructions passed to the model for every user-guide section summary.
# The string is runtime data; keep the wording intact.
user_guide_summary_instructions = """
You will receive sections of a hardware engineering user guide PDF.
Your job is to summarize those sections.
You will also receive the sections that have been summarized so far.
Use the sections that have been summarized as context to generate new summaries.
ONLY output the summary text, no other information.
The summary text will be read directly by users.
"""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class SummaryCheatSheetGenerator(GeneratorBase):
    """Summarise a user guide one PDF section at a time.

    Each request carries the summaries produced so far, so later sections
    are summarised with the earlier ones as context.
    """

    def __init__(self, context: PipelineRunContext, db_file: File, uploads: List[GeminiFile]):
        super().__init__(context, db_file, uploads)

    async def generate(self) -> Dict:
        """Return ``{"summaries": [...]}`` with one entry per PDF section.

        NOTE(review): ``self._file`` and ``self._context`` are not assigned in
        this class; presumably GeneratorBase sets them - confirm.
        """
        sections: List[PDFPart] = self._file.pdf_parts
        # Sorts the list in place, ascending by section number.
        sections.sort(key=lambda section: section.section_no)
        collected = []
        for section in sections:
            # The prompt embeds the summaries gathered before this section.
            prompt = f"Summarize the {section.title} section\n\nCurrent summaries:\n\n{collected}"
            section_upload = self._context.gemini_file_cache[section.gemini_file_id]
            reply: LLMResponse = await self._context.gemini.generate(
                prompt=prompt,
                instructions=user_guide_summary_instructions,
                response_model=LLMResponse,
                files=[section_upload]
            )
            entry = {
                "no": section.section_no,
                "title": section.title,
                "summary": reply.text
            }
            collected.append(entry)
        return {"summaries": collected}
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import mimetypes
|
|
3
|
+
import os
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from werkzeug.utils import secure_filename
|
|
9
|
+
|
|
10
|
+
from mfcli.constants.file_types import SupportedFileTypes, FileTypes
|
|
11
|
+
from mfcli.models.file_metadata import FileMetadata
|
|
12
|
+
from mfcli.utils.logger import get_logger
|
|
13
|
+
|
|
14
|
+
# Largest accepted file: MAX_FILE_GB gibibytes, expressed in bytes.
MAX_FILE_GB = 1
MAX_FILE_SIZE = MAX_FILE_GB * 1024 ** 3
|
|
16
|
+
|
|
17
|
+
logger = get_logger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def is_csv(file_path) -> bool:
    """Check that pandas can parse the start of ``file_path`` as CSV.

    Only the first five rows are read and undecodable bytes are ignored.

    Returns:
        True when the sample parses.

    Raises:
        Whatever ``pandas.read_csv`` raises when the file cannot be parsed
        (for example on an empty file). Callers rely on the exception, not
        on a False return.
    """
    pd.read_csv(file_path, nrows=5, encoding_errors='ignore')
    return True
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def validate_file(metadata: FileMetadata):
    """Run the content validator for the file's type, if one exists.

    Only CSV files have a validator; every other type is logged and accepted.

    Raises:
        ValueError: the validator failed. The original exception is chained
            as ``__cause__`` so its traceback is not lost.
    """
    logger.debug(f"Validating file: {metadata.name}")
    try:
        if metadata.type_id == FileTypes.CSV:
            is_csv(metadata.path)
        else:
            logger.debug(f"File type has no validator: {metadata.type_name}")
    except Exception as e:
        raise ValueError(f"The file is not a valid {metadata.type_name} file: {e}") from e
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_file_metadata(file_path: str, is_datasheet: bool) -> tuple[FileMetadata, bytes]:
    """Validate a file on disk and collect its metadata and content.

    The file name is sanitised and lower-cased, its extension is checked
    against ``SupportedFileTypes``, and the file must exist, be a regular
    readable file, be non-empty and not exceed ``MAX_FILE_SIZE``.

    Args:
        file_path: Path of the file to inspect.
        is_datasheet: Stored unchanged on the returned metadata.

    Returns:
        ``(metadata, content)`` where ``content`` is the whole file as bytes.

    Raises:
        ValueError: any of the checks above failed.
    """
    logger.debug(f"Starting categorize_and_validate_file tool: {file_path}")
    file_name = os.path.basename(file_path)
    file_name = secure_filename(file_name).lower().strip()

    file_ext = os.path.splitext(file_name)[1]
    file_type_name = file_ext.replace('.', '').upper()
    if not file_type_name:
        raise ValueError("File requires an extension")
    if file_type_name not in SupportedFileTypes:
        raise ValueError(f"File extension is not supported: {file_type_name}")
    file_type_id = FileTypes[file_type_name].value
    logger.debug(f"File type id: {file_type_id}")

    path = Path(file_path)
    if not path.exists():
        raise ValueError("File does not exist")
    # A directory passes exists() and would only fail later inside open().
    if not path.is_file():
        raise ValueError("Path is not a regular file")
    if not os.access(path, os.R_OK):
        raise ValueError("File is not readable")

    file_bytes = os.stat(file_path).st_size
    if file_bytes == 0:
        raise ValueError(f"File is empty: {file_name}")
    if file_bytes > MAX_FILE_SIZE:
        raise ValueError(f"File size exceeds limit: {file_bytes}")

    logger.debug(f"File validated: {file_path}")
    with open(file_path, "rb") as fp:
        content = fp.read()
    md5_sum = hashlib.md5(content).hexdigest()

    expected_mime_types = SupportedFileTypes[file_type_name]["mime_types"]
    # First listed type is the fallback; None when the type lists no MIME types.
    default_mime_type = list(expected_mime_types)[0] if expected_mime_types else None

    # Use mimetypes module to guess MIME type from file extension
    mime_type, _ = mimetypes.guess_type(file_path)

    if mime_type is None:
        # If mimetypes can't detect it, use default based on file type
        mime_type = default_mime_type or 'application/octet-stream'
    elif default_mime_type is not None and mime_type not in expected_mime_types:
        logger.warning(f"Extension {file_type_name} has unexpected MIME type: {mime_type}. Expected one of: {expected_mime_types}")
        # Use the first supported MIME type as default instead of failing
        mime_type = default_mime_type

    file_metadata = FileMetadata(
        name=file_name,
        size=file_bytes,
        md5=md5_sum,
        path=file_path,
        mime=mime_type,
        ext=file_ext,
        type_id=file_type_id,
        type_name=file_type_name,
        is_datasheet=is_datasheet
    )
    logger.debug(file_metadata)
    return file_metadata, content
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from typing import TypeVar
|
|
2
|
+
|
|
3
|
+
from sqlmodel import SQLModel
|
|
4
|
+
|
|
5
|
+
from mfcli.client.chroma_db import ChromaClient
|
|
6
|
+
from mfcli.constants.file_types import FileSubtypes
|
|
7
|
+
from mfcli.utils.datasheet_vectorizer import get_datasheets_for_bom_entries
|
|
8
|
+
from mfcli.utils.orm import Session
|
|
9
|
+
|
|
10
|
+
T = TypeVar('T', bound=SQLModel)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
async def enrich_data_for_model(db: Session, chroma_db: ChromaClient, subtype: int, instances: list[T]):
    """Run subtype-specific enrichment on freshly parsed model instances.

    Only BOM data is enriched, by handing the instances to
    ``get_datasheets_for_bom_entries``; every other subtype is a no-op.
    """
    if subtype != FileSubtypes.BOM:
        return
    await get_datasheets_for_bom_entries(db, chroma_db, instances)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from mfcli.constants.file_types import FileTypes
|
|
2
|
+
from mfcli.models.file import File
|
|
3
|
+
from mfcli.pipeline.extractors.pdf import extract_text_from_pdf
|
|
4
|
+
from mfcli.utils.files import is_text_mime_type
|
|
5
|
+
|
|
6
|
+
import os.path
|
|
7
|
+
|
|
8
|
+
from mfcli.client.llama_parse import LlamaParseClient
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class TextExtractor(LlamaParseClient):
    """Text extraction helpers layered on LlamaParseClient.

    NOTE(review): ``self.parse`` is not defined in this class; presumably it
    is inherited from LlamaParseClient - confirm.
    """

    def __init__(self):
        super().__init__()

    @staticmethod
    def extract_pdf_bytes(pdf_bytes: bytes):
        """Extract text from in-memory PDF bytes via ``extract_text_from_pdf``."""
        return extract_text_from_pdf(pdf_bytes)

    def extract_text_from_file_bytes(self, file_name: str, file_bytes: bytes) -> str:
        """Pass an in-memory file straight to ``self.parse``."""
        return self.parse(file_name, file_bytes)

    def extract_text(self, file_path: str):
        """Read ``file_path`` from disk and pass its bytes to ``self.parse``."""
        with open(file_path, "rb") as source:
            base_name = os.path.basename(file_path)
            payload = source.read()
        return self.parse(base_name, payload)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def extract_document_text(file: File, file_bytes: bytes) -> str:
    """Return the text content of a stored file.

    Text MIME types are decoded directly, with undecodable bytes dropped.
    PDFs go through ``TextExtractor.extract_pdf_bytes``.

    Raises:
        ValueError: the file is neither a text MIME type nor a PDF.
    """
    if is_text_mime_type(file.mime_type):
        return file_bytes.decode(errors='ignore')
    if file.type == FileTypes.PDF:
        # extract_pdf_bytes is a staticmethod, so call it on the class; this
        # avoids constructing a TextExtractor (and the client behind it)
        # just to reach a function that never uses the instance.
        return TextExtractor.extract_pdf_bytes(file_bytes)
    raise ValueError(f"Unsupported MIME type: {file.mime_type}")
|
|
File without changes
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import fitz
|
|
2
|
+
from io import BytesIO
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def extract_text_from_pdf(pdf_bytes: bytes) -> str:
    """Return the text of every page of an in-memory PDF, newline-joined.

    The document is opened from a memory stream, so no temp file is written.
    """
    with fitz.open(stream=BytesIO(pdf_bytes), filetype="pdf") as document:
        page_texts = [page.get_text("text") for page in document]
    return "\n".join(page_texts)
|
mfcli/pipeline/parser.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Type
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from sqlmodel import SQLModel
|
|
7
|
+
|
|
8
|
+
from mfcli.constants.file_types import FileTypes, FileSubtypes
|
|
9
|
+
from mfcli.crud.netlist import create_netlist
|
|
10
|
+
from mfcli.models.file import File
|
|
11
|
+
from mfcli.models.netlist import Netlist
|
|
12
|
+
from mfcli.pipeline.parsers.netlist.kicad_legacy_net import parse_kicad_legacy_net_file
|
|
13
|
+
from mfcli.pipeline.parsers.netlist.kicad_spice import parse_kicad_spice_file
|
|
14
|
+
from mfcli.pipeline.parsers.netlist.pads import parse_pads_file
|
|
15
|
+
from mfcli.pipeline.parsers.netlist.protel import parse_protel_file
|
|
16
|
+
from mfcli.pipeline.schema_mapper import SchemaMappings, SubtypeModels
|
|
17
|
+
from mfcli.utils.logger import get_logger
|
|
18
|
+
from mfcli.utils.orm import Session
|
|
19
|
+
|
|
20
|
+
logger = get_logger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def parse_csv(file_path: str) -> list[dict]:
    """Load a CSV file and return its rows as a list of dicts.

    The header row is inferred and undecodable bytes are ignored. Rows are
    round-tripped through pandas' JSON "records" output, so the result
    contains only JSON-compatible values, one dict per row keyed by column
    name.
    """
    logger.debug(f"Parsing CSV: {file_path}")
    df = pd.read_csv(file_path, header='infer', encoding_errors='ignore')
    json_str = df.to_json(orient="records")
    logger.debug(f"CSV parsed: {file_path}")
    return json.loads(json_str)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _extract_schema_from_csv(
        file: File,
        input_column_field_map: dict[str, str],
        model: Type[SQLModel]
) -> list[SQLModel]:
    """Build one ``model`` instance per CSV row using a column-to-field map.

    Args:
        file: File record; ``file.path`` is parsed and ``file.id`` is stamped
            on every instance as ``file_id``.
        input_column_field_map: CSV column name -> model field name. Columns
            missing from a row map to ``None``.
        model: SQLModel class to instantiate.

    Returns:
        The instances that could be built. Rows that fail are logged and
        skipped.

    Raises:
        ValueError: no row at all could be converted.
    """
    rows = parse_csv(file.path)
    model_instances: list[SQLModel] = []
    for row_no, row in enumerate(rows, start=1):
        try:
            mapped_data = {
                model_field: row.get(input_col)
                for input_col, model_field in input_column_field_map.items()
            }
            instance = model(**mapped_data)
            instance.file_id = file.id
            model_instances.append(instance)
            logger.debug(f"Model parsed from CSV: {instance}")
        except Exception as e:
            # Best effort by design: one bad row must not abort the file.
            logger.warning(f"Skipping CSV row {row_no} of {file.path}: {e}")
    if not model_instances:
        raise ValueError(f"No data could be parsed from this CSV: {file.path}")
    return model_instances
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class SchemaParser:
    """Parse one file into SQLModel instances and add them to the session.

    With schema mappings the file is read as a mapped CSV; without them it
    is dispatched to a netlist parser chosen by the file's subtype.
    """

    def __init__(self, db: Session, file: File):
        self.file = file
        self._db = db

    def _parse_with_schema_mappings(
            self,
            model: Type[SQLModel],
            mappings: SchemaMappings
    ) -> list[SQLModel]:
        """Apply the input-column -> model-field mappings to a CSV file."""
        column_to_field = {}
        for mapping in mappings.fields:
            column_to_field[mapping.input_field] = mapping.mapped_field
        logger.debug(f"Model: {model}")
        if self.file.type != FileTypes.CSV:
            raise ValueError(f"Unsupported extraction file type: {self.file.type}")
        return _extract_schema_from_csv(self.file, column_to_field, model)

    @staticmethod
    def _is_netlist_file(subtype: FileSubtypes):
        """Tell whether ``subtype`` is one of the netlist subtypes handled here."""
        return subtype in (
            FileSubtypes.KICAD_LEGACY_NET,
            FileSubtypes.KICAD_SPICE,
            FileSubtypes.PADS_PCB_ASCII,
            FileSubtypes.PROTEL_ALTIUM,
        )

    def _parse_netlist_file(self, subtype: FileSubtypes, file_path: Path) -> Netlist:
        """Run the parser matching ``subtype`` and wrap the result via ``create_netlist``."""
        # Checked in order; the first subtype that compares equal wins.
        candidates = (
            (FileSubtypes.KICAD_LEGACY_NET, parse_kicad_legacy_net_file),
            (FileSubtypes.PADS_PCB_ASCII, parse_pads_file),
            (FileSubtypes.KICAD_SPICE, parse_kicad_spice_file),
            (FileSubtypes.PROTEL_ALTIUM, parse_protel_file),
        )
        for candidate, parser in candidates:
            if subtype == candidate:
                netlist_schema = parser(file_path)
                break
        else:
            raise ValueError(f"Netlist file has no parser: {self.file.name}")
        return create_netlist(self.file.pipeline_run_id, netlist_schema)

    def _parse_without_schema_mappings(self) -> list[SQLModel]:
        """Parse a file that needs no column mapping (currently netlists only)."""
        subtype = self.file.sub_type
        file_path = Path(self.file.path)
        if not self._is_netlist_file(subtype):
            raise ValueError(f"No parser for file subtype: {self.file.sub_type}")
        return [self._parse_netlist_file(subtype, file_path)]

    def parse(self, mappings: SchemaMappings | None) -> list[SQLModel]:
        """Parse the file and add the resulting instances to the session.

        Raises:
            ValueError: the file's subtype has no model in ``SubtypeModels``,
                or no parser can handle the file.
        """
        logger.debug(f"Extracting schema from file: {self.file.name}")
        model: Type[SQLModel] = SubtypeModels.get(self.file.sub_type)
        if not model:
            raise ValueError(f"Cannot find subtype model for subtype: {self.file.sub_type}")
        if mappings:
            instances = self._parse_with_schema_mappings(model, mappings)
        else:
            instances = self._parse_without_schema_mappings()
        # Instances are only added to the session; nothing is committed here.
        self._db.add_all(instances)
        logger.debug(f"File has been successfully parsed")
        return instances
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def parse_schema(db: Session, file: File, mappings: SchemaMappings | None) -> list[SQLModel]:
    """Convenience wrapper: build a SchemaParser for ``file`` and run it."""
    parser = SchemaParser(db, file)
    return parser.parse(mappings)
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
from typing import Dict, Any
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Reference-designator prefix (lower-case) -> component category.
_EDIF_COMPONENT_TYPES = {
    "r": "resistor",
    "c": "capacitor",
    "l": "inductor",
    "v": "voltage_source",
    "i": "current_source",
    "e": "op_amp",
    "d": "diode",
    "j": "jumper",
    "led": "led",
    "s": "switch",
    "y": "crystal",
    "sh": "shunt",
    "rt": "thermistor",
    "tp": "transistor",
}

# One netlist line: designator, two nodes, optional trailing parameters.
# Separators are limited to spaces and tabs so that a short line can never
# absorb the line that follows it.
_EDIF_COMPONENT_LINE = re.compile(
    r'^(?P<name>\w+)[ \t]+(?P<n1>\S+)[ \t]+(?P<n2>\S+)(?:[ \t]+(?P<rest>.*))?$',
    re.MULTILINE,
)


def _component_type(name: str) -> str:
    """Map a reference designator such as ``R12`` or ``LED3`` to a category."""
    letters = re.match(r'[A-Za-z]*', name).group(0).lower()
    # Prefer the whole alphabetic prefix so multi-letter designators resolve;
    # otherwise fall back to the first character alone.
    if letters in _EDIF_COMPONENT_TYPES:
        return _EDIF_COMPONENT_TYPES[letters]
    return _EDIF_COMPONENT_TYPES.get(name[0].lower(), "unknown")


def parse(file_path: str) -> Dict[str, Any]:
    """
    Parse an EDIF file and extract a relational JSON representation:
    component-pin-net mapping, deduplicated net names, annotated with voltage domains and pull-up/pull-down info.

    The netlist is read from the file's ``NETLIST_TEXT`` string property and
    parsed as SPICE-style lines (designator, two nodes, optional parameters).

    Returns:
        ``{"components": [...], "nets": [...]}``. Each component has ``name``,
        ``type``, ``pins`` and ``params``; each net has ``name``,
        ``connected_pins``, ``voltage_domain`` and ``pull``.

    Raises:
        ValueError: the file has no ``NETLIST_TEXT`` property.
    """
    with open(file_path, "r") as f:
        edif_text = f.read()

    # --- Step 1: Extract NETLIST_TEXT ---
    netlist_match = re.search(r'\(property NETLIST_TEXT \(string "(.*?)"\)\)', edif_text, re.DOTALL)
    if not netlist_match:
        raise ValueError("No NETLIST_TEXT property found in EDIF")

    # Line breaks are stored as the two characters backslash + n.
    netlist_text = netlist_match.group(1).replace('\\n', '\n').strip()

    # --- Step 2: Parse SPICE-style lines ---
    components = []
    nets = defaultdict(lambda: {"name": None, "connected_pins": []})

    for line_match in _EDIF_COMPONENT_LINE.finditer(netlist_text):
        name = line_match.group("name")
        pins = [line_match.group("n1"), line_match.group("n2")]
        components.append({
            "name": name,
            "type": _component_type(name),
            "pins": pins,
            # The parameter group is optional; report "" when it is absent.
            "params": line_match.group("rest") or "",
        })

        # Connect pins to nets
        for pin in pins:
            nets[pin]["name"] = pin
            nets[pin]["connected_pins"].append({"component": name, "pin": pin})

    # --- Step 3: Deduplicate net names ---
    # Keying the dict by net name already guarantees one entry per net.
    unique_nets = list(nets.values())

    # --- Step 4: Annotate nets ---
    for net in unique_nets:
        net_name = net["name"].lower()
        if net_name in ("vcc", "vdd", "vin"):
            net["voltage_domain"] = "high"
        elif net_name in ("gnd", "vss", "0"):
            net["voltage_domain"] = "ground"
        else:
            net["voltage_domain"] = "signal"

        # Simple pull heuristic
        if "pullup" in net_name:
            net["pull"] = "pull-up"
        elif "pulldown" in net_name:
            net["pull"] = "pull-down"
        else:
            net["pull"] = None

    # --- Step 5: Return structured output ---
    return {
        "components": components,
        "nets": unique_nets
    }
|