mfcli 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. mfcli/.env.example +72 -0
  2. mfcli/__init__.py +0 -0
  3. mfcli/agents/__init__.py +0 -0
  4. mfcli/agents/controller/__init__.py +0 -0
  5. mfcli/agents/controller/agent.py +19 -0
  6. mfcli/agents/controller/config.yaml +27 -0
  7. mfcli/agents/controller/tools.py +42 -0
  8. mfcli/agents/tools/general.py +118 -0
  9. mfcli/alembic/env.py +61 -0
  10. mfcli/alembic/script.py.mako +28 -0
  11. mfcli/alembic/versions/6ccc0c7c397c_added_fields_to_pdf_parts_model.py +39 -0
  12. mfcli/alembic/versions/769019ef4870_added_gemini_file_path_to_pdf_part_model.py +33 -0
  13. mfcli/alembic/versions/7a2e3a779fdc_added_functional_block_and_component_.py +54 -0
  14. mfcli/alembic/versions/7d5adb2a47a7_added_pdf_parts_model.py +41 -0
  15. mfcli/alembic/versions/7fcb7d6a5836_init.py +167 -0
  16. mfcli/alembic/versions/e0f2b5765c72_added_cascade_delete_for_models_that_.py +32 -0
  17. mfcli/alembic.ini +147 -0
  18. mfcli/cli/__init__.py +0 -0
  19. mfcli/cli/dependencies.py +59 -0
  20. mfcli/cli/main.py +192 -0
  21. mfcli/client/__init__.py +0 -0
  22. mfcli/client/chroma_db.py +184 -0
  23. mfcli/client/docling.py +44 -0
  24. mfcli/client/gemini.py +252 -0
  25. mfcli/client/llama_parse.py +38 -0
  26. mfcli/client/vector_db.py +93 -0
  27. mfcli/constants/__init__.py +0 -0
  28. mfcli/constants/base_enum.py +18 -0
  29. mfcli/constants/directory_names.py +1 -0
  30. mfcli/constants/file_types.py +189 -0
  31. mfcli/constants/gemini.py +1 -0
  32. mfcli/constants/openai.py +6 -0
  33. mfcli/constants/pipeline_run_status.py +3 -0
  34. mfcli/crud/__init__.py +0 -0
  35. mfcli/crud/file.py +42 -0
  36. mfcli/crud/functional_blocks.py +26 -0
  37. mfcli/crud/netlist.py +18 -0
  38. mfcli/crud/pipeline_run.py +17 -0
  39. mfcli/crud/project.py +99 -0
  40. mfcli/digikey/__init__.py +0 -0
  41. mfcli/digikey/digikey.py +105 -0
  42. mfcli/main.py +5 -0
  43. mfcli/mcp/__init__.py +0 -0
  44. mfcli/mcp/configs/cline_mcp_settings.json +11 -0
  45. mfcli/mcp/configs/mfcli.mcp.json +7 -0
  46. mfcli/mcp/mcp_instance.py +6 -0
  47. mfcli/mcp/server.py +37 -0
  48. mfcli/mcp/state_manager.py +51 -0
  49. mfcli/mcp/tools/__init__.py +0 -0
  50. mfcli/mcp/tools/query_knowledgebase.py +108 -0
  51. mfcli/models/__init__.py +10 -0
  52. mfcli/models/base.py +10 -0
  53. mfcli/models/bom.py +71 -0
  54. mfcli/models/datasheet.py +10 -0
  55. mfcli/models/debug_setup.py +64 -0
  56. mfcli/models/file.py +43 -0
  57. mfcli/models/file_docket.py +94 -0
  58. mfcli/models/file_metadata.py +19 -0
  59. mfcli/models/functional_blocks.py +94 -0
  60. mfcli/models/llm_response.py +5 -0
  61. mfcli/models/mcu.py +97 -0
  62. mfcli/models/mcu_errata.py +26 -0
  63. mfcli/models/netlist.py +59 -0
  64. mfcli/models/pdf_parts.py +25 -0
  65. mfcli/models/pipeline_run.py +34 -0
  66. mfcli/models/project.py +27 -0
  67. mfcli/models/project_metadata.py +15 -0
  68. mfcli/pipeline/__init__.py +0 -0
  69. mfcli/pipeline/analysis/__init__.py +0 -0
  70. mfcli/pipeline/analysis/bom_netlist_mapper.py +28 -0
  71. mfcli/pipeline/analysis/generators/__init__.py +0 -0
  72. mfcli/pipeline/analysis/generators/bom/__init__.py +0 -0
  73. mfcli/pipeline/analysis/generators/bom/bom.py +74 -0
  74. mfcli/pipeline/analysis/generators/debug_setup/__init__.py +0 -0
  75. mfcli/pipeline/analysis/generators/debug_setup/debug_setup.py +71 -0
  76. mfcli/pipeline/analysis/generators/debug_setup/instructions.py +150 -0
  77. mfcli/pipeline/analysis/generators/functional_blocks/__init__.py +0 -0
  78. mfcli/pipeline/analysis/generators/functional_blocks/functional_blocks.py +93 -0
  79. mfcli/pipeline/analysis/generators/functional_blocks/instructions.py +34 -0
  80. mfcli/pipeline/analysis/generators/functional_blocks/validator.py +94 -0
  81. mfcli/pipeline/analysis/generators/generator.py +258 -0
  82. mfcli/pipeline/analysis/generators/generator_base.py +18 -0
  83. mfcli/pipeline/analysis/generators/mcu/__init__.py +0 -0
  84. mfcli/pipeline/analysis/generators/mcu/instructions.py +156 -0
  85. mfcli/pipeline/analysis/generators/mcu/mcu.py +84 -0
  86. mfcli/pipeline/analysis/generators/mcu_errata/__init__.py +1 -0
  87. mfcli/pipeline/analysis/generators/mcu_errata/instructions.py +77 -0
  88. mfcli/pipeline/analysis/generators/mcu_errata/mcu_errata.py +95 -0
  89. mfcli/pipeline/analysis/generators/summary/__init__.py +0 -0
  90. mfcli/pipeline/analysis/generators/summary/summary.py +47 -0
  91. mfcli/pipeline/classifier.py +93 -0
  92. mfcli/pipeline/data_enricher.py +15 -0
  93. mfcli/pipeline/extractor.py +34 -0
  94. mfcli/pipeline/extractors/__init__.py +0 -0
  95. mfcli/pipeline/extractors/pdf.py +12 -0
  96. mfcli/pipeline/parser.py +120 -0
  97. mfcli/pipeline/parsers/__init__.py +0 -0
  98. mfcli/pipeline/parsers/netlist/__init__.py +0 -0
  99. mfcli/pipeline/parsers/netlist/edif.py +93 -0
  100. mfcli/pipeline/parsers/netlist/kicad_legacy_net.py +326 -0
  101. mfcli/pipeline/parsers/netlist/kicad_spice.py +135 -0
  102. mfcli/pipeline/parsers/netlist/pads.py +185 -0
  103. mfcli/pipeline/parsers/netlist/protel.py +166 -0
  104. mfcli/pipeline/parsers/netlist/protel_detector.py +29 -0
  105. mfcli/pipeline/pipeline.py +419 -0
  106. mfcli/pipeline/preprocessors/__init__.py +0 -0
  107. mfcli/pipeline/preprocessors/user_guide.py +127 -0
  108. mfcli/pipeline/run_context.py +32 -0
  109. mfcli/pipeline/schema_mapper.py +89 -0
  110. mfcli/pipeline/sub_classifier.py +115 -0
  111. mfcli/utils/__init__.py +0 -0
  112. mfcli/utils/config.py +33 -0
  113. mfcli/utils/configurator.py +324 -0
  114. mfcli/utils/data_cleaner.py +82 -0
  115. mfcli/utils/datasheet_vectorizer.py +281 -0
  116. mfcli/utils/directory_manager.py +96 -0
  117. mfcli/utils/file_upload.py +298 -0
  118. mfcli/utils/files.py +16 -0
  119. mfcli/utils/http_requests.py +54 -0
  120. mfcli/utils/kb_lister.py +89 -0
  121. mfcli/utils/kb_remover.py +173 -0
  122. mfcli/utils/logger.py +28 -0
  123. mfcli/utils/mcp_configurator.py +311 -0
  124. mfcli/utils/migrations.py +18 -0
  125. mfcli/utils/orm.py +43 -0
  126. mfcli/utils/pdf_splitter.py +63 -0
  127. mfcli/utils/query_service.py +22 -0
  128. mfcli/utils/system_check.py +306 -0
  129. mfcli/utils/tools.py +31 -0
  130. mfcli/utils/vectorizer.py +28 -0
  131. mfcli-0.2.0.dist-info/METADATA +841 -0
  132. mfcli-0.2.0.dist-info/RECORD +136 -0
  133. mfcli-0.2.0.dist-info/WHEEL +5 -0
  134. mfcli-0.2.0.dist-info/entry_points.txt +3 -0
  135. mfcli-0.2.0.dist-info/licenses/LICENSE +21 -0
  136. mfcli-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,166 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Protel/Altium Designer Netlist Parser
4
+
5
+ Parses Protel/Altium Designer netlist files and extracts:
6
+ - Reference designators (ref_des)
7
+ - Part numbers/footprints
8
+ - Pin connections (pin number + net name)
9
+
10
+ Format example:
11
+ {COMPONENT PROTEL.PCB
12
+ {DETAIL
13
+ {SUBCOMP
14
+ {I <footprint>.PRT <ref_des>
15
+ {CN
16
+ <pin> <net>
17
+ ...
18
+ }
19
+ }
20
+ }
21
+ }
22
+ }
23
+ """
24
+
25
+ from pathlib import Path
26
+ from typing import Dict
27
+
28
+ from mfcli.models.netlist import Component, NetlistSchema, Pin
29
+
30
+
31
class ProtelParser:
    """Parser for Protel/Altium Designer netlist files.

    The input text is split into whitespace-stripped lines once, then scanned
    for ``{I <footprint>.PRT <ref_des>`` component lines. The ``{CN`` block
    that follows a component line supplies its ``<pin> <net>`` connections.
    """

    # Marker that identifies a Protel/Altium netlist.
    _HEADER_MARKER = '{COMPONENT PROTEL.PCB'
    # How many leading lines are searched for the marker. This is the same
    # window the detector in protel_detector.py uses, so a file the detector
    # accepts is not rejected here.
    _HEADER_SCAN_LINES = 5

    def __init__(self, protel_content: str):
        """Store the raw text and its stripped lines.

        Args:
            protel_content: Full text of the netlist file.
        """
        self.content = protel_content
        self.lines = [line.strip() for line in protel_content.strip().split('\n')]
        # Keyed by reference designator; a repeated designator overwrites the
        # earlier component.
        self.components: Dict[str, Component] = {}

    def parse(self) -> NetlistSchema:
        """Parse the content and return the netlist schema.

        Returns:
            NetlistSchema built from every component found.

        Raises:
            ValueError: If the Protel/Altium header marker is missing.
        """
        if not self._validate_header():
            raise ValueError("Not a valid Protel/Altium netlist file (missing {COMPONENT PROTEL.PCB header)")

        self._parse_components()

        components_list = list(self.components.values())
        return NetlistSchema(components=components_list)

    def _validate_header(self) -> bool:
        """Return True if the header marker appears in the leading lines."""
        head = self.lines[:self._HEADER_SCAN_LINES]
        return any(self._HEADER_MARKER in line for line in head)

    @staticmethod
    def _is_component_line(line: str) -> bool:
        """Return True for a ``{I <footprint>.PRT <ref_des>`` line."""
        return line.startswith('{I ') and '.PRT ' in line

    def _parse_components(self):
        """Scan all lines and register one Component per component line."""
        i = 0
        while i < len(self.lines):
            line = self.lines[i]

            if self._is_component_line(line):
                parts = line.split()
                if len(parts) >= 3:
                    footprint = parts[1]  # e.g., "0603.PRT"
                    ref_des = parts[2]  # e.g., "C1"

                    # The footprint is stored without its .PRT extension.
                    if footprint.endswith('.PRT'):
                        footprint = footprint[:-4]

                    self.components[ref_des] = Component(
                        ref_des=ref_des,
                        part_number=footprint,
                        pins=[]
                    )

                    # _parse_pins returns the index scanning resumes from.
                    i = self._parse_pins(i + 1, ref_des)
                    continue

            i += 1

    def _parse_pins(self, start_idx: int, ref_des: str) -> int:
        """Collect the pins of one component from its ``{CN`` block.

        Args:
            start_idx: Index of the line right after the component line.
            ref_des: Reference designator of the component being filled.

        Returns:
            Index at which the caller should resume scanning: the line after
            the first ``}`` encountered, the index of the next component line
            if one shows up first, or ``len(self.lines)`` at end of input.
        """
        i = start_idx
        in_cn_block = False

        while i < len(self.lines):
            line = self.lines[i]

            if line == '{CN':
                in_cn_block = True
                i += 1
                continue

            # The first closing brace ends the scan, whether it closes the
            # {CN block or (when no {CN was seen) the component block.
            if line == '}':
                return i + 1

            # A new component line means this component's block was never
            # closed. Hand the line back unconsumed so the following pins are
            # not attributed to the wrong component.
            if self._is_component_line(line):
                return i

            if in_cn_block and line:
                # Format: <pin_number> <net_name>, e.g. "1 3V3" or "2 GND".
                parts = line.split(None, 1)
                pin_number = parts[0]
                net_name = parts[1] if len(parts) > 1 else ""

                # Lines that do not start with a digit are treated as
                # metadata; lines without a net name carry no connection.
                if pin_number[0].isdigit() and net_name and ref_des in self.components:
                    pin = Pin(pin=pin_number, net=net_name)

                    # Avoid duplicates
                    existing_pins = self.components[ref_des].pins
                    if not any(p.pin == pin.pin and p.net == pin.net for p in existing_pins):
                        existing_pins.append(pin)

            i += 1

        return i
138
+
139
+
140
def parse_protel_file(filepath: Path) -> NetlistSchema:
    """
    Read a Protel/Altium Designer netlist from disk and parse it.

    The file is decoded as UTF-8; undecodable bytes are replaced rather than
    raising.

    Args:
        filepath: Path to Protel/Altium netlist file

    Returns:
        NetlistSchema with components and pins

    Raises:
        FileNotFoundError: If file doesn't exist
        ValidationError: If parsed data doesn't match schema
        ValueError: If file is not a valid Protel/Altium netlist
    """
    if filepath.exists():
        text = filepath.read_text(encoding='utf-8', errors='replace')
        return ProtelParser(text).parse()

    raise FileNotFoundError(f"Protel/Altium netlist file not found: {filepath}")
@@ -0,0 +1,29 @@
1
+ """
2
+ Protel/Altium Designer Netlist Detector
3
+
4
+ Helper function to detect if a file is a Protel/Altium Designer netlist.
5
+ """
6
+
7
+
8
def is_protel_netlist(content: str) -> bool:
    """
    Detect if content is from a Protel/Altium Designer netlist file.

    Surrounding whitespace is stripped first, then the first five lines are
    searched for the ``{COMPONENT PROTEL.PCB`` marker.

    Args:
        content: File content to check

    Returns:
        True if content appears to be a Protel/Altium netlist
    """
    # Only five lines are inspected, so cap the split instead of breaking the
    # whole file into lines. maxsplit=5 leaves the unsplit remainder in a
    # sixth element, which the slice discards.
    head = content.strip().split('\n', 5)[:5]
    return any('{COMPONENT PROTEL.PCB' in line for line in head)
@@ -0,0 +1,419 @@
1
+ import json
2
+ import os.path
3
+ from pathlib import Path
4
+ from typing import Dict, List
5
+
6
+ from google.genai.types import File as GeminiFile
7
+
8
+ from mfcli.models.pdf_parts import PDFPart
9
+ from mfcli.models.project_metadata import ProjectConfig
10
+ from mfcli.pipeline.preprocessors.user_guide import preprocess_user_guide
11
+ from mfcli.pipeline.run_context import PipelineRunContext
12
+ from mfcli.utils.datasheet_vectorizer import DatasheetVectorizer
13
+
14
+ from mfcli.agents.tools.general import format_error_for_llm
15
+ from mfcli.client.chroma_db import ChromaClient
16
+ from mfcli.client.gemini import Gemini
17
+ from mfcli.constants.directory_names import MF_PROJECT_CONFIG_DIR_NAME
18
+ from mfcli.constants.file_types import (
19
+ SchemalessFileSubtypes,
20
+ FileTypes,
21
+ FileSubtypes,
22
+ PDFNoVectorizeFileSubtypes,
23
+ SummaryCheatSheetSubtypes
24
+ )
25
+ from mfcli.crud.file import create_file
26
+ from mfcli.crud.pipeline_run import create_pipeline_run
27
+ from mfcli.crud.project import get_project_by_name, read_project_config_file
28
+ from mfcli.models.file import File
29
+ from mfcli.models.file_docket import FileDocket, FileDocketEntry
30
+ from mfcli.models.pipeline_run import PipelineRun
31
+ from mfcli.models.project import Project
32
+ from mfcli.pipeline.analysis.bom_netlist_mapper import map_netlist_to_bom_entries
33
+ from mfcli.pipeline.analysis.generators.generator import Generator
34
+ from mfcli.pipeline.classifier import get_file_metadata, validate_file
35
+ from mfcli.pipeline.data_enricher import enrich_data_for_model
36
+ from mfcli.pipeline.extractor import extract_document_text
37
+ from mfcli.pipeline.parser import parse_schema
38
+ from mfcli.pipeline.schema_mapper import map_schema
39
+ from mfcli.pipeline.sub_classifier import FileSubtypeAnalyzer
40
+ from mfcli.utils.directory_manager import app_dirs
41
+ from mfcli.utils.logger import get_logger
42
+ from mfcli.utils.orm import Session
43
+ from mfcli.utils.pdf_splitter import PDFSplitter
44
+
45
+ logger = get_logger(__name__)
46
+
47
+
48
+ # TODO: IMPROVE get_file_subtype SO IT DOESN'T USE LLM
49
+
50
+
51
class PipelineRunner:
    """Drives one pipeline run over a project's files.

    A run walks the project repository and then the datasheet directory,
    pre-processes every file the file docket reports as new or changed, runs
    the analysis/generation step, and writes the docket back to disk.
    """

    def __init__(self, db: Session, project: Project, project_config: ProjectConfig):
        """Create the clients, the shared run context and load the docket.

        Args:
            db: Database session used for all reads and writes of this run.
            project: Project whose ``repo_dir`` is processed.
            project_config: Project configuration (vectorization switches etc.).
        """
        self._db = db
        self._project = project
        self.folder_path = project.repo_dir
        self.total_files = 0
        self.successfully_processed = 0
        self.failed_files = 0
        self.skipped_files = 0
        self.errors = []
        # Set in run(); None until then.
        self.pipeline_run: PipelineRun | None = None
        self._gemini = Gemini()
        # Uploaded Gemini files keyed by their Gemini file name.
        self._gemini_file_cache: Dict[str, GeminiFile] = {}
        self._chroma_db = ChromaClient(project.index_id)
        self._docket = FileDocket()
        self._vectorizer = DatasheetVectorizer(self._chroma_db)
        self._subtype_analyzer = FileSubtypeAnalyzer(self._gemini)
        self._config = project_config
        # pipeline_run is still None here; run() updates the context later.
        self._context = PipelineRunContext(
            db=self._db,
            pipeline_run=self.pipeline_run,
            gemini=self._gemini,
            gemini_file_cache=self._gemini_file_cache,
            docket=self._docket,
            config=self._config,
            vectorizer=self._vectorizer
        )
        # Track which file types were actually processed (not skipped) in this run
        self._processed_file_types: set[str] = set()
        # Load existing file docket if it exists
        self._load_existing_docket()

    def _load_existing_docket(self):
        """Load existing file docket from JSON file"""
        if app_dirs.file_docket_path and app_dirs.file_docket_path.exists():
            logger.info(f"Loading existing file docket from: {app_dirs.file_docket_path}")
            self._docket.load_from_json(app_dirs.file_docket_path)
        else:
            logger.info("No existing file docket found, starting fresh")

    def _add_to_file_docket(self, file: File):
        """Record a processed file in the in-memory docket.

        The ``vectorize`` flag is taken from the datasheet or hardware-file
        setting of the project configuration, depending on the file.
        """
        if file.is_datasheet:
            vectorize = self._config.vectorize_datasheets
        else:
            vectorize = self._config.vectorize_hw_files
        entry = FileDocketEntry(
            name=file.name,
            path=file.path,
            vectorize=vectorize,
            sub_type=FileSubtypes(file.sub_type).name,
            md5=file.md5,
            is_datasheet=bool(file.is_datasheet)
        )
        self._docket.add(entry)

    def _save_file_docket(self):
        """Serialize the docket entries to JSON and write them to disk."""
        json_data = json.dumps(self._docket.get_entries(), indent=2)
        with open(app_dirs.file_docket_path, "w") as f:
            f.write(json_data)

    async def _gemini_files_upload(self, files: List[File | PDFPart]) -> List[GeminiFile]:
        """Return a Gemini file for each input, uploading on cache miss.

        On upload the returned Gemini name is stored on the input object as
        ``gemini_file_id`` and used as the cache key.
        """
        gemini_files = []
        for file in files:
            logger.debug(f"Checking for {type(file)} Gemini file")
            if file.gemini_file_id in self._gemini_file_cache:
                gemini_files.append(self._gemini_file_cache[file.gemini_file_id])
                continue
            logger.debug(f"Uploading {type(file)} to Gemini API")
            gemini_file = await self._gemini.upload(file.path)
            self._gemini_file_cache[gemini_file.name] = gemini_file
            file.gemini_file_id = gemini_file.name
            gemini_files.append(gemini_file)
        return gemini_files

    async def _preprocess_pdf(self, file: File, file_path: str, content: bytes):
        """Classify a PDF from its head, then upload and/or vectorize it.

        Args:
            file: Database record of the PDF.
            file_path: Path of the PDF, used for logging.
            content: Raw bytes of the PDF.
        """
        logger.debug(f"Uploading file to Gemini: {file_path}")
        splitter = PDFSplitter(file.name, content)
        logger.debug("Splitting PDF head")
        pdf_head_path = splitter.split_pdf_head()
        gemini_pdf_head_file = await self._gemini.upload(pdf_head_path)
        logger.debug(f"Analyzing PDF subtype: {file_path}")
        await self._subtype_analyzer.analyze_pdf(file, gemini_pdf_head_file)

        # Pre-process user guide files which are too big to upload to Gemini normally
        # Extract the table of contents and split PDF into relevant content sections
        # These sections will be used to generate summaries in analysis phase of pipeline
        if file.sub_type in SummaryCheatSheetSubtypes:
            await preprocess_user_guide(
                context=self._context,
                file=file,
                pdf_head=gemini_pdf_head_file,
                content=content,
                splitter=splitter
            )
        else:
            gemini_files = await self._gemini_files_upload([file])
            logger.debug(f"Gemini files: {gemini_files}")

        # NOTE(review): the vectorization step is applied to every PDF here,
        # independent of the branch above — confirm this matches the intended
        # nesting.
        if self._config.vectorize_hw_files:
            if file.sub_type in PDFNoVectorizeFileSubtypes:
                logger.debug("PDF subtype does not require vectorization, skipping")
            else:
                logger.info(f"Chunking and vectorizing PDF: {file_path}")

                # Use Docling to chunk PDF file and then vectorize it
                self._vectorizer.vectorize_file_buf(
                    file_name=file.name,
                    file_bytes=content,
                    purpose=FileSubtypes(file.sub_type).name
                )
        else:
            logger.debug("vectorize_hw_files is set to False, skipping")

    async def _preprocess_other_file_types(self, file: File, file_path: str, content: bytes):
        """Classify, optionally vectorize, and parse a non-PDF file.

        Args:
            file: Database record of the file.
            file_path: Path of the file, used for logging.
            content: Raw bytes of the file.
        """
        logger.debug(f"File is not a PDF: {file_path}")
        text_content = extract_document_text(file, content)
        logger.debug(f"Analyzing file subtype: {file_path}")

        # Analyze subtype
        await self._subtype_analyzer.analyze_file(file, text_content)

        if self._config.vectorize_hw_files:
            logger.info(f"Vectorizing file: {file_path}")

            # Vectorize
            self._vectorizer.vectorize_text_content(
                text=text_content,
                file_name=file.name,
                purpose=FileSubtypes(file.sub_type).name,
                additional_metadata={"is_datasheet": file.is_datasheet}
            )

        # Ignore file subtypes that have no schema to parse like schematic files
        if file.sub_type in SchemalessFileSubtypes:
            logger.debug("File subtype is in ignore list, not parsing")
        else:
            logger.debug("File subtype is not in ignore list, parsing")

            # Map schema
            logger.debug(f"Mapping schema: {file_path}")
            schema_mapping = await map_schema(self._gemini, file.sub_type, text_content)

            # Parse schema from file
            instances = parse_schema(self._db, file, schema_mapping)

            # Enrich data
            await enrich_data_for_model(self._db, self._chroma_db, file.sub_type, instances)

    async def _preprocess_file(self, file_path: str, is_datasheet: bool) -> File:
        """Validate one file, create its record and pre-process it.

        Args:
            file_path: Path of the file to process.
            is_datasheet: Whether the file comes from the datasheet directory.

        Returns:
            The created File record, after the session has been committed.
        """
        logger.debug(f"Initializing Gemini client for pipeline: {self.pipeline_run.id}")

        # Retrieve metadata
        logger.debug(f"Retrieving file metadata: {file_path}")
        metadata, content = get_file_metadata(file_path, is_datasheet)
        logger.debug(f"Metadata retrieved: {file_path}")

        # Validate file
        validate_file(metadata)
        logger.debug(f"File type validated: {file_path}")

        # Create file
        file = create_file(self._db, self.pipeline_run.id, metadata)

        # PDF file pre-processing
        if file.type == FileTypes.PDF:
            await self._preprocess_pdf(file, file_path, content)

        # Other file types pre-processing
        else:
            await self._preprocess_other_file_types(file, file_path, content)

        self._db.commit()
        logger.info(f"Pre-processing complete: {file_path}")
        return file

    async def _run_analysis(self):
        """Run netlist-to-BOM mapping when relevant, then cheat-sheet generation."""
        logger.debug(f"Starting analysis for pipeline: {self.pipeline_run.id}")

        # Only run netlist-to-BOM mapping if BOM or netlist files were processed
        netlist_types = {"PROTEL_ALTIUM", "KICAD_LEGACY_NET", "KICAD_SPICE", "PADS", "EDIF"}
        should_map_netlist = (
            "BOM" in self._processed_file_types or
            any(nt in self._processed_file_types for nt in netlist_types)
        )

        if should_map_netlist:
            logger.info("Running netlist-to-BOM mapping (BOM or netlist files were processed)")
            map_netlist_to_bom_entries(self._db, self.pipeline_run.id)
        else:
            logger.info("Skipping netlist-to-BOM mapping (no BOM or netlist files processed)")

        logger.debug(f"Finished analysis for pipeline: {self.pipeline_run.id}")

        # Pass processed file types to generator for conditional generation
        generator = Generator(self._context, self._processed_file_types)
        await generator.generate_cheat_sheets()

    def _check_file_in_docket(self, file_path: str, file_md5: str) -> tuple[bool, bool]:
        """
        Check if file exists in docket and if MD5 matches.

        A file whose MD5 changed triggers a console prompt. If no answer can
        be read because stdin is closed or not interactive, the file is
        skipped and the old data is left untouched.

        Returns: (should_process, should_remove_old)
        """
        # Check if file with same path exists in docket
        existing_entry = self._docket.get_by_path(file_path)

        if not existing_entry:
            # File not in docket, process it
            return True, False

        # File exists in docket, check MD5
        if existing_entry.md5 == file_md5:
            # MD5 matches, skip processing
            logger.info(f"File already processed with matching MD5, skipping: {file_path}")
            return False, False

        # MD5 doesn't match, prompt user
        logger.warning(f"File exists in docket but MD5 has changed: {file_path}")
        logger.warning(f"  Old MD5: {existing_entry.md5}")
        logger.warning(f"  New MD5: {file_md5}")

        print(f"\n{'='*70}")
        print(f"File has been modified: {Path(file_path).name}")
        print(f"Path: {file_path}")
        print(f"Old MD5: {existing_entry.md5}")
        print(f"New MD5: {file_md5}")
        print(f"{'='*70}")
        try:
            response = input("Do you want to delete the old file data and process the new version? (yes/no): ").strip().lower()
        except EOFError:
            # input() raises EOFError when there is nothing to read. Never
            # delete existing data without an explicit confirmation.
            logger.warning(f"No interactive input available, skipping modified file: {file_path}")
            return False, False

        if response in ['yes', 'y']:
            logger.info(f"User confirmed deletion and reprocessing of: {file_path}")
            return True, True
        else:
            logger.info(f"User declined reprocessing, skipping: {file_path}")
            return False, False

    def _remove_file_from_kb(self, entry: FileDocketEntry):
        """Remove file from knowledge base (ChromaDB)

        Chunks are matched on the ``file_name`` metadata value. Errors are
        logged and swallowed so that one failed removal does not abort the
        run.
        """
        try:
            collection = self._chroma_db._collection
            # NOTE(review): this reads the whole collection and filters in
            # Python; a metadata filter on the query may be cheaper.
            results = collection.get()

            if not results or not results.get('metadatas'):
                logger.warning(f"No data found in knowledge base to remove for: {entry.name}")
                return

            # Find matching chunks for this file
            matching_ids = []
            for idx, metadata in enumerate(results['metadatas']):
                if metadata and metadata.get('file_name') == entry.name:
                    matching_ids.append(results['ids'][idx])

            if matching_ids:
                collection.delete(ids=matching_ids)
                logger.info(f"Removed {len(matching_ids)} chunks from knowledge base for: {entry.name}")
            else:
                logger.info(f"No chunks found in knowledge base for: {entry.name}")

            # Remove from docket
            self._docket.remove(entry)
            logger.info(f"Removed file from docket: {entry.name}")

        except Exception as e:
            logger.error(f"Error removing file from knowledge base: {entry.name}")
            logger.exception(e)

    async def _preprocess_folder(self, folder_path: str | Path, is_datasheet: bool):
        """Walk a folder and pre-process every file that needs it.

        Each file is counted exactly once as processed, skipped or failed. A
        failure on one file is recorded in ``self.errors`` and does not stop
        the walk.

        Args:
            folder_path: Root directory to walk.
            is_datasheet: Passed through for every file found below the root.
        """
        ignore_dirs = [MF_PROJECT_CONFIG_DIR_NAME]
        for dir_path, dir_names, file_names in os.walk(folder_path):
            # Prune in place so os.walk does not descend into ignored dirs.
            dir_names[:] = [d for d in dir_names if d not in ignore_dirs]
            for file_name in file_names:
                self.total_files += 1
                file_path = os.path.join(dir_path, file_name)

                try:
                    # Get file metadata to check MD5
                    logger.debug(f"Checking file: {file_path}")
                    metadata, _ = get_file_metadata(file_path, is_datasheet)

                    # Check if file should be processed
                    should_process, should_remove_old = self._check_file_in_docket(file_path, metadata.md5)

                    if not should_process:
                        # Skip this file
                        self.skipped_files += 1
                        logger.info(f"Skipping file: {file_path}")
                        continue

                    # If we need to remove old version first
                    if should_remove_old:
                        existing_entry = self._docket.get_by_path(file_path)
                        if existing_entry:
                            logger.info(f"Removing old version from knowledge base: {file_path}")
                            self._remove_file_from_kb(existing_entry)

                    # Process the file
                    logger.info(f"Pre-processing file: {file_path}")
                    file = await self._preprocess_file(file_path, is_datasheet)
                    self._add_to_file_docket(file)

                    # Track the file subtype as processed
                    if file.sub_type:
                        self._processed_file_types.add(FileSubtypes(file.sub_type).name)

                    # Count success last, so a failure in the steps above is
                    # not reported as both processed and failed.
                    self.successfully_processed += 1

                except Exception as e:
                    self.failed_files += 1
                    self.errors.append({"file_path": file_path, "error": str(e)})
                    logger.exception(e)
                    logger.error(f"Error processing file: {file_path}")

    async def run(self):
        """Execute the full pipeline and print a summary.

        Returns:
            None when the run completes; otherwise the result of
            ``format_error_for_llm`` for the exception that aborted it.
        """
        try:
            logger.info(f"Starting pipeline for directory: {self.folder_path}")
            self.pipeline_run = create_pipeline_run(self._db, self._project)
            # NOTE(review): the context was constructed with the keyword
            # ``pipeline_run`` but is updated through ``run`` here — confirm
            # both refer to the same attribute.
            self._context.run = self.pipeline_run
            await self._preprocess_folder(self.folder_path, False)
            logger.info(f"Finished pre-processing folder: {self.folder_path}")

            # Run pre-processing on datasheets which were just downloaded
            logger.info(f"Starting pre-processing of datasheets: {app_dirs.data_sheets_dir}")
            await self._preprocess_folder(app_dirs.data_sheets_dir, True)
            logger.info(f"Finished pre-processing of datasheets: {app_dirs.data_sheets_dir}")

            logger.info(f"Preprocessing finished: {self.folder_path}")
            logger.info(f"Running analysis step: {self.folder_path}")
            await self._run_analysis()
            self._db.commit()
            self._save_file_docket()
            report = json.dumps({
                "total_files": self.total_files,
                "successfully_processed": self.successfully_processed,
                "skipped_files": self.skipped_files,
                "failed_files": self.failed_files,
                "errors": self.errors
            })
            logger.info("Finished pipeline")
            logger.info(f"Report: {report}")

            # Print summary to console
            print(f"\n{'='*70}")
            print("Pipeline Execution Summary")
            print(f"{'='*70}")
            print(f"Total files found: {self.total_files}")
            print(f"Successfully processed: {self.successfully_processed}")
            print(f"Skipped (already processed): {self.skipped_files}")
            print(f"Failed: {self.failed_files}")
            print(f"{'='*70}\n")
        except Exception as e:
            logger.exception(e)
            logger.error(f"Error in pipeline: {e}")
            return format_error_for_llm(e)
402
+
403
+
404
async def run_pipeline_with_config(project_config: ProjectConfig):
    """Run the pipeline for the project named in ``project_config``.

    The project is looked up by ``project_config.name`` inside a fresh
    database session, and the given configuration is used as-is.

    :param project_config: Configuration of the project to process
    :return: Whatever ``PipelineRunner.run()`` returns
    """
    with Session() as db:
        target_project = get_project_by_name(db, project_config.name)
        runner = PipelineRunner(db, target_project, project_config)
        outcome = await runner.run()
        return outcome
408
+
409
+
410
async def run_pipeline(project_name: str) -> str:
    """
    The controller agent will call this tool to start the pipeline processing for all the files in a directory.
    :param project_name: The name of the project
    :return: Status of the pipeline run
    """
    # The docstring above is left untouched: presumably the agent framework
    # reads it as the tool description — TODO confirm.
    with Session() as db:
        # Look the project up first, then read the configuration file.
        target_project = get_project_by_name(db, project_name)
        config_from_file = read_project_config_file()
        runner = PipelineRunner(db, target_project, config_from_file)
        outcome = await runner.run()
        return outcome
File without changes