mfcli 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. mfcli/.env.example +72 -0
  2. mfcli/__init__.py +0 -0
  3. mfcli/agents/__init__.py +0 -0
  4. mfcli/agents/controller/__init__.py +0 -0
  5. mfcli/agents/controller/agent.py +19 -0
  6. mfcli/agents/controller/config.yaml +27 -0
  7. mfcli/agents/controller/tools.py +42 -0
  8. mfcli/agents/tools/general.py +118 -0
  9. mfcli/alembic/env.py +61 -0
  10. mfcli/alembic/script.py.mako +28 -0
  11. mfcli/alembic/versions/6ccc0c7c397c_added_fields_to_pdf_parts_model.py +39 -0
  12. mfcli/alembic/versions/769019ef4870_added_gemini_file_path_to_pdf_part_model.py +33 -0
  13. mfcli/alembic/versions/7a2e3a779fdc_added_functional_block_and_component_.py +54 -0
  14. mfcli/alembic/versions/7d5adb2a47a7_added_pdf_parts_model.py +41 -0
  15. mfcli/alembic/versions/7fcb7d6a5836_init.py +167 -0
  16. mfcli/alembic/versions/e0f2b5765c72_added_cascade_delete_for_models_that_.py +32 -0
  17. mfcli/alembic.ini +147 -0
  18. mfcli/cli/__init__.py +0 -0
  19. mfcli/cli/dependencies.py +59 -0
  20. mfcli/cli/main.py +200 -0
  21. mfcli/client/__init__.py +0 -0
  22. mfcli/client/chroma_db.py +184 -0
  23. mfcli/client/docling.py +44 -0
  24. mfcli/client/gemini.py +252 -0
  25. mfcli/client/llama_parse.py +38 -0
  26. mfcli/client/vector_db.py +93 -0
  27. mfcli/constants/__init__.py +0 -0
  28. mfcli/constants/base_enum.py +18 -0
  29. mfcli/constants/directory_names.py +1 -0
  30. mfcli/constants/file_types.py +189 -0
  31. mfcli/constants/gemini.py +1 -0
  32. mfcli/constants/openai.py +6 -0
  33. mfcli/constants/pipeline_run_status.py +3 -0
  34. mfcli/crud/__init__.py +0 -0
  35. mfcli/crud/file.py +42 -0
  36. mfcli/crud/functional_blocks.py +26 -0
  37. mfcli/crud/netlist.py +18 -0
  38. mfcli/crud/pipeline_run.py +17 -0
  39. mfcli/crud/project.py +144 -0
  40. mfcli/digikey/__init__.py +0 -0
  41. mfcli/digikey/digikey.py +105 -0
  42. mfcli/main.py +5 -0
  43. mfcli/mcp/__init__.py +0 -0
  44. mfcli/mcp/configs/cline_mcp_settings.json +11 -0
  45. mfcli/mcp/configs/mfcli.mcp.json +7 -0
  46. mfcli/mcp/mcp_instance.py +6 -0
  47. mfcli/mcp/server.py +37 -0
  48. mfcli/mcp/state_manager.py +51 -0
  49. mfcli/mcp/tools/__init__.py +0 -0
  50. mfcli/mcp/tools/query_knowledgebase.py +108 -0
  51. mfcli/models/__init__.py +10 -0
  52. mfcli/models/base.py +10 -0
  53. mfcli/models/bom.py +71 -0
  54. mfcli/models/datasheet.py +10 -0
  55. mfcli/models/debug_setup.py +64 -0
  56. mfcli/models/file.py +43 -0
  57. mfcli/models/file_docket.py +94 -0
  58. mfcli/models/file_metadata.py +19 -0
  59. mfcli/models/functional_blocks.py +94 -0
  60. mfcli/models/llm_response.py +5 -0
  61. mfcli/models/mcu.py +97 -0
  62. mfcli/models/mcu_errata.py +26 -0
  63. mfcli/models/netlist.py +59 -0
  64. mfcli/models/pdf_parts.py +25 -0
  65. mfcli/models/pipeline_run.py +34 -0
  66. mfcli/models/project.py +27 -0
  67. mfcli/models/project_metadata.py +15 -0
  68. mfcli/pipeline/__init__.py +0 -0
  69. mfcli/pipeline/analysis/__init__.py +0 -0
  70. mfcli/pipeline/analysis/bom_netlist_mapper.py +28 -0
  71. mfcli/pipeline/analysis/generators/__init__.py +0 -0
  72. mfcli/pipeline/analysis/generators/bom/__init__.py +0 -0
  73. mfcli/pipeline/analysis/generators/bom/bom.py +74 -0
  74. mfcli/pipeline/analysis/generators/debug_setup/__init__.py +0 -0
  75. mfcli/pipeline/analysis/generators/debug_setup/debug_setup.py +71 -0
  76. mfcli/pipeline/analysis/generators/debug_setup/instructions.py +150 -0
  77. mfcli/pipeline/analysis/generators/functional_blocks/__init__.py +0 -0
  78. mfcli/pipeline/analysis/generators/functional_blocks/functional_blocks.py +93 -0
  79. mfcli/pipeline/analysis/generators/functional_blocks/instructions.py +34 -0
  80. mfcli/pipeline/analysis/generators/functional_blocks/validator.py +94 -0
  81. mfcli/pipeline/analysis/generators/generator.py +258 -0
  82. mfcli/pipeline/analysis/generators/generator_base.py +18 -0
  83. mfcli/pipeline/analysis/generators/mcu/__init__.py +0 -0
  84. mfcli/pipeline/analysis/generators/mcu/instructions.py +156 -0
  85. mfcli/pipeline/analysis/generators/mcu/mcu.py +84 -0
  86. mfcli/pipeline/analysis/generators/mcu_errata/__init__.py +1 -0
  87. mfcli/pipeline/analysis/generators/mcu_errata/instructions.py +77 -0
  88. mfcli/pipeline/analysis/generators/mcu_errata/mcu_errata.py +95 -0
  89. mfcli/pipeline/analysis/generators/summary/__init__.py +0 -0
  90. mfcli/pipeline/analysis/generators/summary/summary.py +47 -0
  91. mfcli/pipeline/classifier.py +93 -0
  92. mfcli/pipeline/data_enricher.py +15 -0
  93. mfcli/pipeline/extractor.py +34 -0
  94. mfcli/pipeline/extractors/__init__.py +0 -0
  95. mfcli/pipeline/extractors/pdf.py +12 -0
  96. mfcli/pipeline/parser.py +120 -0
  97. mfcli/pipeline/parsers/__init__.py +0 -0
  98. mfcli/pipeline/parsers/netlist/__init__.py +0 -0
  99. mfcli/pipeline/parsers/netlist/edif.py +93 -0
  100. mfcli/pipeline/parsers/netlist/kicad_legacy_net.py +326 -0
  101. mfcli/pipeline/parsers/netlist/kicad_spice.py +135 -0
  102. mfcli/pipeline/parsers/netlist/pads.py +185 -0
  103. mfcli/pipeline/parsers/netlist/protel.py +166 -0
  104. mfcli/pipeline/parsers/netlist/protel_detector.py +29 -0
  105. mfcli/pipeline/pipeline.py +470 -0
  106. mfcli/pipeline/preprocessors/__init__.py +0 -0
  107. mfcli/pipeline/preprocessors/user_guide.py +127 -0
  108. mfcli/pipeline/run_context.py +32 -0
  109. mfcli/pipeline/schema_mapper.py +89 -0
  110. mfcli/pipeline/sub_classifier.py +115 -0
  111. mfcli/utils/__init__.py +0 -0
  112. mfcli/utils/cline_rules.py +256 -0
  113. mfcli/utils/config.py +33 -0
  114. mfcli/utils/configurator.py +324 -0
  115. mfcli/utils/data_cleaner.py +114 -0
  116. mfcli/utils/datasheet_vectorizer.py +283 -0
  117. mfcli/utils/directory_manager.py +116 -0
  118. mfcli/utils/file_upload.py +298 -0
  119. mfcli/utils/files.py +16 -0
  120. mfcli/utils/http_requests.py +54 -0
  121. mfcli/utils/kb_lister.py +89 -0
  122. mfcli/utils/kb_remover.py +173 -0
  123. mfcli/utils/logger.py +28 -0
  124. mfcli/utils/mcp_configurator.py +394 -0
  125. mfcli/utils/migrations.py +18 -0
  126. mfcli/utils/orm.py +43 -0
  127. mfcli/utils/pdf_splitter.py +63 -0
  128. mfcli/utils/pre_uninstall.py +167 -0
  129. mfcli/utils/query_service.py +22 -0
  130. mfcli/utils/system_check.py +306 -0
  131. mfcli/utils/tools.py +98 -0
  132. mfcli/utils/vectorizer.py +28 -0
  133. mfcli-0.2.1.dist-info/METADATA +956 -0
  134. mfcli-0.2.1.dist-info/RECORD +138 -0
  135. mfcli-0.2.1.dist-info/WHEEL +5 -0
  136. mfcli-0.2.1.dist-info/entry_points.txt +4 -0
  137. mfcli-0.2.1.dist-info/licenses/LICENSE +21 -0
  138. mfcli-0.2.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,470 @@
1
+ import json
2
+ import os.path
3
+ from pathlib import Path
4
+ from typing import Dict, List
5
+
6
+ from google.genai.types import File as GeminiFile
7
+
8
+ from mfcli.models.pdf_parts import PDFPart
9
+ from mfcli.models.project_metadata import ProjectConfig
10
+ from mfcli.pipeline.preprocessors.user_guide import preprocess_user_guide
11
+ from mfcli.pipeline.run_context import PipelineRunContext
12
+ from mfcli.utils.datasheet_vectorizer import DatasheetVectorizer
13
+
14
+ from mfcli.agents.tools.general import format_error_for_llm
15
+ from mfcli.client.chroma_db import ChromaClient
16
+ from mfcli.client.gemini import Gemini
17
+ from mfcli.constants.directory_names import MF_PROJECT_CONFIG_DIR_NAME
18
+ from mfcli.constants.file_types import (
19
+ SchemalessFileSubtypes,
20
+ FileTypes,
21
+ FileSubtypes,
22
+ PDFNoVectorizeFileSubtypes,
23
+ SummaryCheatSheetSubtypes
24
+ )
25
+ from mfcli.crud.file import create_file
26
+ from mfcli.crud.pipeline_run import create_pipeline_run
27
+ from mfcli.crud.project import get_project_by_name, read_project_config_file
28
+ from mfcli.models.file import File
29
+ from mfcli.models.file_docket import FileDocket, FileDocketEntry
30
+ from mfcli.models.pipeline_run import PipelineRun
31
+ from mfcli.models.project import Project
32
+ from mfcli.pipeline.analysis.bom_netlist_mapper import map_netlist_to_bom_entries
33
+ from mfcli.pipeline.analysis.generators.generator import Generator
34
+ from mfcli.pipeline.classifier import get_file_metadata, validate_file
35
+ from mfcli.pipeline.data_enricher import enrich_data_for_model
36
+ from mfcli.pipeline.extractor import extract_document_text
37
+ from mfcli.pipeline.parser import parse_schema
38
+ from mfcli.pipeline.schema_mapper import map_schema
39
+ from mfcli.pipeline.sub_classifier import FileSubtypeAnalyzer
40
+ from mfcli.utils.directory_manager import app_dirs
41
+ from mfcli.utils.logger import get_logger
42
+ from mfcli.utils.orm import Session
43
+ from mfcli.utils.pdf_splitter import PDFSplitter
44
+
45
+ logger = get_logger(__name__)
46
+
47
+
48
+ # TODO: IMPROVE get_file_subtype SO IT DOESN'T USE LLM
49
+
50
+
51
+ class PipelineRunner:
52
def __init__(self, db: Session, project: Project, project_config: ProjectConfig):
    """Wire up the clients, counters and shared context used by one pipeline run.

    :param db: ORM session used for all persistence done by the pipeline
    :param project: project being processed; its ``index_id`` selects the Chroma index
    :param project_config: project configuration (e.g. the vectorization flags)
    """
    self._db, self._project, self._config = db, project, project_config

    # Files are ingested from the context folder by default
    self.folder_path = str(app_dirs.context_dir)

    # Counters and error list reported at the end of run()
    self.total_files = self.successfully_processed = 0
    self.failed_files = self.skipped_files = 0
    self.errors = []

    # File subtypes that were actually processed (not skipped) in this run
    self._processed_file_types: set[str] = set()

    # Stays None until run() creates the pipeline run record
    self.pipeline_run: PipelineRun | None = None

    gemini = Gemini()
    gemini_file_cache: Dict[str, GeminiFile] = {}
    chroma_db = ChromaClient(project.index_id)
    docket = FileDocket()
    vectorizer = DatasheetVectorizer(chroma_db)

    self._gemini = gemini
    self._gemini_file_cache = gemini_file_cache
    self._chroma_db = chroma_db
    self._docket = docket
    self._vectorizer = vectorizer
    self._subtype_analyzer = FileSubtypeAnalyzer(gemini)

    # The context shares the very same cache/docket objects with the runner
    self._context = PipelineRunContext(
        db=db,
        pipeline_run=self.pipeline_run,
        gemini=gemini,
        gemini_file_cache=gemini_file_cache,
        docket=docket,
        config=project_config,
        vectorizer=vectorizer,
    )

    # Pick up the docket written by a previous run, if there is one
    self._load_existing_docket()
83
+
84
def _load_existing_docket(self):
    """Populate the in-memory docket from the JSON docket file when one exists."""
    docket_path = app_dirs.file_docket_path
    if not (docket_path and docket_path.exists()):
        logger.info("No existing file docket found, starting fresh")
        return

    logger.info(f"Loading existing file docket from: {docket_path}")
    self._docket.load_from_json(docket_path)
91
+
92
def _add_to_file_docket(self, file: File):
    """Record a processed file in the docket.

    The entry's ``vectorize`` flag is taken from the project config:
    ``vectorize_datasheets`` for datasheets, ``vectorize_hw_files`` otherwise.

    :param file: the file record produced by pre-processing
    """
    config = self._config
    vectorize = config.vectorize_datasheets if file.is_datasheet else config.vectorize_hw_files
    self._docket.add(
        FileDocketEntry(
            name=file.name,
            path=file.path,
            vectorize=vectorize,
            # Stored by enum member name, not by raw value
            sub_type=FileSubtypes(file.sub_type).name,
            md5=file.md5,
            is_datasheet=bool(file.is_datasheet),
        )
    )
106
+
107
def _save_file_docket(self):
    """Write the docket entries to the docket file as indented JSON."""
    # Serialize before opening so a serialization error cannot truncate the file
    serialized = json.dumps(self._docket.get_entries(), indent=2)
    with open(app_dirs.file_docket_path, "w") as docket_file:
        docket_file.write(serialized)
111
+
112
def _check_context_folder_has_files(self) -> bool:
    """
    Tell whether the context folder holds at least one file.

    The folder is walked recursively; the project config directory is not
    descended into. A missing folder is logged and treated as empty.

    :return: True if any file was found, False otherwise
    """
    root = Path(self.folder_path)
    if not root.exists():
        logger.warning(f"Context folder does not exist: {root}")
        return False

    skipped = (MF_PROJECT_CONFIG_DIR_NAME,)
    for _current, sub_dirs, found in os.walk(root):
        # Prune in place so os.walk does not descend into skipped directories
        sub_dirs[:] = [name for name in sub_dirs if name not in skipped]
        if found:
            return True
    return False
131
+
132
def _display_empty_context_message(self):
    """
    Print guidance for an empty context folder and log that the run was cancelled.
    """
    rule = "=" * 70
    lines = [
        f"\n{rule}",
        "CONTEXT FOLDER IS EMPTY",
        rule,
        "\nThe context folder contains no files to process:",
        f" {Path(self.folder_path)}",
        "\nTo run the pipeline, please add critical files to this folder, such as:",
        " • Bill of Materials (BOM) files",
        " • Schematics (PDF or other supported formats)",
        " • MCU/IC user manuals and datasheets",
        " • Netlist files",
        " • Reference designs",
        " • Application notes",
        " • Any other hardware design documentation",
        "\nOnce you've added your files, run 'mfcli run' again.",
        f"{rule}\n",
    ]
    # One print with newline-joined lines emits the same text as line-by-line prints
    print("\n".join(lines))

    logger.info("Pipeline execution cancelled: context folder is empty")
155
+
156
async def _gemini_files_upload(self, files: List[File | PDFPart]) -> List[GeminiFile]:
    """Return a Gemini file handle for every given file, uploading on cache miss.

    A file whose ``gemini_file_id`` is already in the runner's cache is served
    from it. Otherwise the file at ``path`` is uploaded, cached under the
    Gemini file name, and that name is stored on the file as ``gemini_file_id``.

    :param files: file or PDF-part records to resolve
    :return: Gemini file handles, in the same order as ``files``
    """
    cache = self._gemini_file_cache
    resolved = []
    for item in files:
        logger.debug(f"Checking for {type(item)} Gemini file")
        if item.gemini_file_id in cache:
            handle = cache[item.gemini_file_id]
        else:
            logger.debug(f"Uploading {type(item)} to Gemini API")
            handle = await self._gemini.upload(item.path)
            cache[handle.name] = handle
            item.gemini_file_id = handle.name
        resolved.append(handle)
    return resolved
169
+
170
async def _preprocess_pdf(self, file: File, file_path: str, content: bytes):
    """Classify a PDF, make it available to Gemini, and optionally vectorize it.

    Steps: split off the PDF head and upload it, let the subtype analyzer
    classify the file from that head, then either run the user-guide
    pre-processing (subtypes in ``SummaryCheatSheetSubtypes``) or upload the
    file itself, and finally vectorize the raw bytes when enabled.

    :param file: file record for this PDF
    :param file_path: path of the PDF, used here for log messages only
    :param content: raw bytes of the PDF
    """
    logger.debug(f"Uploading file to Gemini: {file_path}")
    splitter = PDFSplitter(file.name, content)
    logger.debug("Splitting PDF head")
    pdf_head_path = splitter.split_pdf_head()
    gemini_pdf_head_file = await self._gemini.upload(pdf_head_path)
    logger.debug(f"Analyzing PDF subtype: {file_path}")
    # The branches below read file.sub_type, so the analyzer is presumably
    # what sets it — confirm against FileSubtypeAnalyzer.analyze_pdf
    await self._subtype_analyzer.analyze_pdf(file, gemini_pdf_head_file)

    # Pre-process user guide files which are too big to upload to Gemini normally
    # Extract the table of contents and split PDF into relevant content sections
    # These sections will be used to generate summaries in analysis phase of pipeline
    if file.sub_type in SummaryCheatSheetSubtypes:
        await preprocess_user_guide(
            context=self._context,
            file=file,
            pdf_head=gemini_pdf_head_file,
            content=content,
            splitter=splitter
        )
    else:
        # Any other PDF is uploaded as-is (or served from the Gemini file cache)
        gemini_files = await self._gemini_files_upload([file])
        logger.debug(f"Gemini files: {gemini_files}")

    # NOTE(review): nesting reconstructed from a flattened listing; this block
    # is assumed to sit at method level (applies to every PDF) — confirm.
    # NOTE(review): gated on vectorize_hw_files even when the file is a
    # datasheet, whereas _add_to_file_docket records vectorize_datasheets for
    # datasheets — confirm this is intended.
    if self._config.vectorize_hw_files:
        if file.sub_type in PDFNoVectorizeFileSubtypes:
            logger.debug(f"PDF subtype does not require vectorization, skipping")
        else:
            logger.info(f"Chunking and vectorizing PDF: {file_path}")

            # Use Docling to chunk PDF file and then vectorize it
            self._vectorizer.vectorize_file_buf(
                file_name=file.name,
                file_bytes=content,
                purpose=FileSubtypes(file.sub_type).name
            )
    else:
        logger.debug(f"vectorize_hw_files is set to False, skipping")
208
+
209
async def _preprocess_other_file_types(self, file: File, file_path: str, content: bytes):
    """Pre-process a non-PDF file: classify, optionally vectorize, then parse.

    Text is extracted and passed to the subtype analyzer. When
    ``vectorize_hw_files`` is enabled the text is vectorized. Subtypes listed in
    ``SchemalessFileSubtypes`` stop there; for all others the schema is mapped,
    parsed into model instances, and the instances are enriched.

    :param file: file record being processed
    :param file_path: path of the file, used here for log messages only
    :param content: raw bytes of the file
    """
    logger.debug(f"File is not a PDF: {file_path}")
    text = extract_document_text(file, content)
    logger.debug(f"Analyzing file subtype: {file_path}")
    await self._subtype_analyzer.analyze_file(file, text)

    if self._config.vectorize_hw_files:
        logger.info(f"Vectorizing file: {file_path}")
        self._vectorizer.vectorize_text_content(
            text=text,
            file_name=file.name,
            purpose=FileSubtypes(file.sub_type).name,
            additional_metadata={"is_datasheet": file.is_datasheet},
        )

    # Subtypes without a parseable schema (e.g. schematic files) are done here
    if file.sub_type in SchemalessFileSubtypes:
        logger.debug(f"File subtype is in ignore list, not parsing")
        return

    logger.debug(f"File subtype is not in ignore list, parsing")
    logger.debug(f"Mapping schema: {file_path}")
    mapping = await map_schema(self._gemini, file.sub_type, text)
    parsed = parse_schema(self._db, file, mapping)
    await enrich_data_for_model(self._db, self._chroma_db, file.sub_type, parsed)
243
+
244
async def _preprocess_file(self, file_path: str, is_datasheet: bool) -> File:
    """Create the file record for ``file_path`` and run type-specific pre-processing.

    :param file_path: path of the file to ingest
    :param is_datasheet: forwarded to ``get_file_metadata``
    :return: the created file record, after the session has been committed
    """
    run_id = self.pipeline_run.id
    logger.debug(f"Initializing Gemini client for pipeline: {run_id}")

    logger.debug(f"Retrieving file metadata: {file_path}")
    metadata, content = get_file_metadata(file_path, is_datasheet)
    logger.debug(f"Metadata retrieved: {file_path}")

    validate_file(metadata)
    logger.debug(f"File type validated: {file_path}")

    file = create_file(self._db, run_id, metadata)

    # PDFs have their own path; everything else shares the generic one
    if file.type == FileTypes.PDF:
        handler = self._preprocess_pdf
    else:
        handler = self._preprocess_other_file_types
    await handler(file, file_path, content)

    self._db.commit()
    logger.info(f"Pre-processing complete: {file_path}")
    return file
270
+
271
async def _run_analysis(self):
    """Run the analysis phase: netlist-to-BOM mapping, then cheat-sheet generation.

    The mapping only runs when a BOM or one of the netlist subtypes was
    processed in this run; the generator always runs and receives the set of
    processed subtypes.
    """
    run_id = self.pipeline_run.id
    logger.debug(f"Starting analysis for pipeline: {run_id}")

    # Processing any of these subtypes triggers the netlist-to-BOM mapping
    mapping_triggers = {"BOM", "PROTEL_ALTIUM", "KICAD_LEGACY_NET", "KICAD_SPICE", "PADS", "EDIF"}
    if self._processed_file_types & mapping_triggers:
        logger.info("Running netlist-to-BOM mapping (BOM or netlist files were processed)")
        map_netlist_to_bom_entries(self._db, run_id)
    else:
        logger.info("Skipping netlist-to-BOM mapping (no BOM or netlist files processed)")

    logger.debug(f"Finished analysis for pipeline: {run_id}")

    # The generator uses the processed subtypes for conditional generation
    await Generator(self._context, self._processed_file_types).generate_cheat_sheets()
292
+
293
def _check_file_in_docket(self, file_path: str, file_md5: str) -> tuple[bool, bool]:
    """
    Decide whether a file must be processed, based on the docket and its MD5.

    Unknown path: process. Known path with the same MD5: skip. Known path with
    a different MD5: ask the user on the console whether to replace the old data.

    Returns: (should_process, should_remove_old)
    """
    known = self._docket.get_by_path(file_path)
    if not known:
        return True, False

    if known.md5 == file_md5:
        logger.info(f"File already processed with matching MD5, skipping: {file_path}")
        return False, False

    # Same path, different content: leave the decision to the user
    logger.warning(f"File exists in docket but MD5 has changed: {file_path}")
    logger.warning(f" Old MD5: {known.md5}")
    logger.warning(f" New MD5: {file_md5}")

    rule = "=" * 70
    print(f"\n{rule}")
    print(f"File has been modified: {Path(file_path).name}")
    print(f"Path: {file_path}")
    print(f"Old MD5: {known.md5}")
    print(f"New MD5: {file_md5}")
    print(rule)
    answer = input("Do you want to delete the old file data and process the new version? (yes/no): ")

    confirmed = answer.strip().lower() in ("yes", "y")
    if confirmed:
        logger.info(f"User confirmed deletion and reprocessing of: {file_path}")
    else:
        logger.info(f"User declined reprocessing, skipping: {file_path}")
    # Confirmation means both "process" and "remove old"; refusal means neither
    return confirmed, confirmed
330
+
331
def _remove_file_from_kb(self, entry: FileDocketEntry):
    """Remove a file's chunks from the knowledge base (ChromaDB) and drop its docket entry.

    Chunks are matched on the ``file_name`` metadata field. The docket entry is
    removed even when the knowledge base holds no chunks for the file (for
    example when nothing was vectorized), so the caller does not keep a stale
    entry for a file it is about to reprocess.

    Errors are logged and swallowed: this is a best-effort cleanup step and
    must not abort the surrounding pipeline run.

    :param entry: docket entry of the file version being replaced
    """
    try:
        collection = self._chroma_db._collection

        # Let ChromaDB filter on metadata instead of loading the whole
        # collection and scanning it in Python.
        results = collection.get(where={"file_name": entry.name}, include=["metadatas"])
        matching_ids = list((results or {}).get("ids") or [])

        if matching_ids:
            collection.delete(ids=matching_ids)
            logger.info(f"Removed {len(matching_ids)} chunks from knowledge base for: {entry.name}")
        else:
            logger.info(f"No chunks found in knowledge base for: {entry.name}")

        # Always drop the docket entry, whether or not chunks existed
        self._docket.remove(entry)
        logger.info(f"Removed file from docket: {entry.name}")

    except Exception as e:
        logger.error(f"Error removing file from knowledge base: {entry.name}")
        logger.exception(e)
360
+
361
async def _preprocess_folder(self, folder_path: str | Path, is_datasheet: bool):
    """Walk ``folder_path`` and pre-process every file that is new or was changed.

    Each file found counts towards ``total_files``. The docket decides whether
    it is skipped, processed, or processed after its old data is removed. A
    failure on one file is logged and recorded in ``errors`` without stopping
    the walk.

    :param folder_path: directory to walk recursively
    :param is_datasheet: whether the files in this folder are datasheets
    """
    skipped_dirs = (MF_PROJECT_CONFIG_DIR_NAME,)
    for current_dir, sub_dirs, names in os.walk(folder_path):
        # Prune in place so os.walk does not descend into skipped directories
        sub_dirs[:] = [d for d in sub_dirs if d not in skipped_dirs]

        for name in names:
            self.total_files += 1
            file_path = os.path.join(current_dir, name)

            try:
                # Only the MD5 from the metadata is needed for the docket check
                logger.debug(f"Checking file: {file_path}")
                metadata, _content = get_file_metadata(file_path, is_datasheet)

                process, replace_old = self._check_file_in_docket(file_path, metadata.md5)
                if not process:
                    self.skipped_files += 1
                    logger.info(f"Skipping file: {file_path}")
                    continue

                # Drop the previous version's data before ingesting the new one
                if replace_old and (stale := self._docket.get_by_path(file_path)):
                    logger.info(f"Removing old version from knowledge base: {file_path}")
                    self._remove_file_from_kb(stale)

                logger.info(f"Pre-processing file: {file_path}")
                file = await self._preprocess_file(file_path, is_datasheet)
                self.successfully_processed += 1
                self._add_to_file_docket(file)

                # Remember the subtype so the analysis phase knows what changed
                if file.sub_type:
                    self._processed_file_types.add(FileSubtypes(file.sub_type).name)

            except Exception as e:
                self.failed_files += 1
                self.errors.append({"file_path": file_path, "error": str(e)})
                logger.exception(e)
                logger.error(f"Error processing file: {file_path}")
405
+
406
async def run(self):
    """Execute the whole pipeline: pre-process files, analyze, save the docket, report.

    Returns None when the context folder is empty and after a successful run.
    Any exception is logged and converted with ``format_error_for_llm``, whose
    result is returned instead of raising.
    """
    try:
        logger.info(f"Starting pipeline for directory: {self.folder_path}")

        # Nothing to do (and no pipeline run record) when there are no files
        if not self._check_context_folder_has_files():
            self._display_empty_context_message()
            return

        self.pipeline_run = create_pipeline_run(self._db, self._project)
        # The shared context was built before the run existed; attach it now
        self._context.run = self.pipeline_run

        await self._preprocess_folder(self.folder_path, False)
        logger.info(f"Finished pre-processing folder: {self.folder_path}")

        # Datasheets are pre-processed second, from their own directory
        datasheets_dir = app_dirs.data_sheets_dir
        logger.info(f"Starting pre-processing of datasheets: {datasheets_dir}")
        await self._preprocess_folder(datasheets_dir, True)
        logger.info(f"Finished pre-processing of datasheets: {datasheets_dir}")

        logger.info(f"Preprocessing finished: {self.folder_path}")
        logger.info(f"Running analysis step: {self.folder_path}")
        await self._run_analysis()
        self._db.commit()
        self._save_file_docket()

        summary = {
            "total_files": self.total_files,
            "successfully_processed": self.successfully_processed,
            "skipped_files": self.skipped_files,
            "failed_files": self.failed_files,
            "errors": self.errors,
        }
        report = json.dumps(summary)
        logger.info(f"Finished pipeline")
        logger.info(f"Report: {report}")

        # Console summary for the user
        rule = "=" * 70
        for line in (
            f"\n{rule}",
            "Pipeline Execution Summary",
            rule,
            f"Total files found: {self.total_files}",
            f"Successfully processed: {self.successfully_processed}",
            f"Skipped (already processed): {self.skipped_files}",
            f"Failed: {self.failed_files}",
            f"{rule}\n",
        ):
            print(line)
    except Exception as e:
        logger.exception(e)
        logger.error(f"Error in pipeline: {e}")
        return format_error_for_llm(e)
453
+
454
+
455
async def run_pipeline_with_config(project_config: ProjectConfig):
    """Run the pipeline for the project named in ``project_config``.

    :param project_config: configuration whose ``name`` identifies the project
    :return: whatever ``PipelineRunner.run`` returns
    """
    with Session() as session:
        target = get_project_by_name(session, project_config.name)
        runner = PipelineRunner(session, target, project_config)
        return await runner.run()
459
+
460
+
461
async def run_pipeline(project_name: str) -> str:
    """
    The controller agent will call this tool to start the pipeline processing for all the files in a directory.
    :param project_name: The name of the project
    :return: Status of the pipeline run
    """
    with Session() as db:
        project = get_project_by_name(db, project_name)
        project_config = read_project_config_file()
        runner = PipelineRunner(db, project, project_config)
        error = await runner.run()

        # run() only returns a value when the pipeline failed; pass it through
        if error is not None:
            return error

        # On success run() returns None, which would leave the calling agent
        # without any status. Build one from the runner's counters so the
        # declared `-> str` contract holds.
        return json.dumps({
            "status": "finished",
            "total_files": runner.total_files,
            "successfully_processed": runner.successfully_processed,
            "skipped_files": runner.skipped_files,
            "failed_files": runner.failed_files,
            "errors": runner.errors,
        })
File without changes
@@ -0,0 +1,127 @@
1
+ import asyncio
2
+ from pathlib import Path
3
+ from typing import List
4
+
5
+ from google.genai.types import File as GeminiFile
6
+ from pydantic import BaseModel, Field
7
+
8
+ from mfcli.agents.tools.general import format_instructions
9
+ from mfcli.models.file import File
10
+ from mfcli.models.pdf_parts import PDFPart
11
+ from mfcli.pipeline.run_context import PipelineRunContext
12
+ from mfcli.utils.directory_manager import app_dirs
13
+ from mfcli.utils.pdf_splitter import PDFSplitter
14
+
15
+
16
# One Table of Contents entry requested from the model for a user-guide PDF.
# NOTE: documented with comments rather than a docstring on purpose — pydantic
# copies a model's docstring into its generated JSON schema description, and
# this model is part of the response schema (see TOC / response_model=TOC).
class TOCSection(BaseModel):
    # Section title; the prompt asks for it without the section number
    title: str = Field(..., description="Section title")
    # Section number, reported separately from the title
    section_no: int = Field(..., description="Section number")
    # First page of the section; passed to the splitter's extract_range
    start_page: int = Field(..., description="Starting page")
    # Last page of the section; passed to the splitter's extract_range
    end_page: int = Field(..., description="End page")
21
+
22
+
23
# Structured response requested from Gemini (response_model=TOC): the list of
# Table of Contents sections found in the PDF head.
# NOTE: kept as comments, not a docstring — a docstring would be copied into
# the JSON schema pydantic generates for this model.
class TOC(BaseModel):
    # One entry per section returned by the model
    sections: List[TOCSection] = Field(..., description="Table of Contents sections")
25
+
26
+
27
# Instructions passed to Gemini (as `instructions=`) when extracting the Table
# of Contents from the head of a user-guide PDF. The text asks for top-level
# content sections only, each with title, section_no, start_page and end_page.
# NOTE(review): whitespace inside the string is reproduced as it appears in the
# listing this was taken from — confirm against the packaged file.
user_guide_preprocessor_instructions = format_instructions(
"""
You will receive the start of a PDF for hardware engineering user guide.
Your task is to extract all the Table of Contents sections from the PDF.
You will respond with the section title (no numbers in the title).
You will respond separately with the section number (section_no).
You will also respond with the start page (start_page) and (end_page) of this section.
You MUST respond with all relevant top-level sections in the PDF.

Here are examples of relevant top-level sections:

1. Architecture
2. PMCU
3. CPU

Here are examples of sections which are NOT relevant (content) sections (do not include these):

Read This First
About This Manual
Glossary
Related Documentation
Support Resources

Here are examples of sections which are NOT top-level (do not include these):

1.1 Architecture Overview
1.2 Bus Organization
1.3 Platform Memory Map

ONLY include content sections and top-level sections.
"""
)
59
+
60
+
61
class UserGuidePreprocessor:
    """Split a user-guide PDF into per-section parts and upload each part to Gemini.

    The Table of Contents is obtained from Gemini using the already uploaded
    PDF head; every returned section is cut out with the splitter, uploaded,
    and described by a ``PDFPart``.
    """

    def __init__(
            self,
            context: PipelineRunContext,
            file: File,
            pdf_head: GeminiFile,
            content: bytes,
            splitter: PDFSplitter
    ):
        """
        :param context: run context providing the Gemini client and file cache
        :param file: file record the parts belong to
        :param pdf_head: uploaded head of the PDF, used for TOC extraction
        :param content: raw bytes of the PDF (stored on the instance)
        :param splitter: splitter used to cut page ranges out of the PDF
        """
        self._context, self._file = context, file
        self._pdf_head, self._content = pdf_head, content
        self._splitter = splitter

    async def _generate_toc(self) -> TOC:
        """Ask Gemini for the Table of Contents of the PDF head."""
        gemini = self._context.gemini
        toc = await gemini.generate(
            prompt="Generate the Table of Content sections for this PDF",
            instructions=user_guide_preprocessor_instructions,
            response_model=TOC,
            files=[self._pdf_head],
        )
        return toc

    async def _create_pdf_part(self, section: TOCSection, pdf_part_path: Path) -> PDFPart:
        """Upload one extracted section, cache its Gemini file, and build its ``PDFPart``."""
        uploaded = await self._context.gemini.upload(pdf_part_path)
        gemini_name = uploaded.name
        # Cached under the Gemini file name so it can be looked up by gemini_file_id
        self._context.gemini_file_cache[gemini_name] = uploaded
        return PDFPart(
            path=str(pdf_part_path),
            file_id=self._file.id,
            gemini_file_id=gemini_name,
            start_page=section.start_page,
            end_page=section.end_page,
            title=section.title,
            section_no=section.section_no,
        )

    async def preprocess(self) -> List[PDFPart]:
        """Extract every TOC section into its own PDF and upload them concurrently.

        :return: one ``PDFPart`` per TOC section, in TOC order
        """
        toc = await self._generate_toc()
        # Page ranges are extracted one after another here; only the uploads
        # are awaited together below.
        pending = [
            self._create_pdf_part(
                section,
                self._splitter.extract_range(
                    start_page=section.start_page,
                    end_page=section.end_page,
                    output_folder=app_dirs.pdf_parts_dir,
                ),
            )
            for section in toc.sections
        ]
        return await asyncio.gather(*pending)
109
+
110
+
111
async def preprocess_user_guide(
        context: PipelineRunContext,
        file: File,
        pdf_head: GeminiFile,
        content: bytes,
        splitter: PDFSplitter
) -> None:
    """Split a user guide into section parts and persist them.

    Runs ``UserGuidePreprocessor.preprocess`` and adds the resulting
    ``PDFPart`` rows to the context's session, which is then committed.

    :param context: run context (Gemini client, file cache, db session)
    :param file: file record of the user guide
    :param pdf_head: uploaded head of the PDF
    :param content: raw bytes of the PDF
    :param splitter: splitter already created for this PDF
    """
    parts = await UserGuidePreprocessor(
        context=context,
        file=file,
        pdf_head=pdf_head,
        content=content,
        splitter=splitter,
    ).preprocess()

    session = context.db
    session.add_all(parts)
    session.commit()
@@ -0,0 +1,32 @@
1
+ from typing import Dict
2
+
3
+ from google.genai.types import File as GeminiFile
4
+ from mfcli.utils.query_service import QueryService
5
+
6
+ from mfcli.client.gemini import Gemini
7
+ from mfcli.models.file_docket import FileDocket
8
+ from mfcli.models.pipeline_run import PipelineRun
9
+ from mfcli.models.project_metadata import ProjectConfig
10
+ from mfcli.utils.datasheet_vectorizer import DatasheetVectorizer
11
+ from mfcli.utils.orm import Session
12
+
13
+
14
class PipelineRunContext:
    """Bundle of shared objects handed to the pipeline stages of one run.

    Besides storing the given collaborators, the context creates a
    ``QueryService`` bound to the same database session.
    """

    def __init__(
            self,
            db: Session,
            pipeline_run: PipelineRun,
            gemini: Gemini,
            gemini_file_cache: Dict[str, GeminiFile],
            docket: FileDocket,
            config: ProjectConfig,
            vectorizer: DatasheetVectorizer
    ):
        """
        :param db: database session shared by the run
        :param pipeline_run: pipeline run record, exposed as ``run``
        :param gemini: Gemini client
        :param gemini_file_cache: mapping of cache key to Gemini file
        :param docket: file docket of the project
        :param config: project configuration
        :param vectorizer: vectorizer used for knowledge-base ingestion
        """
        self.db, self.run = db, pipeline_run
        self.gemini, self.gemini_file_cache = gemini, gemini_file_cache
        self.docket, self.config = docket, config
        self.vectorizer = vectorizer
        # Query service works on the same session as the rest of the run
        self.query_service = QueryService(db)