mfcli 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. mfcli/.env.example +72 -0
  2. mfcli/__init__.py +0 -0
  3. mfcli/agents/__init__.py +0 -0
  4. mfcli/agents/controller/__init__.py +0 -0
  5. mfcli/agents/controller/agent.py +19 -0
  6. mfcli/agents/controller/config.yaml +27 -0
  7. mfcli/agents/controller/tools.py +42 -0
  8. mfcli/agents/tools/general.py +118 -0
  9. mfcli/alembic/env.py +61 -0
  10. mfcli/alembic/script.py.mako +28 -0
  11. mfcli/alembic/versions/6ccc0c7c397c_added_fields_to_pdf_parts_model.py +39 -0
  12. mfcli/alembic/versions/769019ef4870_added_gemini_file_path_to_pdf_part_model.py +33 -0
  13. mfcli/alembic/versions/7a2e3a779fdc_added_functional_block_and_component_.py +54 -0
  14. mfcli/alembic/versions/7d5adb2a47a7_added_pdf_parts_model.py +41 -0
  15. mfcli/alembic/versions/7fcb7d6a5836_init.py +167 -0
  16. mfcli/alembic/versions/e0f2b5765c72_added_cascade_delete_for_models_that_.py +32 -0
  17. mfcli/alembic.ini +147 -0
  18. mfcli/cli/__init__.py +0 -0
  19. mfcli/cli/dependencies.py +59 -0
  20. mfcli/cli/main.py +192 -0
  21. mfcli/client/__init__.py +0 -0
  22. mfcli/client/chroma_db.py +184 -0
  23. mfcli/client/docling.py +44 -0
  24. mfcli/client/gemini.py +252 -0
  25. mfcli/client/llama_parse.py +38 -0
  26. mfcli/client/vector_db.py +93 -0
  27. mfcli/constants/__init__.py +0 -0
  28. mfcli/constants/base_enum.py +18 -0
  29. mfcli/constants/directory_names.py +1 -0
  30. mfcli/constants/file_types.py +189 -0
  31. mfcli/constants/gemini.py +1 -0
  32. mfcli/constants/openai.py +6 -0
  33. mfcli/constants/pipeline_run_status.py +3 -0
  34. mfcli/crud/__init__.py +0 -0
  35. mfcli/crud/file.py +42 -0
  36. mfcli/crud/functional_blocks.py +26 -0
  37. mfcli/crud/netlist.py +18 -0
  38. mfcli/crud/pipeline_run.py +17 -0
  39. mfcli/crud/project.py +99 -0
  40. mfcli/digikey/__init__.py +0 -0
  41. mfcli/digikey/digikey.py +105 -0
  42. mfcli/main.py +5 -0
  43. mfcli/mcp/__init__.py +0 -0
  44. mfcli/mcp/configs/cline_mcp_settings.json +11 -0
  45. mfcli/mcp/configs/mfcli.mcp.json +7 -0
  46. mfcli/mcp/mcp_instance.py +6 -0
  47. mfcli/mcp/server.py +37 -0
  48. mfcli/mcp/state_manager.py +51 -0
  49. mfcli/mcp/tools/__init__.py +0 -0
  50. mfcli/mcp/tools/query_knowledgebase.py +108 -0
  51. mfcli/models/__init__.py +10 -0
  52. mfcli/models/base.py +10 -0
  53. mfcli/models/bom.py +71 -0
  54. mfcli/models/datasheet.py +10 -0
  55. mfcli/models/debug_setup.py +64 -0
  56. mfcli/models/file.py +43 -0
  57. mfcli/models/file_docket.py +94 -0
  58. mfcli/models/file_metadata.py +19 -0
  59. mfcli/models/functional_blocks.py +94 -0
  60. mfcli/models/llm_response.py +5 -0
  61. mfcli/models/mcu.py +97 -0
  62. mfcli/models/mcu_errata.py +26 -0
  63. mfcli/models/netlist.py +59 -0
  64. mfcli/models/pdf_parts.py +25 -0
  65. mfcli/models/pipeline_run.py +34 -0
  66. mfcli/models/project.py +27 -0
  67. mfcli/models/project_metadata.py +15 -0
  68. mfcli/pipeline/__init__.py +0 -0
  69. mfcli/pipeline/analysis/__init__.py +0 -0
  70. mfcli/pipeline/analysis/bom_netlist_mapper.py +28 -0
  71. mfcli/pipeline/analysis/generators/__init__.py +0 -0
  72. mfcli/pipeline/analysis/generators/bom/__init__.py +0 -0
  73. mfcli/pipeline/analysis/generators/bom/bom.py +74 -0
  74. mfcli/pipeline/analysis/generators/debug_setup/__init__.py +0 -0
  75. mfcli/pipeline/analysis/generators/debug_setup/debug_setup.py +71 -0
  76. mfcli/pipeline/analysis/generators/debug_setup/instructions.py +150 -0
  77. mfcli/pipeline/analysis/generators/functional_blocks/__init__.py +0 -0
  78. mfcli/pipeline/analysis/generators/functional_blocks/functional_blocks.py +93 -0
  79. mfcli/pipeline/analysis/generators/functional_blocks/instructions.py +34 -0
  80. mfcli/pipeline/analysis/generators/functional_blocks/validator.py +94 -0
  81. mfcli/pipeline/analysis/generators/generator.py +258 -0
  82. mfcli/pipeline/analysis/generators/generator_base.py +18 -0
  83. mfcli/pipeline/analysis/generators/mcu/__init__.py +0 -0
  84. mfcli/pipeline/analysis/generators/mcu/instructions.py +156 -0
  85. mfcli/pipeline/analysis/generators/mcu/mcu.py +84 -0
  86. mfcli/pipeline/analysis/generators/mcu_errata/__init__.py +1 -0
  87. mfcli/pipeline/analysis/generators/mcu_errata/instructions.py +77 -0
  88. mfcli/pipeline/analysis/generators/mcu_errata/mcu_errata.py +95 -0
  89. mfcli/pipeline/analysis/generators/summary/__init__.py +0 -0
  90. mfcli/pipeline/analysis/generators/summary/summary.py +47 -0
  91. mfcli/pipeline/classifier.py +93 -0
  92. mfcli/pipeline/data_enricher.py +15 -0
  93. mfcli/pipeline/extractor.py +34 -0
  94. mfcli/pipeline/extractors/__init__.py +0 -0
  95. mfcli/pipeline/extractors/pdf.py +12 -0
  96. mfcli/pipeline/parser.py +120 -0
  97. mfcli/pipeline/parsers/__init__.py +0 -0
  98. mfcli/pipeline/parsers/netlist/__init__.py +0 -0
  99. mfcli/pipeline/parsers/netlist/edif.py +93 -0
  100. mfcli/pipeline/parsers/netlist/kicad_legacy_net.py +326 -0
  101. mfcli/pipeline/parsers/netlist/kicad_spice.py +135 -0
  102. mfcli/pipeline/parsers/netlist/pads.py +185 -0
  103. mfcli/pipeline/parsers/netlist/protel.py +166 -0
  104. mfcli/pipeline/parsers/netlist/protel_detector.py +29 -0
  105. mfcli/pipeline/pipeline.py +419 -0
  106. mfcli/pipeline/preprocessors/__init__.py +0 -0
  107. mfcli/pipeline/preprocessors/user_guide.py +127 -0
  108. mfcli/pipeline/run_context.py +32 -0
  109. mfcli/pipeline/schema_mapper.py +89 -0
  110. mfcli/pipeline/sub_classifier.py +115 -0
  111. mfcli/utils/__init__.py +0 -0
  112. mfcli/utils/config.py +33 -0
  113. mfcli/utils/configurator.py +324 -0
  114. mfcli/utils/data_cleaner.py +82 -0
  115. mfcli/utils/datasheet_vectorizer.py +281 -0
  116. mfcli/utils/directory_manager.py +96 -0
  117. mfcli/utils/file_upload.py +298 -0
  118. mfcli/utils/files.py +16 -0
  119. mfcli/utils/http_requests.py +54 -0
  120. mfcli/utils/kb_lister.py +89 -0
  121. mfcli/utils/kb_remover.py +173 -0
  122. mfcli/utils/logger.py +28 -0
  123. mfcli/utils/mcp_configurator.py +311 -0
  124. mfcli/utils/migrations.py +18 -0
  125. mfcli/utils/orm.py +43 -0
  126. mfcli/utils/pdf_splitter.py +63 -0
  127. mfcli/utils/query_service.py +22 -0
  128. mfcli/utils/system_check.py +306 -0
  129. mfcli/utils/tools.py +31 -0
  130. mfcli/utils/vectorizer.py +28 -0
  131. mfcli-0.2.0.dist-info/METADATA +841 -0
  132. mfcli-0.2.0.dist-info/RECORD +136 -0
  133. mfcli-0.2.0.dist-info/WHEEL +5 -0
  134. mfcli-0.2.0.dist-info/entry_points.txt +3 -0
  135. mfcli-0.2.0.dist-info/licenses/LICENSE +21 -0
  136. mfcli-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,324 @@
1
+ """Interactive configuration wizard for mfcli."""
2
+ import os
3
+ import sys
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ from mfcli.utils.directory_manager import app_dirs
8
+
9
+
10
def get_env_path() -> Path:
    """Return the location of the mfcli ``.env`` file as exposed by ``app_dirs``."""
    env_file = app_dirs.env_file_path
    return env_file
13
+
14
+
15
def read_existing_env() -> dict:
    """Load the ``KEY=VALUE`` pairs currently stored in the .env file.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    ignored. Keys and values are whitespace-stripped; only the first ``=``
    on a line separates key from value. Returns an empty dict when the file
    does not exist.
    """
    dotenv_file = get_env_path()
    parsed = {}

    if not dotenv_file.exists():
        return parsed

    with open(dotenv_file, 'r') as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            # Only uncommented assignments carry a setting.
            if not entry or entry.startswith('#') or '=' not in entry:
                continue
            name, _, raw_value = entry.partition('=')
            parsed[name.strip()] = raw_value.strip()

    return parsed
29
+
30
+
31
def write_env_file(env_vars: dict) -> None:
    """Write environment variables to the .env file.

    When the packaged ``.env.example`` template exists, its comments and
    layout are kept: every uncommented ``KEY=...`` line whose key appears in
    ``env_vars`` is rewritten as ``KEY=<value>``, and keys the template does
    not mention are appended at the end. Without a template the file is just
    one ``KEY=VALUE`` line per entry.

    Matching on the key (rather than on a ``your_<key>_here`` placeholder)
    ensures values are never silently dropped when the template line holds a
    real default or a differently named placeholder.

    :param env_vars: mapping of variable name to the value to persist
    """
    env_path = get_env_path()
    env_path.parent.mkdir(parents=True, exist_ok=True)

    template_path = Path(__file__).parent.parent / '.env.example'
    if template_path.exists():
        with open(template_path, 'r') as f:
            template_lines = f.read().splitlines()
    else:
        template_lines = []

    output_lines = []
    written = set()
    for line in template_lines:
        stripped = line.strip()
        if stripped and not stripped.startswith('#') and '=' in stripped:
            key = stripped.split('=', 1)[0].strip()
            if key in env_vars:
                output_lines.append(f"{key}={env_vars[key]}")
                written.add(key)
                continue
        # Comments, blank lines and settings we have no value for stay as-is.
        output_lines.append(line)

    # Anything the template did not cover still has to reach the file.
    output_lines.extend(
        f"{key}={value}" for key, value in env_vars.items() if key not in written
    )

    with open(env_path, 'w') as f:
        f.write('\n'.join(output_lines) + '\n')
68
+
69
+
70
def prompt_for_value(
    key: str,
    description: str,
    link: Optional[str] = None,
    current_value: Optional[str] = None,
    required: bool = True
) -> Optional[str]:
    """Prompt the user for a configuration value.

    :param key: variable name; used to recognise the ``your_<key>_here`` placeholder
    :param description: human-readable label shown above the prompt
    :param link: optional URL where the value can be obtained
    :param current_value: value already stored, offered as the default
    :param required: when True, an empty answer without a usable current
        value re-prompts instead of returning
    :return: the stripped input; ``current_value`` when the user just presses
        Enter and a real current value exists; ``None`` when the value is
        optional and nothing was entered
    """
    # A stored placeholder from the template does not count as a real value.
    has_current = bool(current_value) and current_value != f"your_{key.lower()}_here"
    separator = '=' * 70

    # Loop instead of recursing so repeated empty answers cannot exhaust the stack.
    while True:
        print(f"\n{separator}")
        print(f" {description}")
        if link:
            print(f" Get your key: {link}")
        if has_current:
            print(f" Current value: {current_value[:20]}..." if len(current_value) > 20 else f" Current value: {current_value}")
            prompt = " Enter new value (press Enter to keep current): "
        else:
            prompt = f" Enter value{' (required)' if required else ' (optional)'}: "

        print(separator)

        value = input(prompt).strip()
        if value:
            return value
        if has_current:
            return current_value
        if not required:
            return None
        print(" ❌ This value is required!")
102
+
103
+
104
def validate_api_key(key_name: str, api_key: str) -> bool:
    """Validate an API key and print the outcome on the current line.

    Google and OpenAI keys are checked with a live "list models" request.
    LlamaParse and DigiKey keys only get a minimum-length check, because no
    live check is performed for them here. Any other ``key_name`` is
    accepted without a check.

    :param key_name: one of "Google API", "OpenAI API", "LlamaParse API", "DigiKey API"
    :param api_key: the key to validate
    :return: True when the key passed (or no check exists), False otherwise
    """
    print(f"\n Validating {key_name}...", end=' ')
    sys.stdout.flush()

    # Services validated by length only: the key must be longer than this.
    min_lengths = {"LlamaParse API": 20, "DigiKey API": 10}

    try:
        if key_name == "Google API":
            import google.generativeai as genai
            genai.configure(api_key=api_key)
            # Test with a simple list models call
            list(genai.list_models())
            print("✅")
            return True

        elif key_name == "OpenAI API":
            from openai import OpenAI
            client = OpenAI(api_key=api_key)
            # Test with a simple models list call
            client.models.list()
            print("✅")
            return True

        elif key_name in min_lengths:
            # No network call and no extra imports: a missing optional
            # package must not make a well-formed key look invalid.
            if len(api_key) > min_lengths[key_name]:
                print("✅ (format check)")
                return True
            print("❌ Invalid format")
            return False

    except Exception as e:
        print(f"❌ ({str(e)[:50]}...)")
        return False

    return True
151
+
152
+
153
def run_configuration_wizard() -> None:
    """Walk the user through all mfcli settings and save them to the .env file.

    Prompts for each API credential (validating those that have a check),
    then for the vector-database settings, and finally writes the merged
    configuration. Ctrl+C at any prompt cancels and exits with status 0
    without writing.
    """
    rule = "=" * 70
    print("\n" + rule)
    print(" MFCLI CONFIGURATION WIZARD")
    print(rule)
    print("\n This wizard will help you configure mfcli with your API keys.")
    print(" You can press Ctrl+C at any time to exit.\n")

    # (env key, prompt description, signup link, validator label or None)
    credential_steps = (
        ("google_api_key", "Google Gemini API Key",
         "https://aistudio.google.com/app/apikey", "Google API"),
        ("openai_api_key", "OpenAI API Key (for embeddings)",
         "https://platform.openai.com/api-keys", "OpenAI API"),
        ("llama_cloud_api_key", "LlamaParse API Key (for PDF parsing)",
         "https://cloud.llamaindex.ai/", "LlamaParse API"),
        ("digikey_client_id", "DigiKey Client ID (for datasheet downloads)",
         "https://developer.digikey.com/", "DigiKey API"),
        # The secret is stored as entered; no validation step runs for it.
        ("digikey_client_secret", "DigiKey Client Secret", None, None),
    )

    # (env key, interactive prompt, default value)
    vector_settings = (
        ("chunk_size", " Chunk size [1000]: ", "1000"),
        ("chunk_overlap", " Chunk overlap [200]: ", "200"),
        ("embedding_model", " Embedding model [text-embedding-3-small]: ", "text-embedding-3-small"),
        ("embedding_dimensions", " Embedding dimensions [1536]: ", "1536"),
    )

    try:
        existing_env = read_existing_env()
        new_env = existing_env.copy()

        for env_key, description, link, validator_label in credential_steps:
            answer = prompt_for_value(
                env_key,
                description,
                link,
                existing_env.get(env_key),
                required=True
            )
            if not answer:
                continue
            new_env[env_key] = answer
            if validator_label is not None:
                # The result is informational only; the value is kept either way.
                validate_api_key(validator_label, answer)

        print("\n" + "="*70)
        print(" Vector Database Configuration")
        print("="*70)
        print(" Using default values:")
        print(" - Chunk size: 1000")
        print(" - Chunk overlap: 200")
        print(" - Embedding model: text-embedding-3-small")
        print(" - Embedding dimensions: 1536")

        customise = input("\n Change these defaults? (y/N): ").strip().lower() == 'y'

        for env_key, question, fallback in vector_settings:
            if customise:
                typed = input(question).strip()
                new_env[env_key] = typed if typed else fallback
            else:
                # Keep whatever was configured before, else the default.
                new_env[env_key] = existing_env.get(env_key, fallback)

        write_env_file(new_env)

        env_path = get_env_path()
        print("\n" + "="*70)
        print(" ✅ Configuration saved successfully!")
        print(f" Location: {env_path}")
        print("="*70)
        print("\n Next steps:")
        print(" 1. Run 'mfcli init' in your hardware project directory")
        print(" 2. Run 'mfcli run' to process your documents")
        print(" 3. (Optional) Run 'mfcli setup-mcp' to configure MCP server")
        print("\n")

    except KeyboardInterrupt:
        print("\n\n ⚠️ Configuration cancelled.")
        sys.exit(0)
272
+
273
+
274
def check_configuration() -> None:
    """Print a report on the stored configuration.

    Reports each required credential as configured (showing at most its
    first eight characters) or missing, then lists the vector-database
    settings. A value counts as missing when it is empty or still starts
    with the ``your_`` placeholder prefix. Nothing is validated against the
    remote services here.
    """
    rule = "=" * 70
    print("\n" + rule)
    print(" CONFIGURATION CHECK")
    print(rule)

    env_path = get_env_path()

    if not env_path.exists():
        print(f"\n ❌ Configuration file not found: {env_path}")
        print("\n Run 'mfcli configure' to create your configuration.")
        return

    print(f"\n Configuration file: {env_path}")

    env_vars = read_existing_env()

    # env key -> display name, in report order
    required_labels = {
        "google_api_key": "Google Gemini API",
        "openai_api_key": "OpenAI API",
        "llama_cloud_api_key": "LlamaParse API",
        "digikey_client_id": "DigiKey Client ID",
        "digikey_client_secret": "DigiKey Client Secret",
    }

    print("\n Checking configuration:")
    missing_count = 0

    for key, name in required_labels.items():
        value = env_vars.get(key)
        if value and not value.startswith("your_"):
            if len(value) > 8:
                masked_value = value[:8] + "..."
            else:
                masked_value = value
            print(f" ✅ {name}: {masked_value}")
        else:
            print(f" ❌ {name}: Not configured")
            missing_count += 1

    print("\n Vector database configuration:")
    print(f" - Chunk size: {env_vars.get('chunk_size', 'Not set')}")
    print(f" - Chunk overlap: {env_vars.get('chunk_overlap', 'Not set')}")
    print(f" - Embedding model: {env_vars.get('embedding_model', 'Not set')}")
    print(f" - Embedding dimensions: {env_vars.get('embedding_dimensions', 'Not set')}")

    if missing_count == 0:
        print("\n ✅ All required configuration values are set!")
        print("\n To validate API keys, run: mfcli doctor")
    else:
        print("\n ⚠️ Some configuration values are missing.")
        print(" Run 'mfcli configure' to complete your configuration.")

    print("="*70 + "\n")
@@ -0,0 +1,82 @@
1
+ import os.path
2
+ import shutil
3
+ import sys
4
+ from pathlib import Path
5
+ from textwrap import dedent
6
+ from typing import List
7
+
8
+ from mfcli.client.chroma_db import ChromaClient
9
+ from mfcli.models.project import Project
10
+ from mfcli.utils.config import get_config
11
+ from mfcli.utils.directory_manager import app_dirs, init_directory_structure
12
+ from mfcli.utils.logger import get_logger, setup_logging
13
+ from mfcli.utils.orm import Session
14
+ from mfcli.utils.query_service import QueryService
15
+
16
+ logger = get_logger(__name__)
17
+
18
+ warning_message = dedent(
19
+ """
20
+
21
+ WARNING: This will permanently delete all mfcli data, including datasheets, cheat sheets, and project data.
22
+ Should we proceed? (Y/n):
23
+
24
+ """
25
+ )
26
+
27
+
28
class DataCleaner:
    """Deletes every project's on-disk data, vector collection and DB row."""

    def __init__(self, db: Session):
        self._db = db
        self._query_service = QueryService(db)
        self._config = get_config()

    @staticmethod
    def _remove_dir(dir_path: Path):
        """Recursively delete ``dir_path``; log instead of raising on failure."""
        if os.path.isdir(dir_path):
            try:
                shutil.rmtree(dir_path)
            except Exception as e:
                logger.exception(e)
                logger.error(f"Error deleting directory: {dir_path}")
            return
        logger.warning(f"Directory does not exist: {dir_path}")

    def clean(self):
        """Remove all data belonging to every project known to the database."""
        logger.info("Cleaning mfcli data")
        all_projects: List[Project] = self._query_service.query_all(Project)
        for project in all_projects:
            # Called before reading app_dirs below; presumably points app_dirs
            # at this project's repo — TODO confirm in directory_manager.
            init_directory_structure(project.repo_dir)
            config_dir = Path(project.repo_dir) / ".multifactor"
            ChromaClient(project.index_id).delete_collection()
            targets = (
                config_dir,
                app_dirs.agent_instructions_dir,
                app_dirs.data_sheets_dir,
                app_dirs.fw_tasks_dir,
                app_dirs.generated_files_dir,
                app_dirs.reqs_dir,
                app_dirs.cheat_sheets_dir,
                app_dirs.pdf_parts_dir,
            )
            for target in targets:
                logger.debug(f"Removing directory: {target}")
                self._remove_dir(target)
            self._db.delete(project)
        # NOTE(review): the source's indentation was lost; a single commit
        # after the loop is assumed here — confirm against the original file.
        self._db.commit()
        logger.info("All mfcli data has been cleaned")
68
+
69
+
70
def run_data_cleaner():
    """Open a database session and run ``DataCleaner.clean`` inside it."""
    with Session() as session:
        cleaner = DataCleaner(session)
        cleaner.clean()
73
+
74
+
75
def clean_app_data(user_accepted: bool = False):
    """Delete all mfcli data, asking for confirmation first unless pre-approved.

    :param user_accepted: when True the confirmation prompt is skipped

    The prompt advertises ``(Y/n)``, so ``y``/``yes`` in any case confirms.
    An empty answer does NOT confirm: the operation is destructive, so it
    requires an explicit yes. Any other answer exits the process without
    deleting anything.
    """
    setup_logging()
    if not user_accepted:
        answer = input(warning_message).strip().lower()
        if answer not in ('y', 'yes'):
            logger.debug("User cancelled")
            # Tell the user too; the debug log alone is easy to miss.
            print("Cancelled. No data was deleted.")
            sys.exit()
    run_data_cleaner()
@@ -0,0 +1,281 @@
1
+ import os
2
+ import re
3
+ from urllib.parse import urlparse, unquote
4
+
5
+ from playwright.async_api import async_playwright, Browser
6
+ from requests import RequestException
7
+ from sqlmodel import select
8
+
9
+ from mfcli.client.chroma_db import ChromaClient
10
+ from mfcli.client.docling import DoclingChunker
11
+ from mfcli.client.vector_db import DocumentVectorizer
12
+ from mfcli.constants.file_types import PDFMimeTypes
13
+ from mfcli.digikey.digikey import DigiKey
14
+ from mfcli.models.bom import BOM
15
+ from mfcli.models.datasheet import Datasheet
16
+ from mfcli.pipeline.extractor import TextExtractor
17
+ from mfcli.utils.directory_manager import app_dirs
18
+ from mfcli.utils.http_requests import http_request
19
+ from mfcli.utils.logger import get_logger
20
+ from mfcli.utils.orm import Session
21
+ from mfcli.utils.tools import get_mime_type_from_bytes
22
+
23
+ logger = get_logger(__name__)
24
+
25
+
26
class DatasheetVectorizer:
    """Downloads datasheet PDFs and vectorizes files or text into the vector DB."""

    def __init__(self, chroma_db: ChromaClient):
        self._extractor = TextExtractor()
        self._vectorizer = DocumentVectorizer(chroma_db)
        self._docling = DoclingChunker()

    def _vectorize_text(self, text: str, file_name: str, purpose: str, additional_metadata: dict = None):
        """
        Shared method to vectorize text with metadata
        :param text: Extracted text content
        :param file_name: Name of the file
        :param purpose: Purpose of the vectorization (e.g., 'datasheet', 'bom', 'errata')
        :param additional_metadata: Optional additional metadata to include
        """
        metadata = {"file_name": file_name, "purpose": purpose}
        if additional_metadata:
            metadata.update(additional_metadata)
        self._vectorizer.vectorize(text, metadata)
        logger.debug(f"File vectorized: {file_name} (purpose: {purpose})")

    @staticmethod
    async def _fetch_with_playwright(browser: Browser, url: str):
        """
        Fetch a URL through a fresh Playwright browser context
        :param browser: running Playwright browser
        :param url: URL to fetch
        :return: response body bytes
        """
        context = await browser.new_context()
        try:
            response = await context.request.get(url)
            return await response.body()
        finally:
            # One context is created per fetch; close it so contexts do not
            # pile up for the lifetime of the browser.
            await context.close()

    @staticmethod
    def _parse_ti_url(url: str) -> str:
        """
        Texas Instruments URLs may have a gotoUrl param which is the real URL of the PDF
        :param url: TI URL
        :return: decoded URL from the gotoUrl param, or the input URL when it is absent or empty
        """
        query = urlparse(url).query
        if not query:
            return url
        for param in query.split('&'):
            # partition tolerates flag-style params that carry no '='.
            name, _, value = param.partition('=')
            if name == 'gotoUrl' and value:
                return unquote(value)
        return url

    @staticmethod
    def _save_datasheet(name: str, content: bytes):
        """
        Write datasheet bytes into the datasheets directory
        :param name: file name to save under
        :param content: file content
        """
        file_path = app_dirs.data_sheets_dir / name
        with open(file_path, "wb") as f:
            f.write(content)

    async def _download(self, browser: Browser, url: str, purpose: str = "datasheet"):
        """
        Download one datasheet and save it if the content is a PDF.
        Tries a plain HTTP request first and falls back to Playwright when the
        request fails or does not return a PDF. Failures are logged at debug level.
        :param browser: running Playwright browser used for the fallback
        :param url: datasheet URL
        :param purpose: currently unused by this method
        """
        logger.debug(f"Fetching datasheet: {url}")
        try:
            # Dots escaped so only the literal host www.ti.com matches.
            ti_url_regex = r"^https?://www\.ti\.com/.+$"
            if re.match(ti_url_regex, url, re.I):
                logger.debug(f"URL is a TI URL: {url}")
                url = self._parse_ti_url(url)
                logger.debug(f"Parsed TI URL: {url}")
            url_path = urlparse(url).path
        except ValueError as e:
            logger.debug(f"Unable to parse datasheet URL: {url}")
            logger.debug(e)
            return
        file_name = os.path.basename(url_path)
        # Case-insensitive so "X.PDF" is not saved as "X.PDF.pdf".
        if not file_name.lower().endswith(".pdf"):
            file_name = f"{file_name}.pdf"
        try:
            content = http_request(method='GET', url=url).content
            mime_type = get_mime_type_from_bytes(content, file_name)
            if mime_type not in PDFMimeTypes:
                logger.debug(f"Retrieved PDF is not PDF MIME type: {url}")
                logger.debug(f"Retrying with playwright: {url}")
                content = await self._fetch_with_playwright(browser, url)
        except RequestException as e:
            logger.debug(e)
            logger.debug(f"HTTP error fetching PDF: {url}")
            logger.debug(f"Retrying with playwright: {url}")
            content = await self._fetch_with_playwright(browser, url)
        except Exception as e:
            logger.debug(f"Unhandled error fetching datasheet URL: {url}")
            logger.debug(e)
            return
        # Re-check: the Playwright fallback may also have returned a non-PDF page.
        mime_type = get_mime_type_from_bytes(content, file_name)
        if mime_type not in PDFMimeTypes:
            logger.debug(f"Could not fetch PDF even with playwright: {url}")
            return
        try:
            self._save_datasheet(file_name, content)
        except Exception as e:
            logger.debug(e)
            logger.debug(f"Error saving datasheet: {file_name}")

    async def download(self, urls: list[str], purpose: str = "datasheet"):
        """
        Download every URL in turn using one shared headless Chromium instance.
        An error on one URL is logged and does not stop the rest.
        :param urls: datasheet URLs
        :param purpose: passed through to each download
        """
        if not urls:
            logger.debug(f"No datasheets to vectorize, exiting")
            return
        logger.debug(f"Vectorizing {len(urls)} documents (purpose: {purpose})")
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                for url in urls:
                    try:
                        await self._download(browser, url, purpose)
                    except Exception as e:
                        logger.debug(e)
                        logger.debug(f"Error processing document: {url}")
            finally:
                await browser.close()

    def vectorize_local_file(self, file_path: str, purpose: str, additional_metadata: dict = None):
        """
        Vectorize a local file (e.g., generated by agents)
        :param file_path: Path to the local file
        :param purpose: Purpose of the vectorization (e.g., 'bom', 'errata', 'functional_blocks')
        :param additional_metadata: Optional additional metadata to include
        """
        try:
            logger.debug(f"Vectorizing local file: {file_path} (purpose: {purpose})")
            file_name = os.path.basename(file_path)

            # A missing file is logged and skipped, not raised.
            if not os.path.exists(file_path):
                logger.error(f"File does not exist: {file_path}")
                return

            with open(file_path, 'rb') as f:
                content = f.read()

            mime_type = get_mime_type_from_bytes(content, file_name)

            if mime_type in PDFMimeTypes:
                text = self._extractor.extract_pdf_bytes(content)
            else:
                # For non-PDF files, use the general extractor
                text = self._extractor.extract_text_from_file_bytes(file_name, content)

            logger.debug(f"Text extracted from local file: {file_path}")
            self._vectorize_text(text, file_name, purpose, additional_metadata)

        except Exception as e:
            logger.error(f"Error vectorizing local file: {file_path}")
            logger.exception(e)
            raise

    def vectorize_local_files(self, file_paths: list[str], purpose: str, additional_metadata: dict = None):
        """
        Vectorize multiple local files; a failure on one file is logged and the rest continue
        :param file_paths: List of paths to local files
        :param purpose: Purpose of the vectorization (e.g., 'bom', 'errata', 'functional_blocks')
        :param additional_metadata: Optional additional metadata to include
        """
        if not file_paths:
            logger.debug(f"No files to vectorize, exiting")
            return

        logger.debug(f"Vectorizing {len(file_paths)} local files (purpose: {purpose})")
        for file_path in file_paths:
            try:
                self.vectorize_local_file(file_path, purpose, additional_metadata)
            except Exception as e:
                logger.exception(e)
                logger.error(f"Error processing local file: {file_path}")
        logger.debug(f"Finished vectorizing {len(file_paths)} local files")

    def vectorize_text_content(self, text: str, file_name: str, purpose: str, additional_metadata: dict = None):
        """
        Vectorize text content directly (e.g., from agent output)
        :param text: Text content to vectorize
        :param file_name: Name to associate with this content
        :param purpose: Purpose of the vectorization (e.g., 'bom', 'errata', 'functional_blocks')
        :param additional_metadata: Optional additional metadata to include
        """
        try:
            logger.debug(f"Vectorizing text content: {file_name} (purpose: {purpose})")
            self._vectorize_text(text, file_name, purpose, additional_metadata)
        except Exception as e:
            logger.error(f"Error vectorizing text content: {file_name}")
            logger.exception(e)
            raise

    def vectorize_file_buf(
            self,
            file_bytes: bytes,
            file_name: str,
            purpose: str,
            additional_metadata: dict = None
    ) -> None:
        """
        Vectorize a file from a buffer. This vectorizer uses DoclingChunker.
        :param file_bytes: file bytes
        :param file_name: file name
        :param purpose: file purpose
        :param additional_metadata: dict of metadata
        :return: None
        """
        chunks = self._docling.chunk(file_name, file_bytes)
        metadata = {"file_name": file_name, "purpose": purpose}
        if additional_metadata:
            metadata.update(additional_metadata)
        self._vectorizer.vectorize_chunks(chunks, metadata)
229
+
230
+
231
async def get_datasheets_for_bom_entries(db: Session, chroma_db: ChromaClient, entries: list[BOM]):
    """
    Attach a datasheet URL to each eligible BOM entry and download new datasheets.
    URLs already stored in the database are reused; missing ones are looked up
    through DigiKey, added to the session as new Datasheet rows (not committed
    here) and downloaded. Each part number is looked up and recorded at most
    once even when several entries share it.
    :param db: database session
    :param chroma_db: vector DB client handed to the DatasheetVectorizer
    :param entries: BOM entries; ``entry.datasheet`` is set in place
    """
    logger.info(f"Fetching datasheets for {len(entries)} BOM entries")
    part_numbers = {entry.value for entry in entries}
    logger.debug("Fetching existing datasheets for part numbers")

    # Fetch existing datasheets
    stmt = select(Datasheet).where(Datasheet.part_number.in_(part_numbers))
    datasheets: list[Datasheet] = db.execute(stmt).scalars().all()
    datasheet_map = {d.part_number: d.datasheet for d in datasheets}

    logger.debug(f"Datasheet map: {datasheet_map}")
    client = DigiKey()
    new_datasheets: list[Datasheet] = []
    datasheet_urls: list[str] = []
    # Reference-designator prefixes for which no datasheet is fetched.
    skipped_prefixes = ('R', 'C', 'L', 'J', 'T', 'D')
    for entry in entries:
        try:
            logger.debug(f"Processing BOM entry: {entry.value}")

            ref = entry.reference
            if ref.startswith(skipped_prefixes):
                logger.debug(f"Skipping BOM entry {entry.value} with reference: {ref}")
                continue

            existing_datasheet = datasheet_map.get(entry.value)
            if not existing_datasheet:
                logger.debug(f"Datasheet does not exist for {entry.value}")
            entry.datasheet = existing_datasheet or client.datasheet(entry.value)

            # If datasheet is new create it in DB
            if not existing_datasheet and entry.datasheet:
                logger.debug(f"Adding new datasheet for {entry.value}: {entry.datasheet}")
                new_datasheets.append(Datasheet(part_number=entry.value, datasheet=entry.datasheet))
                datasheet_urls.append(entry.datasheet)
                # Remember it so later entries with the same part number reuse
                # this URL instead of a second lookup, row and download.
                datasheet_map[entry.value] = entry.datasheet
        except Exception as e:
            logger.error(f"Error adding datasheet for BOM entry: {entry.value}")
            logger.exception(e)
    if new_datasheets:
        db.add_all(new_datasheets)
    if datasheet_urls:
        try:
            await DatasheetVectorizer(chroma_db).download(datasheet_urls)
        except Exception:
            logger.error("Error vectorizing datasheets for BOM")
            raise
    logger.debug("Finished adding datasheets")