mfcli 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mfcli/.env.example +72 -0
- mfcli/__init__.py +0 -0
- mfcli/agents/__init__.py +0 -0
- mfcli/agents/controller/__init__.py +0 -0
- mfcli/agents/controller/agent.py +19 -0
- mfcli/agents/controller/config.yaml +27 -0
- mfcli/agents/controller/tools.py +42 -0
- mfcli/agents/tools/general.py +118 -0
- mfcli/alembic/env.py +61 -0
- mfcli/alembic/script.py.mako +28 -0
- mfcli/alembic/versions/6ccc0c7c397c_added_fields_to_pdf_parts_model.py +39 -0
- mfcli/alembic/versions/769019ef4870_added_gemini_file_path_to_pdf_part_model.py +33 -0
- mfcli/alembic/versions/7a2e3a779fdc_added_functional_block_and_component_.py +54 -0
- mfcli/alembic/versions/7d5adb2a47a7_added_pdf_parts_model.py +41 -0
- mfcli/alembic/versions/7fcb7d6a5836_init.py +167 -0
- mfcli/alembic/versions/e0f2b5765c72_added_cascade_delete_for_models_that_.py +32 -0
- mfcli/alembic.ini +147 -0
- mfcli/cli/__init__.py +0 -0
- mfcli/cli/dependencies.py +59 -0
- mfcli/cli/main.py +192 -0
- mfcli/client/__init__.py +0 -0
- mfcli/client/chroma_db.py +184 -0
- mfcli/client/docling.py +44 -0
- mfcli/client/gemini.py +252 -0
- mfcli/client/llama_parse.py +38 -0
- mfcli/client/vector_db.py +93 -0
- mfcli/constants/__init__.py +0 -0
- mfcli/constants/base_enum.py +18 -0
- mfcli/constants/directory_names.py +1 -0
- mfcli/constants/file_types.py +189 -0
- mfcli/constants/gemini.py +1 -0
- mfcli/constants/openai.py +6 -0
- mfcli/constants/pipeline_run_status.py +3 -0
- mfcli/crud/__init__.py +0 -0
- mfcli/crud/file.py +42 -0
- mfcli/crud/functional_blocks.py +26 -0
- mfcli/crud/netlist.py +18 -0
- mfcli/crud/pipeline_run.py +17 -0
- mfcli/crud/project.py +99 -0
- mfcli/digikey/__init__.py +0 -0
- mfcli/digikey/digikey.py +105 -0
- mfcli/main.py +5 -0
- mfcli/mcp/__init__.py +0 -0
- mfcli/mcp/configs/cline_mcp_settings.json +11 -0
- mfcli/mcp/configs/mfcli.mcp.json +7 -0
- mfcli/mcp/mcp_instance.py +6 -0
- mfcli/mcp/server.py +37 -0
- mfcli/mcp/state_manager.py +51 -0
- mfcli/mcp/tools/__init__.py +0 -0
- mfcli/mcp/tools/query_knowledgebase.py +108 -0
- mfcli/models/__init__.py +10 -0
- mfcli/models/base.py +10 -0
- mfcli/models/bom.py +71 -0
- mfcli/models/datasheet.py +10 -0
- mfcli/models/debug_setup.py +64 -0
- mfcli/models/file.py +43 -0
- mfcli/models/file_docket.py +94 -0
- mfcli/models/file_metadata.py +19 -0
- mfcli/models/functional_blocks.py +94 -0
- mfcli/models/llm_response.py +5 -0
- mfcli/models/mcu.py +97 -0
- mfcli/models/mcu_errata.py +26 -0
- mfcli/models/netlist.py +59 -0
- mfcli/models/pdf_parts.py +25 -0
- mfcli/models/pipeline_run.py +34 -0
- mfcli/models/project.py +27 -0
- mfcli/models/project_metadata.py +15 -0
- mfcli/pipeline/__init__.py +0 -0
- mfcli/pipeline/analysis/__init__.py +0 -0
- mfcli/pipeline/analysis/bom_netlist_mapper.py +28 -0
- mfcli/pipeline/analysis/generators/__init__.py +0 -0
- mfcli/pipeline/analysis/generators/bom/__init__.py +0 -0
- mfcli/pipeline/analysis/generators/bom/bom.py +74 -0
- mfcli/pipeline/analysis/generators/debug_setup/__init__.py +0 -0
- mfcli/pipeline/analysis/generators/debug_setup/debug_setup.py +71 -0
- mfcli/pipeline/analysis/generators/debug_setup/instructions.py +150 -0
- mfcli/pipeline/analysis/generators/functional_blocks/__init__.py +0 -0
- mfcli/pipeline/analysis/generators/functional_blocks/functional_blocks.py +93 -0
- mfcli/pipeline/analysis/generators/functional_blocks/instructions.py +34 -0
- mfcli/pipeline/analysis/generators/functional_blocks/validator.py +94 -0
- mfcli/pipeline/analysis/generators/generator.py +258 -0
- mfcli/pipeline/analysis/generators/generator_base.py +18 -0
- mfcli/pipeline/analysis/generators/mcu/__init__.py +0 -0
- mfcli/pipeline/analysis/generators/mcu/instructions.py +156 -0
- mfcli/pipeline/analysis/generators/mcu/mcu.py +84 -0
- mfcli/pipeline/analysis/generators/mcu_errata/__init__.py +1 -0
- mfcli/pipeline/analysis/generators/mcu_errata/instructions.py +77 -0
- mfcli/pipeline/analysis/generators/mcu_errata/mcu_errata.py +95 -0
- mfcli/pipeline/analysis/generators/summary/__init__.py +0 -0
- mfcli/pipeline/analysis/generators/summary/summary.py +47 -0
- mfcli/pipeline/classifier.py +93 -0
- mfcli/pipeline/data_enricher.py +15 -0
- mfcli/pipeline/extractor.py +34 -0
- mfcli/pipeline/extractors/__init__.py +0 -0
- mfcli/pipeline/extractors/pdf.py +12 -0
- mfcli/pipeline/parser.py +120 -0
- mfcli/pipeline/parsers/__init__.py +0 -0
- mfcli/pipeline/parsers/netlist/__init__.py +0 -0
- mfcli/pipeline/parsers/netlist/edif.py +93 -0
- mfcli/pipeline/parsers/netlist/kicad_legacy_net.py +326 -0
- mfcli/pipeline/parsers/netlist/kicad_spice.py +135 -0
- mfcli/pipeline/parsers/netlist/pads.py +185 -0
- mfcli/pipeline/parsers/netlist/protel.py +166 -0
- mfcli/pipeline/parsers/netlist/protel_detector.py +29 -0
- mfcli/pipeline/pipeline.py +419 -0
- mfcli/pipeline/preprocessors/__init__.py +0 -0
- mfcli/pipeline/preprocessors/user_guide.py +127 -0
- mfcli/pipeline/run_context.py +32 -0
- mfcli/pipeline/schema_mapper.py +89 -0
- mfcli/pipeline/sub_classifier.py +115 -0
- mfcli/utils/__init__.py +0 -0
- mfcli/utils/config.py +33 -0
- mfcli/utils/configurator.py +324 -0
- mfcli/utils/data_cleaner.py +82 -0
- mfcli/utils/datasheet_vectorizer.py +281 -0
- mfcli/utils/directory_manager.py +96 -0
- mfcli/utils/file_upload.py +298 -0
- mfcli/utils/files.py +16 -0
- mfcli/utils/http_requests.py +54 -0
- mfcli/utils/kb_lister.py +89 -0
- mfcli/utils/kb_remover.py +173 -0
- mfcli/utils/logger.py +28 -0
- mfcli/utils/mcp_configurator.py +311 -0
- mfcli/utils/migrations.py +18 -0
- mfcli/utils/orm.py +43 -0
- mfcli/utils/pdf_splitter.py +63 -0
- mfcli/utils/query_service.py +22 -0
- mfcli/utils/system_check.py +306 -0
- mfcli/utils/tools.py +31 -0
- mfcli/utils/vectorizer.py +28 -0
- mfcli-0.2.0.dist-info/METADATA +841 -0
- mfcli-0.2.0.dist-info/RECORD +136 -0
- mfcli-0.2.0.dist-info/WHEEL +5 -0
- mfcli-0.2.0.dist-info/entry_points.txt +3 -0
- mfcli-0.2.0.dist-info/licenses/LICENSE +21 -0
- mfcli-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
"""Interactive configuration wizard for mfcli."""
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from mfcli.utils.directory_manager import app_dirs
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def get_env_path() -> Path:
    """Return the location of the .env file as exposed by ``app_dirs``."""
    env_file = app_dirs.env_file_path
    return env_file
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def read_existing_env() -> dict:
    """Parse the .env file into a ``{key: value}`` dict.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    ignored. Keys and values are whitespace-stripped; only the first ``=``
    on a line separates key from value. Returns an empty dict when the
    file does not exist.
    """
    env_file = get_env_path()
    if not env_file.exists():
        return {}

    parsed = {}
    with open(env_file, 'r') as handle:
        for raw in handle:
            entry = raw.strip()
            if not entry or entry.startswith('#'):
                continue
            name, separator, setting = entry.partition('=')
            if not separator:
                continue
            parsed[name.strip()] = setting.strip()
    return parsed
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def write_env_file(env_vars: dict) -> None:
    """Write environment variables to the .env file.

    The packaged ``.env.example`` (when present) is used as a layout
    template so its comments and ordering are preserved. Every
    ``KEY=...`` line whose key appears in ``env_vars`` is rewritten with
    the supplied value, whatever the template held before (placeholder or
    real default). Keys that the template does not mention are appended
    at the end, so no value passed in is ever dropped. Without a template
    the file is simply one ``KEY=value`` line per entry.

    :param env_vars: mapping of variable name to the value to persist
    """
    env_path = get_env_path()
    env_path.parent.mkdir(parents=True, exist_ok=True)

    # Read the template (if the package ships one)
    template_path = Path(__file__).parent.parent / '.env.example'
    if template_path.exists():
        with open(template_path, 'r') as f:
            template_lines = f.read().splitlines()
    else:
        template_lines = []

    rendered = []
    written = set()
    for line in template_lines:
        stripped = line.strip()
        if stripped and not stripped.startswith('#') and '=' in stripped:
            key = stripped.split('=', 1)[0].strip()
            if key in env_vars:
                # Replace by key rather than by matching a placeholder string,
                # so template defaults such as numeric settings are updated too.
                rendered.append(f"{key}={env_vars[key]}")
                written.add(key)
                continue
        rendered.append(line)

    # Anything the template did not cover still has to be persisted.
    rendered.extend(
        f"{key}={value}" for key, value in env_vars.items() if key not in written
    )

    content = '\n'.join(rendered)
    if rendered:
        content += '\n'

    with open(env_path, 'w') as f:
        f.write(content)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def prompt_for_value(
    key: str,
    description: str,
    link: Optional[str] = None,
    current_value: Optional[str] = None,
    required: bool = True
) -> Optional[str]:
    """Prompt the user for a configuration value.

    :param key: variable name; used to recognise the ``your_<key>_here``
        template placeholder, which is treated as "no current value"
    :param description: human-readable title printed above the prompt
    :param link: optional URL telling the user where to obtain the value
    :param current_value: value already stored, offered as the default
    :param required: when True, an empty answer with no usable current
        value re-prompts instead of returning
    :return: the stripped input; ``current_value`` when the user presses
        Enter and a real current value exists; ``None`` when the answer is
        empty and the value is optional
    """
    placeholder = f"your_{key.lower()}_here"
    has_current = bool(current_value) and current_value != placeholder

    # Loop (rather than recurse) on empty required input so that repeated
    # empty answers can never exhaust the interpreter's recursion limit.
    while True:
        print(f"\n{'='*70}")
        print(f" {description}")
        if link:
            print(f" Get your key: {link}")
        if has_current:
            # Only a prefix of long values is echoed back.
            if len(current_value) > 20:
                print(f" Current value: {current_value[:20]}...")
            else:
                print(f" Current value: {current_value}")
            prompt = " Enter new value (press Enter to keep current): "
        else:
            prompt = f" Enter value{' (required)' if required else ' (optional)'}: "

        print(f"{'='*70}")

        value = input(prompt).strip()
        if value:
            return value
        if has_current:
            return current_value
        if not required:
            return None
        print(" ❌ This value is required!")
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def validate_api_key(key_name: str, api_key: str) -> bool:
    """Validate an API key and print the outcome on the current line.

    "Google API" and "OpenAI API" keys are checked with a live
    list-models request. "LlamaParse API" and "DigiKey API" keys are only
    checked for a minimum length (more than 20 and more than 10
    characters respectively) because no lightweight test request is
    made for them. Any other ``key_name`` is not checked at all.

    :param key_name: label selecting the validation strategy
    :param api_key: the credential to check
    :return: True when the check passes or no check exists for
        ``key_name``; False when the check fails or raises
    """
    print(f"\n Validating {key_name}...", end=' ')
    sys.stdout.flush()

    try:
        if key_name == "Google API":
            # Imported lazily so a missing SDK is reported as a failed check
            import google.generativeai as genai
            genai.configure(api_key=api_key)
            # Test with a simple list models call
            list(genai.list_models())
            print("✅")
            return True

        if key_name == "OpenAI API":
            from openai import OpenAI
            client = OpenAI(api_key=api_key)
            # Test with a simple models list call
            client.models.list()
            print("✅")
            return True

        if key_name in ("LlamaParse API", "DigiKey API"):
            # Format check only: no HTTP client is needed here, so nothing
            # is imported that could fail and mask a well-formed key.
            min_length = 20 if key_name == "LlamaParse API" else 10
            if len(api_key) > min_length:
                print("✅ (format check)")
                return True
            print("❌ Invalid format")
            return False

    except Exception as e:
        print(f"❌ ({str(e)[:50]}...)")
        return False

    # Unknown key names have no validator and are accepted as-is.
    return True
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def run_configuration_wizard() -> None:
    """Walk the user through every setting and save the result to .env.

    Credentials are prompted for one at a time (existing values are
    offered as defaults) and validated where a validator label is given;
    the validation result is informational only. The vector-database
    settings keep their existing/default values unless the user opts to
    change them. Ctrl+C prints a notice and exits with status 0.
    """
    rule = "=" * 70

    # (env key, prompt title, help link, label passed to validate_api_key)
    credentials = (
        ("google_api_key", "Google Gemini API Key",
         "https://aistudio.google.com/app/apikey", "Google API"),
        ("openai_api_key", "OpenAI API Key (for embeddings)",
         "https://platform.openai.com/api-keys", "OpenAI API"),
        ("llama_cloud_api_key", "LlamaParse API Key (for PDF parsing)",
         "https://cloud.llamaindex.ai/", "LlamaParse API"),
        ("digikey_client_id", "DigiKey Client ID (for datasheet downloads)",
         "https://developer.digikey.com/", "DigiKey API"),
        ("digikey_client_secret", "DigiKey Client Secret", None, None),
    )

    # (env key, label shown to the user, default)
    vector_settings = (
        ("chunk_size", "Chunk size", "1000"),
        ("chunk_overlap", "Chunk overlap", "200"),
        ("embedding_model", "Embedding model", "text-embedding-3-small"),
        ("embedding_dimensions", "Embedding dimensions", "1536"),
    )

    print("\n" + rule)
    print(" MFCLI CONFIGURATION WIZARD")
    print(rule)
    print("\n This wizard will help you configure mfcli with your API keys.")
    print(" You can press Ctrl+C at any time to exit.\n")

    try:
        # Start from whatever is already configured
        existing_env = read_existing_env()
        new_env = existing_env.copy()

        for env_key, title, help_link, validator_label in credentials:
            answer = prompt_for_value(
                env_key,
                title,
                help_link,
                existing_env.get(env_key),
                required=True
            )
            if not answer:
                continue
            new_env[env_key] = answer
            if validator_label is not None:
                validate_api_key(validator_label, answer)

        # Embedding configuration
        print("\n" + rule)
        print(" Vector Database Configuration")
        print(rule)
        print(" Using default values:")
        for _, label, default in vector_settings:
            print(f" - {label}: {default}")

        wants_custom = input("\n Change these defaults? (y/N): ").strip().lower() == 'y'

        for env_key, label, default in vector_settings:
            if wants_custom:
                typed = input(f" {label} [{default}]: ").strip()
                new_env[env_key] = typed or default
            else:
                new_env[env_key] = existing_env.get(env_key, default)

        # Persist and report
        write_env_file(new_env)

        env_path = get_env_path()
        print("\n" + rule)
        print(" ✅ Configuration saved successfully!")
        print(f" Location: {env_path}")
        print(rule)
        print("\n Next steps:")
        print(" 1. Run 'mfcli init' in your hardware project directory")
        print(" 2. Run 'mfcli run' to process your documents")
        print(" 3. (Optional) Run 'mfcli setup-mcp' to configure MCP server")
        print("\n")

    except KeyboardInterrupt:
        print("\n\n ⚠️ Configuration cancelled.")
        sys.exit(0)
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def check_configuration() -> None:
    """Print a report on the stored configuration.

    Reports whether the .env file exists, which required credentials are
    set (a value that is empty or still starts with ``your_`` counts as
    not configured; set values are shown truncated to 8 characters), and
    the vector-database settings. No network validation is performed.
    """
    rule = "=" * 70
    print("\n" + rule)
    print(" CONFIGURATION CHECK")
    print(rule)

    env_path = get_env_path()

    if not env_path.exists():
        print(f"\n ❌ Configuration file not found: {env_path}")
        print("\n Run 'mfcli configure' to create your configuration.")
        return

    print(f"\n Configuration file: {env_path}")

    env_vars = read_existing_env()

    required_keys = [
        ("google_api_key", "Google Gemini API"),
        ("openai_api_key", "OpenAI API"),
        ("llama_cloud_api_key", "LlamaParse API"),
        ("digikey_client_id", "DigiKey Client ID"),
        ("digikey_client_secret", "DigiKey Client Secret"),
    ]

    print("\n Checking configuration:")
    missing = 0

    for env_key, label in required_keys:
        stored = env_vars.get(env_key)
        if stored and not stored.startswith("your_"):
            if len(stored) > 8:
                shown = stored[:8] + "..."
            else:
                shown = stored
            print(f" ✅ {label}: {shown}")
        else:
            print(f" ❌ {label}: Not configured")
            missing += 1

    print("\n Vector database configuration:")
    for label, env_key in (
        ("Chunk size", "chunk_size"),
        ("Chunk overlap", "chunk_overlap"),
        ("Embedding model", "embedding_model"),
        ("Embedding dimensions", "embedding_dimensions"),
    ):
        print(f" - {label}: {env_vars.get(env_key, 'Not set')}")

    if missing == 0:
        print("\n ✅ All required configuration values are set!")
        print("\n To validate API keys, run: mfcli doctor")
    else:
        print("\n ⚠️ Some configuration values are missing.")
        print(" Run 'mfcli configure' to complete your configuration.")

    print(rule + "\n")
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import os.path
|
|
2
|
+
import shutil
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from textwrap import dedent
|
|
6
|
+
from typing import List
|
|
7
|
+
|
|
8
|
+
from mfcli.client.chroma_db import ChromaClient
|
|
9
|
+
from mfcli.models.project import Project
|
|
10
|
+
from mfcli.utils.config import get_config
|
|
11
|
+
from mfcli.utils.directory_manager import app_dirs, init_directory_structure
|
|
12
|
+
from mfcli.utils.logger import get_logger, setup_logging
|
|
13
|
+
from mfcli.utils.orm import Session
|
|
14
|
+
from mfcli.utils.query_service import QueryService
|
|
15
|
+
|
|
16
|
+
logger = get_logger(__name__)
|
|
17
|
+
|
|
18
|
+
# Confirmation prompt passed to input() by clean_app_data() before anything
# is deleted. dedent() removes the literal's common leading indentation.
warning_message = dedent(
    """

    WARNING: This will permanently delete all mfcli data, including datasheets, cheat sheets, and project data.
    Should we proceed? (Y/n):

    """
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class DataCleaner:
    """Removes the stored data of every project known to the database.

    For each ``Project`` row this deletes the project's Chroma collection,
    its ``.multifactor`` directory, the directories exposed by
    ``app_dirs``, and finally the row itself.
    """

    def __init__(self, db: Session):
        """
        :param db: open database session used for querying and deleting
        """
        self._db = db
        self._query_service = QueryService(db)
        self._config = get_config()

    @staticmethod
    def _remove_dir(dir_path: Path):
        """
        Recursively delete a directory, logging instead of raising
        :param dir_path: directory to remove; a missing directory only logs a warning
        """
        if os.path.isdir(dir_path):
            try:
                shutil.rmtree(dir_path)
            except Exception as e:
                # Best effort: one failed directory must not abort the clean
                logger.exception(e)
                logger.error(f"Error deleting directory: {dir_path}")
        else:
            logger.warning(f"Directory does not exist: {dir_path}")

    def clean(self):
        """Delete the data of all projects, committing after each project."""
        logger.info("Cleaning mfcli data")
        projects: List[Project] = self._query_service.query_all(Project)
        for project in projects:
            # NOTE(review): presumably re-points app_dirs at this project's
            # repo so the directories below are project-specific - confirm.
            init_directory_structure(project.repo_dir)
            config_dir = Path(project.repo_dir) / ".multifactor"
            ChromaClient(project.index_id).delete_collection()
            targets = (
                config_dir,
                app_dirs.agent_instructions_dir,
                app_dirs.data_sheets_dir,
                app_dirs.fw_tasks_dir,
                app_dirs.generated_files_dir,
                app_dirs.reqs_dir,
                app_dirs.cheat_sheets_dir,
                app_dirs.pdf_parts_dir,
            )
            for target in targets:
                logger.debug(f"Removing directory: {target}")
                self._remove_dir(target)
            self._db.delete(project)
            self._db.commit()
        logger.info("All mfcli data has been cleaned")
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def run_data_cleaner():
    """Open a database session and run ``DataCleaner.clean`` inside it."""
    with Session() as session:
        cleaner = DataCleaner(session)
        cleaner.clean()
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def clean_app_data(user_accepted: bool = False):
    """Delete all mfcli data, asking for confirmation first unless pre-approved.

    :param user_accepted: when True the confirmation prompt is skipped
    :raises SystemExit: when the user does not confirm
    """
    setup_logging()
    if not user_accepted:
        answer = input(warning_message).strip().lower()
        # The prompt offers "Y/n", so a lowercase "y" (or "yes") must count as
        # consent too. Anything else, including an empty answer, cancels:
        # the operation is destructive, so there is no implicit default.
        if answer not in ('y', 'yes'):
            logger.debug("User cancelled")
            sys.exit()
    run_data_cleaner()
|
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
from urllib.parse import urlparse, unquote
|
|
4
|
+
|
|
5
|
+
from playwright.async_api import async_playwright, Browser
|
|
6
|
+
from requests import RequestException
|
|
7
|
+
from sqlmodel import select
|
|
8
|
+
|
|
9
|
+
from mfcli.client.chroma_db import ChromaClient
|
|
10
|
+
from mfcli.client.docling import DoclingChunker
|
|
11
|
+
from mfcli.client.vector_db import DocumentVectorizer
|
|
12
|
+
from mfcli.constants.file_types import PDFMimeTypes
|
|
13
|
+
from mfcli.digikey.digikey import DigiKey
|
|
14
|
+
from mfcli.models.bom import BOM
|
|
15
|
+
from mfcli.models.datasheet import Datasheet
|
|
16
|
+
from mfcli.pipeline.extractor import TextExtractor
|
|
17
|
+
from mfcli.utils.directory_manager import app_dirs
|
|
18
|
+
from mfcli.utils.http_requests import http_request
|
|
19
|
+
from mfcli.utils.logger import get_logger
|
|
20
|
+
from mfcli.utils.orm import Session
|
|
21
|
+
from mfcli.utils.tools import get_mime_type_from_bytes
|
|
22
|
+
|
|
23
|
+
logger = get_logger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class DatasheetVectorizer:
    """Downloads datasheet PDFs and vectorizes files/text into the vector DB."""

    def __init__(self, chroma_db: ChromaClient):
        """
        :param chroma_db: Chroma client handed to the DocumentVectorizer
        """
        self._extractor = TextExtractor()
        self._vectorizer = DocumentVectorizer(chroma_db)
        self._docling = DoclingChunker()

    @staticmethod
    def _build_metadata(file_name: str, purpose: str, additional_metadata: dict = None) -> dict:
        """
        Build the metadata dict stored alongside vectorized content
        :param file_name: Name of the file
        :param purpose: Purpose of the vectorization
        :param additional_metadata: Optional extra entries; they override the base keys
        :return: New metadata dict
        """
        metadata = {"file_name": file_name, "purpose": purpose}
        if additional_metadata:
            metadata.update(additional_metadata)
        return metadata

    def _vectorize_text(self, text: str, file_name: str, purpose: str, additional_metadata: dict = None):
        """
        Shared method to vectorize text with metadata
        :param text: Extracted text content
        :param file_name: Name of the file
        :param purpose: Purpose of the vectorization (e.g., 'datasheet', 'bom', 'errata')
        :param additional_metadata: Optional additional metadata to include
        """
        metadata = self._build_metadata(file_name, purpose, additional_metadata)
        self._vectorizer.vectorize(text, metadata)
        logger.debug(f"File vectorized: {file_name} (purpose: {purpose})")

    @staticmethod
    async def _fetch_with_playwright(browser: Browser, url: str):
        """
        Fetch a URL through a fresh Playwright browser context
        :param browser: Running Playwright browser
        :param url: URL to fetch
        :return: Response body bytes
        """
        context = await browser.new_context()
        try:
            response = await context.request.get(url)
            return await response.body()
        finally:
            # One context is opened per URL; close it so contexts do not
            # pile up for the lifetime of the browser.
            await context.close()

    @staticmethod
    def _parse_ti_url(url: str) -> str:
        """
        Texas Instruments URLs may have a gotoUrl param which is the real URL of the PDF
        :param url: TI URL
        :return: Decoded URL from the gotoUrl param, or the original URL when
            the param is absent or empty
        """
        url_query_params = urlparse(url).query
        if not url_query_params:
            return url
        for param in url_query_params.split('&'):
            # partition tolerates params without '=' and keeps any '=' that
            # appears inside the value.
            name, _, value = param.partition('=')
            if name == 'gotoUrl' and value:
                return unquote(value)
        return url

    @staticmethod
    def _save_datasheet(name: str, content: bytes):
        """
        Write datasheet bytes into the datasheets directory
        :param name: File name to save under
        :param content: File bytes
        """
        file_path = app_dirs.data_sheets_dir / name
        with open(file_path, "wb") as f:
            f.write(content)

    async def _download(self, browser: Browser, url: str, purpose: str = "datasheet"):
        """
        Download one datasheet and save it; failures are logged, not raised,
        except for errors from the Playwright retry itself
        :param browser: Playwright browser used as a fallback fetcher
        :param url: Datasheet URL
        :param purpose: Currently unused; kept for interface compatibility
        """
        logger.debug(f"Fetching datasheet: {url}")
        try:
            ti_url_regex = r"^https?://www\.ti\.com/.+$"
            if re.match(ti_url_regex, url, re.I):
                logger.debug(f"URL is a TI URL: {url}")
                url = self._parse_ti_url(url)
                logger.debug(f"Parsed TI URL: {url}")
            url_path = urlparse(url).path
        except ValueError as e:
            logger.debug(f"Unable to parse datasheet URL: {url}")
            logger.debug(e)
            return
        file_name = os.path.basename(url_path)
        # Case-insensitive so "X.PDF" is not saved as "X.PDF.pdf"
        if not file_name.lower().endswith(".pdf"):
            file_name = f"{file_name}.pdf"
        try:
            content = http_request(method='GET', url=url).content
            mime_type = get_mime_type_from_bytes(content, file_name)
            if mime_type not in PDFMimeTypes:
                logger.debug(f"Retrieved PDF is not PDF MIME type: {url}")
                logger.debug(f"Retrying with playwright: {url}")
                content = await self._fetch_with_playwright(browser, url)
        except RequestException as e:
            logger.debug(e)
            logger.debug(f"HTTP error fetching PDF: {url}")
            logger.debug(f"Retrying with playwright: {url}")
            content = await self._fetch_with_playwright(browser, url)
        except Exception as e:
            logger.debug(f"Unhandled error fetching datasheet URL: {url}")
            logger.debug(e)
            return
        # Re-check: the content may have come from the playwright retry
        mime_type = get_mime_type_from_bytes(content, file_name)
        if mime_type not in PDFMimeTypes:
            logger.debug(f"Could not fetch PDF even with playwright: {url}")
            return
        try:
            self._save_datasheet(file_name, content)
        except Exception as e:
            logger.debug(e)
            logger.debug(f"Error saving datasheet: {file_name}")

    async def download(self, urls: list[str], purpose: str = "datasheet"):
        """
        Download every URL and save the PDFs to the datasheets directory.
        Despite the log wording, nothing is vectorized here.
        :param urls: Datasheet URLs
        :param purpose: Forwarded to _download and used in log output
        """
        if not urls:
            logger.debug("No datasheets to vectorize, exiting")
            return
        logger.debug(f"Vectorizing {len(urls)} documents (purpose: {purpose})")
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                for url in urls:
                    try:
                        await self._download(browser, url, purpose)
                    except Exception as e:
                        # One bad URL must not stop the remaining downloads
                        logger.debug(e)
                        logger.debug(f"Error processing document: {url}")
            finally:
                await browser.close()

    def vectorize_local_file(self, file_path: str, purpose: str, additional_metadata: dict = None):
        """
        Vectorize a local file (e.g., generated by agents)
        :param file_path: Path to the local file
        :param purpose: Purpose of the vectorization (e.g., 'bom', 'errata', 'functional_blocks')
        :param additional_metadata: Optional additional metadata to include
        :raises Exception: Any extraction/vectorization error is logged and re-raised
        """
        try:
            logger.debug(f"Vectorizing local file: {file_path} (purpose: {purpose})")
            file_name = os.path.basename(file_path)

            # A missing file is logged and skipped rather than raised
            if not os.path.exists(file_path):
                logger.error(f"File does not exist: {file_path}")
                return

            # Extract text based on file type
            with open(file_path, 'rb') as f:
                content = f.read()

            mime_type = get_mime_type_from_bytes(content, file_name)

            if mime_type in PDFMimeTypes:
                text = self._extractor.extract_pdf_bytes(content)
            else:
                # For non-PDF files, use the general extractor
                text = self._extractor.extract_text_from_file_bytes(file_name, content)

            logger.debug(f"Text extracted from local file: {file_path}")
            self._vectorize_text(text, file_name, purpose, additional_metadata)

        except Exception as e:
            logger.error(f"Error vectorizing local file: {file_path}")
            logger.exception(e)
            raise

    def vectorize_local_files(self, file_paths: list[str], purpose: str, additional_metadata: dict = None):
        """
        Vectorize multiple local files; a failing file is logged and skipped
        :param file_paths: List of paths to local files
        :param purpose: Purpose of the vectorization (e.g., 'bom', 'errata', 'functional_blocks')
        :param additional_metadata: Optional additional metadata to include
        """
        if not file_paths:
            logger.debug("No files to vectorize, exiting")
            return

        logger.debug(f"Vectorizing {len(file_paths)} local files (purpose: {purpose})")
        for file_path in file_paths:
            try:
                self.vectorize_local_file(file_path, purpose, additional_metadata)
            except Exception as e:
                logger.exception(e)
                logger.error(f"Error processing local file: {file_path}")
        logger.debug(f"Finished vectorizing {len(file_paths)} local files")

    def vectorize_text_content(self, text: str, file_name: str, purpose: str, additional_metadata: dict = None):
        """
        Vectorize text content directly (e.g., from agent output)
        :param text: Text content to vectorize
        :param file_name: Name to associate with this content
        :param purpose: Purpose of the vectorization (e.g., 'bom', 'errata', 'functional_blocks')
        :param additional_metadata: Optional additional metadata to include
        :raises Exception: Any vectorization error is logged and re-raised
        """
        try:
            logger.debug(f"Vectorizing text content: {file_name} (purpose: {purpose})")
            self._vectorize_text(text, file_name, purpose, additional_metadata)
        except Exception as e:
            logger.error(f"Error vectorizing text content: {file_name}")
            logger.exception(e)
            raise

    def vectorize_file_buf(
            self,
            file_bytes: bytes,
            file_name: str,
            purpose: str,
            additional_metadata: dict = None
    ) -> None:
        """
        Vectorize a file from a buffer. This vectorizer uses DoclingChunker.
        :param file_bytes: file bytes
        :param file_name: file name
        :param purpose: file purpose
        :param additional_metadata: dict of metadata
        :return: None
        """
        chunks = self._docling.chunk(file_name, file_bytes)
        metadata = self._build_metadata(file_name, purpose, additional_metadata)
        self._vectorizer.vectorize_chunks(chunks, metadata)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
async def get_datasheets_for_bom_entries(db: Session, chroma_db: ChromaClient, entries: list[BOM]):
    """
    Resolve a datasheet URL for each BOM entry and download the new ones.

    Known URLs come from the Datasheet table; unknown part numbers are
    looked up through DigiKey. Newly found URLs are added to the session
    as Datasheet rows (this function does not commit) and then downloaded.
    Each part number is looked up and recorded at most once, even when
    several BOM entries share it.
    :param db: Database session
    :param chroma_db: Chroma client passed to DatasheetVectorizer
    :param entries: BOM entries; entry.datasheet is set on processed entries
    :raises Exception: Re-raises any error from the download step
    """
    logger.info(f"Fetching datasheets for {len(entries)} BOM entries")
    part_numbers = {entry.value for entry in entries}
    logger.debug("Fetching existing datasheets for part numbers")

    # Fetch existing datasheets
    stmt = select(Datasheet).where(Datasheet.part_number.in_(part_numbers))
    datasheets: list[Datasheet] = db.execute(stmt).scalars().all()
    datasheet_map = {d.part_number: d.datasheet for d in datasheets}

    logger.debug(f"Datasheet map: {datasheet_map}")
    client = DigiKey()
    new_datasheets: list[Datasheet] = []
    datasheet_urls: list[str] = []
    # Reference-designator prefixes that are skipped. R/C/L are resistors,
    # capacitors and inductors; J/T/D are presumably connectors,
    # transformers/test points and diodes - verify with the BOM conventions.
    skipped_prefixes = ('R', 'C', 'L', 'J', 'T', 'D')
    for entry in entries:
        try:
            logger.debug(f"Processing BOM entry: {entry.value}")

            ref = entry.reference
            if ref.startswith(skipped_prefixes):
                logger.debug(f"Skipping BOM entry {entry.value} with reference: {ref}")
                continue

            existing_datasheet = datasheet_map.get(entry.value)
            if not existing_datasheet:
                logger.debug(f"Datasheet does not exist for {entry.value}")
            entry.datasheet = existing_datasheet or client.datasheet(entry.value)

            # If datasheet is new create it in DB
            if not existing_datasheet and entry.datasheet:
                logger.debug(f"Adding new datasheet for {entry.value}: {entry.datasheet}")
                new_datasheets.append(Datasheet(part_number=entry.value, datasheet=entry.datasheet))
                datasheet_urls.append(entry.datasheet)
                # Remember the result so a later entry with the same part
                # number reuses it instead of triggering another lookup,
                # a duplicate row and a duplicate download.
                datasheet_map[entry.value] = entry.datasheet
        except Exception as e:
            # Best effort per entry: log and carry on with the next one
            logger.error(f"Error adding datasheet for BOM entry: {entry.value}")
            logger.exception(e)
    if new_datasheets:
        db.add_all(new_datasheets)
    if datasheet_urls:
        try:
            await DatasheetVectorizer(chroma_db).download(datasheet_urls)
        except Exception:
            logger.error("Error vectorizing datasheets for BOM")
            raise
    logger.debug("Finished adding datasheets")
|