jarvis-ai-assistant 0.1.138__py3-none-any.whl → 0.1.141__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of jarvis-ai-assistant might be problematic.
Files changed (85)
  1. jarvis/__init__.py +1 -1
  2. jarvis/jarvis_agent/__init__.py +62 -14
  3. jarvis/jarvis_agent/builtin_input_handler.py +4 -14
  4. jarvis/jarvis_agent/main.py +1 -1
  5. jarvis/jarvis_agent/patch.py +37 -40
  6. jarvis/jarvis_agent/shell_input_handler.py +2 -3
  7. jarvis/jarvis_code_agent/code_agent.py +23 -30
  8. jarvis/jarvis_code_analysis/checklists/__init__.py +3 -0
  9. jarvis/jarvis_code_analysis/checklists/c_cpp.py +50 -0
  10. jarvis/jarvis_code_analysis/checklists/csharp.py +75 -0
  11. jarvis/jarvis_code_analysis/checklists/data_format.py +82 -0
  12. jarvis/jarvis_code_analysis/checklists/devops.py +107 -0
  13. jarvis/jarvis_code_analysis/checklists/docs.py +87 -0
  14. jarvis/jarvis_code_analysis/checklists/go.py +52 -0
  15. jarvis/jarvis_code_analysis/checklists/infrastructure.py +98 -0
  16. jarvis/jarvis_code_analysis/checklists/java.py +66 -0
  17. jarvis/jarvis_code_analysis/checklists/javascript.py +73 -0
  18. jarvis/jarvis_code_analysis/checklists/kotlin.py +107 -0
  19. jarvis/jarvis_code_analysis/checklists/loader.py +76 -0
  20. jarvis/jarvis_code_analysis/checklists/php.py +77 -0
  21. jarvis/jarvis_code_analysis/checklists/python.py +56 -0
  22. jarvis/jarvis_code_analysis/checklists/ruby.py +107 -0
  23. jarvis/jarvis_code_analysis/checklists/rust.py +58 -0
  24. jarvis/jarvis_code_analysis/checklists/shell.py +75 -0
  25. jarvis/jarvis_code_analysis/checklists/sql.py +72 -0
  26. jarvis/jarvis_code_analysis/checklists/swift.py +77 -0
  27. jarvis/jarvis_code_analysis/checklists/web.py +97 -0
  28. jarvis/jarvis_code_analysis/code_review.py +660 -0
  29. jarvis/jarvis_dev/main.py +61 -88
  30. jarvis/jarvis_git_squash/main.py +3 -3
  31. jarvis/jarvis_git_utils/git_commiter.py +242 -0
  32. jarvis/jarvis_init/main.py +62 -0
  33. jarvis/jarvis_platform/base.py +4 -0
  34. jarvis/jarvis_platform/kimi.py +173 -5
  35. jarvis/jarvis_platform/openai.py +3 -0
  36. jarvis/jarvis_platform/registry.py +1 -0
  37. jarvis/jarvis_platform/yuanbao.py +275 -5
  38. jarvis/jarvis_tools/ask_codebase.py +6 -9
  39. jarvis/jarvis_tools/ask_user.py +17 -5
  40. jarvis/jarvis_tools/base.py +3 -1
  41. jarvis/jarvis_tools/chdir.py +1 -0
  42. jarvis/jarvis_tools/create_code_agent.py +4 -3
  43. jarvis/jarvis_tools/create_sub_agent.py +1 -0
  44. jarvis/jarvis_tools/execute_script.py +170 -0
  45. jarvis/jarvis_tools/file_analyzer.py +90 -239
  46. jarvis/jarvis_tools/file_operation.py +99 -31
  47. jarvis/jarvis_tools/{find_methodolopy.py → find_methodology.py} +2 -1
  48. jarvis/jarvis_tools/lsp_get_diagnostics.py +2 -0
  49. jarvis/jarvis_tools/methodology.py +11 -11
  50. jarvis/jarvis_tools/read_code.py +2 -0
  51. jarvis/jarvis_tools/read_webpage.py +33 -196
  52. jarvis/jarvis_tools/registry.py +68 -131
  53. jarvis/jarvis_tools/search_web.py +14 -6
  54. jarvis/jarvis_tools/virtual_tty.py +399 -0
  55. jarvis/jarvis_utils/config.py +29 -3
  56. jarvis/jarvis_utils/embedding.py +0 -317
  57. jarvis/jarvis_utils/file_processors.py +343 -0
  58. jarvis/jarvis_utils/input.py +0 -1
  59. jarvis/jarvis_utils/methodology.py +94 -435
  60. jarvis/jarvis_utils/utils.py +207 -9
  61. {jarvis_ai_assistant-0.1.138.dist-info → jarvis_ai_assistant-0.1.141.dist-info}/METADATA +4 -4
  62. jarvis_ai_assistant-0.1.141.dist-info/RECORD +94 -0
  63. {jarvis_ai_assistant-0.1.138.dist-info → jarvis_ai_assistant-0.1.141.dist-info}/entry_points.txt +4 -4
  64. jarvis/jarvis_code_agent/file_select.py +0 -202
  65. jarvis/jarvis_platform/ai8.py +0 -268
  66. jarvis/jarvis_platform/ollama.py +0 -137
  67. jarvis/jarvis_platform/oyi.py +0 -307
  68. jarvis/jarvis_rag/file_processors.py +0 -138
  69. jarvis/jarvis_rag/main.py +0 -1734
  70. jarvis/jarvis_tools/code_review.py +0 -333
  71. jarvis/jarvis_tools/execute_python_script.py +0 -58
  72. jarvis/jarvis_tools/execute_shell.py +0 -97
  73. jarvis/jarvis_tools/execute_shell_script.py +0 -58
  74. jarvis/jarvis_tools/find_caller.py +0 -278
  75. jarvis/jarvis_tools/find_symbol.py +0 -295
  76. jarvis/jarvis_tools/function_analyzer.py +0 -331
  77. jarvis/jarvis_tools/git_commiter.py +0 -167
  78. jarvis/jarvis_tools/project_analyzer.py +0 -304
  79. jarvis/jarvis_tools/rag.py +0 -143
  80. jarvis/jarvis_tools/tool_generator.py +0 -221
  81. jarvis_ai_assistant-0.1.138.dist-info/RECORD +0 -85
  82. /jarvis/{jarvis_rag → jarvis_init}/__init__.py +0 -0
  83. {jarvis_ai_assistant-0.1.138.dist-info → jarvis_ai_assistant-0.1.141.dist-info}/LICENSE +0 -0
  84. {jarvis_ai_assistant-0.1.138.dist-info → jarvis_ai_assistant-0.1.141.dist-info}/WHEEL +0 -0
  85. {jarvis_ai_assistant-0.1.138.dist-info → jarvis_ai_assistant-0.1.141.dist-info}/top_level.txt +0 -0
@@ -33,181 +33,6 @@ def get_context_token_count(text: str) -> int:
      # Fall back to a rough character-based estimate
      return len(text) // 4  # Rough estimate of about 4 characters per token
  
- @functools.lru_cache(maxsize=1)
- def load_embedding_model() -> SentenceTransformer:
-     """
-     Load the sentence embedding model, using a cache to avoid repeated loading.
- 
-     Returns:
-         SentenceTransformer: the loaded embedding model
-     """
-     model_name = "BAAI/bge-m3"
-     cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
- 
-     # Check whether the model is already in the global cache
-     if model_name in _global_models:
-         return _global_models[model_name]
- 
-     try:
-         embedding_model = SentenceTransformer(
-             model_name,
-             cache_folder=cache_dir,
-             local_files_only=True
-         )
-     except Exception:
-         embedding_model = SentenceTransformer(
-             model_name,
-             cache_folder=cache_dir,
-             local_files_only=False
-         )
- 
-     # Move the model to the GPU if one is available
-     if torch.cuda.is_available():
-         embedding_model.to(torch.device("cuda"))
- 
-     # Save to the global cache
-     _global_models[model_name] = embedding_model
- 
-     return embedding_model
- 
- def get_embedding(embedding_model: Any, text: str) -> np.ndarray:
-     """
-     Generate an embedding vector for the given text.
- 
-     Args:
-         embedding_model: the embedding model to use
-         text: the input text to embed
- 
-     Returns:
-         np.ndarray: the embedding vector
-     """
-     embedding = embedding_model.encode(text,
-                                        normalize_embeddings=True,
-                                        show_progress_bar=False)
-     return np.array(embedding, dtype=np.float32)
- 
- def get_embedding_batch(embedding_model: Any, prefix: str, texts: List[str], spinner: Optional[Yaspin] = None, batch_size: int = 8) -> np.ndarray:
-     """
-     Generate embedding vectors for a batch of texts with efficient batching, optimized for RAG.
- 
-     Args:
-         embedding_model: the embedding model to use
-         prefix: prefix for the progress indicator
-         texts: the list of texts to embed
-         spinner: optional progress indicator
-         batch_size: batch size; larger values may be faster but need more memory
- 
-     Returns:
-         np.ndarray: the stacked embedding vectors
-     """
-     # Simple embedding cache to avoid recomputing identical text chunks
-     embedding_cache = {}
-     cache_hits = 0
- 
-     try:
-         # Preprocessing: split all texts into chunks
-         all_chunks = []
-         chunk_indices = []  # Track the chunk index range for each original text
- 
-         for i, text in enumerate(texts):
-             if spinner:
-                 spinner.text = f"{prefix} preprocessing ({i+1}/{len(texts)}) ..."
- 
-             # Preprocess the text: remove extra whitespace and normalize
-             text = ' '.join(text.split()) if text else ""
- 
-             # Use the more optimized chunking function
-             chunks = split_text_into_chunks(text, 512)
-             start_idx = len(all_chunks)
-             all_chunks.extend(chunks)
-             end_idx = len(all_chunks)
-             chunk_indices.append((start_idx, end_idx))
- 
-         if not all_chunks:
-             return np.zeros((0, embedding_model.get_sentence_embedding_dimension()), dtype=np.float32)
- 
-         # Process all chunks in batches
-         all_vectors = []
-         for i in range(0, len(all_chunks), batch_size):
-             if spinner:
-                 spinner.text = f"{prefix} batch embedding ({i+1}/{len(all_chunks)}) ..."
- 
-             batch = all_chunks[i:i+batch_size]
-             batch_to_process = []
-             batch_indices = []
- 
-             # Check the cache to avoid recomputation
-             for j, chunk in enumerate(batch):
-                 chunk_hash = hash(chunk)
-                 if chunk_hash in embedding_cache:
-                     all_vectors.append(embedding_cache[chunk_hash])
-                     cache_hits += 1
-                 else:
-                     batch_to_process.append(chunk)
-                     batch_indices.append(j)
- 
-             if batch_to_process:
-                 # Process the chunks that are not cached
-                 batch_vectors = embedding_model.encode(
-                     batch_to_process,
-                     normalize_embeddings=True,
-                     show_progress_bar=False,
-                     convert_to_numpy=True,
-                 )
- 
-                 # Handle the results and update the cache
-                 if len(batch_to_process) == 1:
-                     vec = batch_vectors
-                     chunk_hash = hash(batch_to_process[0])
-                     embedding_cache[chunk_hash] = vec
-                     all_vectors.append(vec)
-                 else:
-                     for j, vec in enumerate(batch_vectors):
-                         chunk_hash = hash(batch_to_process[j])
-                         embedding_cache[chunk_hash] = vec
-                         all_vectors.append(vec)
- 
-         # Reassemble the results in the original text order
-         result_vectors = []
-         for start_idx, end_idx in chunk_indices:
-             text_vectors = []
-             for j in range(start_idx, end_idx):
-                 if j < len(all_vectors):
-                     text_vectors.append(all_vectors[j])
- 
-             if text_vectors:
-                 # When a text was split into multiple chunks, take a weighted average
-                 if len(text_vectors) > 1:
-                     # RAG optimization: weighted average over the chunks, earlier chunks weighted slightly higher
-                     weights = np.linspace(1.0, 0.8, len(text_vectors))
-                     weights = weights / weights.sum()  # Normalize the weights
- 
-                     # Apply the weights and sum
-                     weighted_sum = np.zeros_like(text_vectors[0])
-                     for i, vec in enumerate(text_vectors):
-                         # Make sure vector shapes are consistent to handle possible dimension mismatches
-                         vec_array = np.asarray(vec).reshape(weighted_sum.shape)
-                         weighted_sum += vec_array * weights[i]
- 
-                     # Normalize the resulting vector
-                     norm = np.linalg.norm(weighted_sum)
-                     if norm > 0:
-                         weighted_sum = weighted_sum / norm
- 
-                     result_vectors.append(weighted_sum)
-                 else:
-                     # A single chunk is used directly
-                     result_vectors.append(text_vectors[0])
- 
-         if spinner and cache_hits > 0:
-             spinner.text = f"{prefix} cache hits: {cache_hits}/{len(all_chunks)} chunks"
- 
-         return np.vstack(result_vectors)
- 
-     except Exception as e:
-         PrettyOutput.print(f"Batch embedding failed: {str(e)}", OutputType.ERROR)
-         return np.zeros((0, embedding_model.get_sentence_embedding_dimension()), dtype=np.float32)
- 
  def split_text_into_chunks(text: str, max_length: int = 512, min_length: int = 50) -> List[str]:
      """Split text into chunks with overlapping windows, optimized for RAG retrieval.
  
@@ -357,145 +182,3 @@ def load_tokenizer() -> AutoTokenizer:
      _global_tokenizers[model_name] = tokenizer
  
      return tokenizer  # type: ignore
- 
- @functools.lru_cache(maxsize=1)
- def load_rerank_model() -> Tuple[AutoModelForSequenceClassification, AutoTokenizer]:
-     """
-     Load the reranker model and tokenizer, using a cache to avoid repeated loading.
- 
-     Returns:
-         Tuple[AutoModelForSequenceClassification, AutoTokenizer]: the loaded model and tokenizer
-     """
-     model_name = "BAAI/bge-reranker-v2-m3"
-     cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
- 
-     # Check the global cache
-     key = f"rerank_{model_name}"
-     if key in _global_models and f"{key}_tokenizer" in _global_tokenizers:
-         return _global_models[key], _global_tokenizers[f"{key}_tokenizer"]
- 
-     try:
-         tokenizer = AutoTokenizer.from_pretrained(
-             model_name,
-             cache_dir=cache_dir,
-             local_files_only=True
-         )
-         model = AutoModelForSequenceClassification.from_pretrained(
-             model_name,
-             cache_dir=cache_dir,
-             local_files_only=True
-         )
-     except Exception:
-         tokenizer = AutoTokenizer.from_pretrained(
-             model_name,
-             cache_dir=cache_dir,
-             local_files_only=False
-         )
-         model = AutoModelForSequenceClassification.from_pretrained(
-             model_name,
-             cache_dir=cache_dir,
-             local_files_only=False
-         )
- 
-     if torch.cuda.is_available():
-         model = model.cuda()
-     model.eval()
- 
-     # Save to the global cache
-     _global_models[key] = model
-     _global_tokenizers[f"{key}_tokenizer"] = tokenizer
- 
-     return model, tokenizer  # type: ignore
- 
- def rerank_results(query: str, documents: List[str], initial_scores: Optional[List[float]] = None,
-                    batch_size: int = 8, spinner: Optional[Yaspin] = None) -> List[float]:
-     """
-     Rerank retrieval results with a cross-encoder to improve RAG precision.
- 
-     Args:
-         query: the query text
-         documents: the list of document contents to rerank
-         initial_scores: optional initial retrieval scores; if provided, they are fused with the rerank scores
-         batch_size: batch size
-         spinner: optional progress indicator
- 
-     Returns:
-         List[float]: the reranked scores, aligned with the input documents
-     """
-     try:
-         if not documents:
-             return []
- 
-         # Load the reranker model
-         if spinner:
-             spinner.text = "Loading reranker model..."
-         model, tokenizer = load_rerank_model()
- 
-         # Prepare scoring
-         all_scores = []
- 
-         # Process in batches
-         for i in range(0, len(documents), batch_size):
-             if spinner:
-                 spinner.text = f"Reranking progress: {i}/{len(documents)}..."
- 
-             # Prepare the current batch
-             batch_docs = documents[i:i+batch_size]
-             pairs = [(query, doc) for doc in batch_docs]
- 
-             # Encode the inputs
-             with torch.no_grad():
-                 # Type-ignore to avoid mypy errors
-                 inputs = tokenizer(  # type: ignore
-                     pairs,
-                     padding=True,
-                     truncation=True,
-                     return_tensors="pt",
-                     max_length=512
-                 )
- 
-                 # Use GPU acceleration if available
-                 if torch.cuda.is_available():
-                     inputs = {k: v.cuda() for k, v in inputs.items()}
- 
-                 # Get the scores
-                 outputs = model(**inputs)  # type: ignore
-                 scores = outputs.logits.squeeze(-1).cpu().tolist()
- 
-                 # If there is only one document, make sure a list is returned
-                 if len(batch_docs) == 1:
-                     all_scores.append(float(scores))
-                 else:
-                     all_scores.extend(scores)
- 
-         # Normalize the scores to the 0-1 range
-         if all_scores:
-             min_score = min(all_scores)
-             max_score = max(all_scores)
-             if max_score > min_score:
-                 normalized_scores = [(score - min_score) / (max_score - min_score) for score in all_scores]
-             else:
-                 normalized_scores = [0.5] * len(all_scores)
- 
-             # Fuse with the initial scores if provided
-             if initial_scores and len(initial_scores) == len(normalized_scores):
-                 # Weighted-average fusion: initial score weight 0.3, rerank score weight 0.7
-                 final_scores = [0.3 * init_score + 0.7 * rerank_score
-                                 for init_score, rerank_score in zip(initial_scores, normalized_scores)]
-                 return final_scores
- 
-             return normalized_scores
- 
-         if spinner:
-             spinner.text = "Reranking complete"
- 
-         # If reranking failed, return the initial scores or default scores
-         return initial_scores if initial_scores else [0.5] * len(documents)
- 
-     except Exception as e:
-         PrettyOutput.print(f"Reranking failed: {str(e)}", OutputType.ERROR)
-         if spinner:
-             spinner.text = f"Reranking failed: {str(e)}"
- 
-         # Fall back to the initial scores when an error occurs
-         return initial_scores if initial_scores else [0.5] * len(documents)
@@ -0,0 +1,343 @@
+ from pathlib import Path
+ import fitz  # PyMuPDF for PDF files
+ from docx import Document as DocxDocument  # python-docx for DOCX files
+ from pptx import Presentation
+ import pandas as pd
+ import unicodedata
+ 
+ class FileProcessor:
+     """Base class for file processors"""
+     @staticmethod
+     def can_handle(file_path: str) -> bool:
+         """Determine whether the file can be processed"""
+         raise NotImplementedError
+ 
+     @staticmethod
+     def extract_text(file_path: str) -> str:
+         """Extract the file's text content"""
+         raise NotImplementedError
+ 
+ class TextFileProcessor(FileProcessor):
+     """Text file processor"""
+     ENCODINGS = ['utf-8', 'gbk', 'gb2312', 'latin1']
+     SAMPLE_SIZE = 8192  # Read the first 8KB to detect the encoding
+ 
+     @staticmethod
+     def can_handle(file_path: str) -> bool:
+         """Determine whether the file is a text file by trying to decode it"""
+         try:
+             # Read the first part of the file to detect the encoding
+             with open(file_path, 'rb') as f:
+                 sample = f.read(TextFileProcessor.SAMPLE_SIZE)
+ 
+             # Check for null bytes (usually indicates a binary file)
+             if b'\x00' in sample:
+                 return False
+ 
+             # Check for too many non-printable characters (usually indicates a binary file)
+             non_printable = sum(1 for byte in sample if byte < 32 and byte not in (9, 10, 13))  # tab, newline, carriage return
+             if non_printable / len(sample) > 0.3:  # If non-printable characters exceed 30%, treat it as a binary file
+                 return False
+ 
+             # Try to decode with different encodings
+             for encoding in TextFileProcessor.ENCODINGS:
+                 try:
+                     sample.decode(encoding)
+                     return True
+                 except UnicodeDecodeError:
+                     continue
+ 
+             return False
+ 
+         except Exception:
+             return False
+ 
+     @staticmethod
+     def extract_text(file_path: str) -> str:
+         """Extract the text content, using the detected encoding"""
+         detected_encoding = None
+         try:
+             # First try to detect the encoding
+             with open(file_path, 'rb') as f:
+                 raw_data = f.read()
+ 
+             # Try different encodings
+             for encoding in TextFileProcessor.ENCODINGS:
+                 try:
+                     raw_data.decode(encoding)
+                     detected_encoding = encoding
+                     break
+                 except UnicodeDecodeError:
+                     continue
+ 
+             if not detected_encoding:
+                 raise UnicodeDecodeError(f"Failed to decode file with supported encodings: {file_path}")  # type: ignore
+ 
+             # Read the file with the detected encoding
+             with open(file_path, 'r', encoding=detected_encoding, errors='ignore') as f:
+                 content = f.read()
+ 
+             # Normalize Unicode characters
+             content = unicodedata.normalize('NFKC', content)
+ 
+             return content
+ 
+         except Exception as e:
+             raise Exception(f"Failed to read file: {str(e)}")
+ 
+ class PDFProcessor(FileProcessor):
+     """PDF file processor"""
+     @staticmethod
+     def can_handle(file_path: str) -> bool:
+         return Path(file_path).suffix.lower() == '.pdf'
+ 
+     @staticmethod
+     def extract_text(file_path: str) -> str:
+         """Extract all text content from a PDF file, including page numbers, image descriptions, etc."""
+         try:
+             text_parts = []
+             with fitz.open(file_path) as doc:  # type: ignore
+                 # Add document information
+                 info = doc.metadata
+                 if info:
+                     meta_text = []
+                     if info.get("title"):
+                         meta_text.append(f"Title: {info['title']}")
+                     if info.get("author"):
+                         meta_text.append(f"Author: {info['author']}")
+                     if info.get("subject"):
+                         meta_text.append(f"Subject: {info['subject']}")
+                     if info.get("keywords"):
+                         meta_text.append(f"Keywords: {info['keywords']}")
+ 
+                     if meta_text:
+                         text_parts.append("=== Document information ===")
+                         text_parts.append("\n".join(meta_text))
+ 
+                 # Extract the table of contents, if any
+                 toc = doc.get_toc()  # type: ignore
+                 if toc:
+                     text_parts.append("\n=== Table of contents ===")
+                     for level, title, page in toc:
+                         indent = "  " * (level - 1)
+                         text_parts.append(f"{indent}- {title} (page {page})")
+ 
+                 # Process each page
+                 text_parts.append("\n=== Page content ===")
+                 for page_index in range(len(doc)):  # Iterate by index instead of enumerating the document object directly
+                     # Add a page marker
+                     text_parts.append(f"\n--- Page {page_index+1} ---")
+ 
+                     # Get the page
+                     page = doc[page_index]
+ 
+                     # Extract the page text (including structural information)
+                     try:
+                         # Try structured extraction (preserves paragraph and block structure)
+                         text = page.get_text("text")  # type: ignore
+                         text = text.strip()
+                         if text:
+                             text_parts.append(text)
+                     except Exception:
+                         # If structured extraction fails, fall back to plain text extraction
+                         text = page.get_text()  # type: ignore
+                         if text.strip():
+                             text_parts.append(text.strip())
+ 
+                     # Extract image information if needed
+                     # Note: this may increase processing time; enable it on demand
+                     """
+                     image_list = page.get_images()
+                     if image_list:
+                         text_parts.append(f"This page contains {len(image_list)} images")
+                     """
+ 
+             # Merge all the text
+             return "\n".join(text_parts)
+ 
+         except Exception as e:
+             # Handle any exceptions
+             return f"PDF processing error: {str(e)}"
+ 
+ class DocxProcessor(FileProcessor):
+     """DOCX file processor"""
+     @staticmethod
+     def can_handle(file_path: str) -> bool:
+         return Path(file_path).suffix.lower() == '.docx'
+ 
+     @staticmethod
+     def extract_text(file_path: str) -> str:
+         """Extract all text content from a DOCX file, including paragraphs, tables, headers and footers, etc."""
+         doc = DocxDocument(file_path)
+         full_text = []
+ 
+         # Extract paragraph text
+         for para in doc.paragraphs:
+             if para.text.strip():  # Skip empty paragraphs
+                 full_text.append(para.text)
+ 
+         # Extract table text
+         for table in doc.tables:
+             for row in table.rows:
+                 row_texts = []
+                 for cell in row.cells:
+                     # Each cell may contain multiple paragraphs
+                     cell_text = "\n".join([p.text for p in cell.paragraphs if p.text.strip()])
+                     if cell_text:
+                         row_texts.append(cell_text)
+                 if row_texts:
+                     full_text.append(" | ".join(row_texts))
+ 
+         # Extract headers and footers (if there are sections)
+         try:
+             for section in doc.sections:
+                 # Extract the header
+                 if section.header:
+                     header_text = "\n".join([p.text for p in section.header.paragraphs if p.text.strip()])
+                     if header_text:
+                         full_text.append(f"Header: {header_text}")
+ 
+                 # Extract the footer
+                 if section.footer:
+                     footer_text = "\n".join([p.text for p in section.footer.paragraphs if p.text.strip()])
+                     if footer_text:
+                         full_text.append(f"Footer: {footer_text}")
+         except:
+             # If header/footer extraction fails, ignore the error and continue
+             pass
+ 
+         # Merge all the text
+         return "\n\n".join(full_text)
+ 
+ class PPTProcessor(FileProcessor):
+     """PPT file processor"""
+     @staticmethod
+     def can_handle(file_path: str) -> bool:
+         return Path(file_path).suffix.lower() in ['.ppt', '.pptx']
+ 
+     @staticmethod
+     def extract_text(file_path: str) -> str:
+         """Extract all text content from a PPT file, including titles, text boxes, notes, etc."""
+         prs = Presentation(file_path)
+         all_text = []
+ 
+         # Iterate over all slides
+         for slide_index, slide in enumerate(prs.slides, 1):
+             slide_text = []
+ 
+             # Add the slide number
+             slide_text.append(f"=== Slide {slide_index} ===")
+ 
+             # Extract text from all shapes on the slide
+             for shape in slide.shapes:
+                 # Shapes that carry text
+                 try:
+                     if hasattr(shape, "text_frame") and shape.text_frame:  # type: ignore
+                         for paragraph in shape.text_frame.paragraphs:  # type: ignore
+                             text = paragraph.text.strip()
+                             if text:
+                                 slide_text.append(text)
+                 except AttributeError:
+                     pass
+ 
+                 # Extract table content
+                 try:
+                     if hasattr(shape, "table") and shape.table:  # type: ignore
+                         for row in shape.table.rows:  # type: ignore
+                             row_texts = []
+                             for cell in row.cells:
+                                 if hasattr(cell, "text_frame") and cell.text_frame:
+                                     cell_paragraphs = cell.text_frame.paragraphs  # type: ignore
+                                     cell_text = " ".join([p.text.strip() for p in cell_paragraphs if p.text.strip()])
+                                     if cell_text:
+                                         row_texts.append(cell_text)
+                             if row_texts:
+                                 slide_text.append(" | ".join(row_texts))
+                 except AttributeError:
+                     pass
+ 
+             # Extract slide notes
+             try:
+                 if hasattr(slide, "has_notes_slide") and slide.has_notes_slide:
+                     notes_slide = slide.notes_slide
+                     if notes_slide and hasattr(notes_slide, "notes_text_frame") and notes_slide.notes_text_frame:
+                         notes_text = notes_slide.notes_text_frame.text.strip()  # type: ignore
+                         if notes_text:
+                             slide_text.append(f"Notes: {notes_text}")
+             except AttributeError:
+                 pass
+ 
+             # Merge all text for the current slide
+             if len(slide_text) > 1:  # There is content besides the slide number
+                 all_text.append("\n".join(slide_text))
+ 
+         # Return the text content of all slides
+         return "\n\n".join(all_text)
+ 
+ class ExcelProcessor(FileProcessor):
+     """Excel file processor"""
+     @staticmethod
+     def can_handle(file_path: str) -> bool:
+         return Path(file_path).suffix.lower() in ['.xls', '.xlsx']
+ 
+     @staticmethod
+     def extract_text(file_path: str) -> str:
+         """Extract all text content from an Excel file, covering multiple worksheets and formatted content"""
+         try:
+             # Read all worksheets
+             excel_file = pd.ExcelFile(file_path)
+             sheets_text = []
+ 
+             # Process each worksheet
+             for sheet_name in excel_file.sheet_names:
+                 # Read the current worksheet
+                 df = pd.read_excel(file_path, sheet_name=sheet_name)
+ 
+                 # Skip empty sheets
+                 if df.empty:
+                     continue
+ 
+                 # Add the worksheet title
+                 sheet_text = [f"=== Worksheet: {sheet_name} ==="]
+ 
+                 # Fill empty cells to avoid displaying NaN
+                 df = df.fillna("")
+ 
+                 # Extract the column header information
+                 if not df.columns.empty:
+                     headers = [str(col) for col in df.columns]
+                     sheet_text.append("Column headers: " + " | ".join(headers))
+ 
+                 # Try to extract potentially key information from the table
+                 # 1. Table content overview
+                 row_count, col_count = df.shape
+                 sheet_text.append(f"Table size: {row_count} rows x {col_count} columns")
+ 
+                 # 2. Table data, in a friendlier format
+                 try:
+                     # Convert the dataframe to a string representation
+                     # Cap the number of rows and columns to avoid oversized tables
+                     max_rows = min(500, row_count)  # Show at most 500 rows
+                     if row_count > max_rows:
+                         sheet_text.append(f"Note: the table is too large, only the first {max_rows} rows are shown")
+ 
+                     # Convert the DataFrame to a string table
+                     table_str = df.head(max_rows).to_string(index=True, max_rows=max_rows, max_cols=None)
+                     sheet_text.append(table_str)
+ 
+                 except Exception as e:
+                     sheet_text.append(f"Table data extraction error: {str(e)}")
+ 
+                 # Merge the text for the current worksheet
+                 sheets_text.append("\n".join(sheet_text))
+ 
+             # If nothing was extracted, return a hint
+             if not sheets_text:
+                 return "The Excel file is empty or no content could be extracted"
+ 
+             # Merge the text of all worksheets
+             return "\n\n".join(sheets_text)
+ 
+         except Exception as e:
+             # Handle any exceptions and return an error message
+             return f"Excel file processing error: {str(e)}"
@@ -78,7 +78,6 @@ class FileCompleter(Completer):
          default_suggestions = [
              (ot("CodeBase"), 'query the codebase'),
              (ot("Web"), 'web search'),
-             (ot("RAG"), 'knowledge base retrieval'),
              (ot("Summary"), 'summarize'),
              (ot("Clear"), 'clear history'),
              (ot("Methodology"), 'find relevant methodologies'),