local-deep-research 0.3.12__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. local_deep_research/__version__.py +1 -1
  2. local_deep_research/advanced_search_system/filters/base_filter.py +2 -3
  3. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +4 -5
  4. local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +298 -0
  5. local_deep_research/advanced_search_system/findings/repository.py +0 -3
  6. local_deep_research/advanced_search_system/strategies/base_strategy.py +1 -2
  7. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +14 -18
  8. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +4 -8
  9. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +5 -6
  10. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -2
  11. local_deep_research/advanced_search_system/strategies/standard_strategy.py +9 -7
  12. local_deep_research/api/benchmark_functions.py +288 -0
  13. local_deep_research/api/research_functions.py +8 -4
  14. local_deep_research/benchmarks/README.md +162 -0
  15. local_deep_research/benchmarks/__init__.py +51 -0
  16. local_deep_research/benchmarks/benchmark_functions.py +353 -0
  17. local_deep_research/benchmarks/cli/__init__.py +16 -0
  18. local_deep_research/benchmarks/cli/benchmark_commands.py +338 -0
  19. local_deep_research/benchmarks/cli.py +347 -0
  20. local_deep_research/benchmarks/comparison/__init__.py +12 -0
  21. local_deep_research/benchmarks/comparison/evaluator.py +768 -0
  22. local_deep_research/benchmarks/datasets/__init__.py +53 -0
  23. local_deep_research/benchmarks/datasets/base.py +295 -0
  24. local_deep_research/benchmarks/datasets/browsecomp.py +116 -0
  25. local_deep_research/benchmarks/datasets/custom_dataset_template.py +98 -0
  26. local_deep_research/benchmarks/datasets/simpleqa.py +74 -0
  27. local_deep_research/benchmarks/datasets/utils.py +116 -0
  28. local_deep_research/benchmarks/datasets.py +31 -0
  29. local_deep_research/benchmarks/efficiency/__init__.py +14 -0
  30. local_deep_research/benchmarks/efficiency/resource_monitor.py +367 -0
  31. local_deep_research/benchmarks/efficiency/speed_profiler.py +214 -0
  32. local_deep_research/benchmarks/evaluators/__init__.py +18 -0
  33. local_deep_research/benchmarks/evaluators/base.py +74 -0
  34. local_deep_research/benchmarks/evaluators/browsecomp.py +83 -0
  35. local_deep_research/benchmarks/evaluators/composite.py +121 -0
  36. local_deep_research/benchmarks/evaluators/simpleqa.py +271 -0
  37. local_deep_research/benchmarks/graders.py +410 -0
  38. local_deep_research/benchmarks/metrics/README.md +80 -0
  39. local_deep_research/benchmarks/metrics/__init__.py +24 -0
  40. local_deep_research/benchmarks/metrics/calculation.py +385 -0
  41. local_deep_research/benchmarks/metrics/reporting.py +155 -0
  42. local_deep_research/benchmarks/metrics/visualization.py +205 -0
  43. local_deep_research/benchmarks/metrics.py +11 -0
  44. local_deep_research/benchmarks/optimization/__init__.py +32 -0
  45. local_deep_research/benchmarks/optimization/api.py +274 -0
  46. local_deep_research/benchmarks/optimization/metrics.py +20 -0
  47. local_deep_research/benchmarks/optimization/optuna_optimizer.py +1163 -0
  48. local_deep_research/benchmarks/runners.py +434 -0
  49. local_deep_research/benchmarks/templates.py +65 -0
  50. local_deep_research/config/llm_config.py +26 -23
  51. local_deep_research/config/search_config.py +1 -5
  52. local_deep_research/defaults/default_settings.json +108 -7
  53. local_deep_research/search_system.py +16 -8
  54. local_deep_research/utilities/db_utils.py +3 -6
  55. local_deep_research/utilities/es_utils.py +441 -0
  56. local_deep_research/utilities/log_utils.py +36 -0
  57. local_deep_research/utilities/search_utilities.py +8 -9
  58. local_deep_research/web/app.py +7 -9
  59. local_deep_research/web/app_factory.py +9 -12
  60. local_deep_research/web/database/migrations.py +8 -5
  61. local_deep_research/web/database/models.py +20 -0
  62. local_deep_research/web/database/schema_upgrade.py +5 -8
  63. local_deep_research/web/models/database.py +15 -18
  64. local_deep_research/web/routes/benchmark_routes.py +427 -0
  65. local_deep_research/web/routes/research_routes.py +13 -17
  66. local_deep_research/web/routes/settings_routes.py +264 -67
  67. local_deep_research/web/services/research_service.py +47 -57
  68. local_deep_research/web/services/settings_manager.py +1 -4
  69. local_deep_research/web/services/settings_service.py +4 -6
  70. local_deep_research/web/static/css/styles.css +12 -0
  71. local_deep_research/web/static/js/components/logpanel.js +164 -155
  72. local_deep_research/web/static/js/components/research.js +44 -3
  73. local_deep_research/web/static/js/components/settings.js +27 -0
  74. local_deep_research/web/static/js/services/socket.js +47 -0
  75. local_deep_research/web_search_engines/default_search_engines.py +38 -0
  76. local_deep_research/web_search_engines/engines/meta_search_engine.py +100 -33
  77. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +31 -17
  78. local_deep_research/web_search_engines/engines/search_engine_brave.py +8 -3
  79. local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +343 -0
  80. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +14 -6
  81. local_deep_research/web_search_engines/engines/search_engine_local.py +19 -23
  82. local_deep_research/web_search_engines/engines/search_engine_local_all.py +9 -12
  83. local_deep_research/web_search_engines/engines/search_engine_searxng.py +12 -17
  84. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +8 -4
  85. local_deep_research/web_search_engines/search_engine_base.py +22 -5
  86. local_deep_research/web_search_engines/search_engine_factory.py +32 -11
  87. local_deep_research/web_search_engines/search_engines_config.py +14 -1
  88. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/METADATA +10 -2
  89. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/RECORD +92 -49
  90. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/WHEEL +0 -0
  91. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/entry_points.txt +0 -0
  92. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -194,6 +194,20 @@
194
194
  "value": 30000,
195
195
  "visible": true
196
196
  },
197
+ "llm.context_window_size": {
198
+ "category": "llm_parameters",
199
+ "description": "Maximum context window size in tokens for the LLM",
200
+ "editable": true,
201
+ "max_value": 20000000.0,
202
+ "min_value": 512.0,
203
+ "name": "Context Window Size",
204
+ "options": null,
205
+ "step": null,
206
+ "type": "LLM",
207
+ "ui_element": "number",
208
+ "value": 128000,
209
+ "visible": true
210
+ },
197
211
  "llm.supports_max_tokens": {
198
212
  "category": "llm_parameters",
199
213
  "description": "Whether the LLM API supports the 'max_tokens' option.",
@@ -624,6 +638,75 @@
624
638
  "value": false,
625
639
  "visible": true
626
640
  },
641
+ "search.journal_reputation.threshold": {
642
+ "category": "journal_quality_filter_parameters",
643
+ "description": "If enabled, journals with quality scores (scale from 1-10) below this threshold will be filtered out.",
644
+ "editable": true,
645
+ "max_value": 10,
646
+ "min_value": 1,
647
+ "name": "Journal Quality Threshold",
648
+ "options": null,
649
+ "step": 1,
650
+ "type": "SEARCH",
651
+ "ui_element": "range",
652
+ "value": 4,
653
+ "visible": true
654
+ },
655
+ "search.journal_reputation.max_context": {
656
+ "category": "journal_quality_filter_parameters",
657
+ "description": "Maximum number of characters to include in the prompt for journal quality checking.",
658
+ "editable": true,
659
+ "max_value": 1000000,
660
+ "min_value": 500,
661
+ "name": "Journal Quality Context Size",
662
+ "options": null,
663
+ "step": null,
664
+ "type": "SEARCH",
665
+ "ui_element": "number",
666
+ "value": 3000,
667
+ "visible": true
668
+ },
669
+ "search.journal_reputation.exclude_non_published": {
670
+ "category": "journal_quality_filter_parameters",
671
+ "description": "If true, quality filtering will exclude results that do not have a published journal reference.",
672
+ "editable": true,
673
+ "max_value": null,
674
+ "min_value": null,
675
+ "name": "Exclude Non-Published Results",
676
+ "options": null,
677
+ "step": null,
678
+ "type": "SEARCH",
679
+ "ui_element": "checkbox",
680
+ "value": false,
681
+ "visible": true
682
+ },
683
+ "search.journal_reputation.reanalysis_period": {
684
+ "category": "journal_quality_filter_parameters",
685
+ "description": "Period at which to re-check the quality of journals.",
686
+ "editable": true,
687
+ "max_value": null,
688
+ "min_value": null,
689
+ "name": "Quality Reanalysis Period",
690
+ "options": [
691
+ {
692
+ "label": "Yearly",
693
+ "value": "365"
694
+ },
695
+ {
696
+ "label": "Every 6 Months",
697
+ "value": "182"
698
+ },
699
+ {
700
+ "label": "Every Month",
701
+ "value": "30"
702
+ }
703
+ ],
704
+ "step": null,
705
+ "type": "SEARCH",
706
+ "ui_element": "select",
707
+ "value": "365",
708
+ "visible": true
709
+ },
627
710
  "search.snippets_only": {
628
711
  "category": "search_parameters",
629
712
  "description": "Only retrieve snippets instead of full search results",
@@ -778,6 +861,20 @@
778
861
  "value": "ArXivSearchEngine",
779
862
  "visible": true
780
863
  },
864
+ "search.engine.web.arxiv.journal_reputation.enabled": {
865
+ "category": "arxiv",
866
+ "description": "Enable journal quality filtering for this search engine.",
867
+ "editable": true,
868
+ "max_value": null,
869
+ "min_value": null,
870
+ "name": "Filter Low-Quality Journals",
871
+ "options": null,
872
+ "step": null,
873
+ "type": "SEARCH",
874
+ "ui_element": "checkbox",
875
+ "value": true,
876
+ "visible": true
877
+ },
781
878
  "search.engine.web.arxiv.default_params.max_results": {
782
879
  "category": "arxiv",
783
880
  "description": "Setting for arxiv.default_params.max_results",
@@ -3107,7 +3204,7 @@
3107
3204
  "step": 0.05,
3108
3205
  "type": "SEARCH",
3109
3206
  "ui_element": "range",
3110
- "value": 0.9,
3207
+ "value": 1.0,
3111
3208
  "visible": true
3112
3209
  },
3113
3210
  "search.engine.web.searxng.requires_api_key": {
@@ -3136,12 +3233,16 @@
3136
3233
  "type": "SEARCH",
3137
3234
  "ui_element": "text",
3138
3235
  "value": [
3139
- "privacy-focused",
3140
- "metasearch engine",
3141
- "self-hosted",
3142
- "no tracking",
3143
- "configurable",
3144
- "multiple engines in one"
3236
+ "comprehensive general information",
3237
+ "current events and news",
3238
+ "technical documentation",
3239
+ "factual queries",
3240
+ "historical information",
3241
+ "consumer products",
3242
+ "educational content",
3243
+ "multi-source aggregation",
3244
+ "real-time results",
3245
+ "combined results from major search engines"
3145
3246
  ],
3146
3247
  "visible": true
3147
3248
  },
@@ -1,8 +1,8 @@
1
1
  # src/local_deep_research/search_system/search_system.py
2
- import logging
3
2
  from typing import Callable, Dict
4
3
 
5
4
  from langchain_core.language_models import BaseChatModel
5
+ from loguru import logger
6
6
 
7
7
  from .advanced_search_system.findings.repository import FindingsRepository
8
8
  from .advanced_search_system.questions.standard_question import (
@@ -23,8 +23,6 @@ from .config.search_config import get_search
23
23
  from .utilities.db_utils import get_db_setting
24
24
  from .web_search_engines.search_engine_base import BaseSearchEngine
25
25
 
26
- logger = logging.getLogger(__name__)
27
-
28
26
 
29
27
  class AdvancedSearchSystem:
30
28
  """
@@ -38,6 +36,8 @@ class AdvancedSearchSystem:
38
36
  use_cross_engine_filter: bool = True,
39
37
  llm: BaseChatModel | None = None,
40
38
  search: BaseSearchEngine | None = None,
39
+ max_iterations: int | None = None,
40
+ questions_per_iteration: int | None = None,
41
41
  ):
42
42
  """Initialize the advanced search system.
43
43
 
@@ -49,6 +49,11 @@ class AdvancedSearchSystem:
49
49
  llm: LLM to use. If not provided, it will use the default one.
50
50
  search: Search engine to use. If not provided, it will use the
51
51
  default one.
52
+ max_iterations: The maximum number of search iterations to
53
+ perform. Will be read from the settings if not specified.
54
+ questions_per_iteration: The number of questions to include in
55
+ each iteration. Will be read from the settings if not specified.
56
+
52
57
  """
53
58
  # Get configuration
54
59
  self.model = llm
@@ -59,11 +64,14 @@ class AdvancedSearchSystem:
59
64
  self.search = get_search(llm_instance=self.model)
60
65
 
61
66
  # Get iterations setting
62
- self.max_iterations = get_db_setting("search.iterations", 1)
63
-
64
- self.questions_per_iteration = get_db_setting(
65
- "search.questions_per_iteration", 3
66
- )
67
+ self.max_iterations = max_iterations
68
+ if self.max_iterations is None:
69
+ self.max_iterations = get_db_setting("search.iterations", 1)
70
+ self.questions_per_iteration = questions_per_iteration
71
+ if self.questions_per_iteration is None:
72
+ self.questions_per_iteration = get_db_setting(
73
+ "search.questions_per_iteration", 3
74
+ )
67
75
 
68
76
  # Log the strategy name that's being used
69
77
  logger.info(
@@ -1,16 +1,13 @@
1
- import logging
2
1
  import os
3
2
  from functools import cache
4
3
  from typing import Any, Dict
5
4
 
5
+ from loguru import logger
6
6
  from sqlalchemy import create_engine
7
7
  from sqlalchemy.orm import Session, sessionmaker
8
8
 
9
9
  from ..web.services.settings_manager import SettingsManager
10
10
 
11
- logger = logging.getLogger(__name__)
12
-
13
-
14
11
  # Database path.
15
12
  DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "data"))
16
13
  DB_PATH = os.path.join(DATA_DIR, "ldr.db")
@@ -57,8 +54,8 @@ def get_db_setting(
57
54
 
58
55
  if value is not None:
59
56
  return value
60
- except Exception as e:
61
- logger.error(f"Error getting setting {key} from database: {e}")
57
+ except Exception:
58
+ logger.exception(f"Error getting setting {key} from database")
62
59
 
63
60
  logger.warning(f"Could not find setting '{key}' in the database.")
64
61
  return default_value
@@ -0,0 +1,441 @@
1
+ """
2
+ Elasticsearch utilities for indexing and managing documents.
3
+ """
4
+
5
+ import json
6
+ import logging
7
+ import os
8
+ from typing import Any, Dict, List, Optional, Union
9
+
10
+ from elasticsearch import Elasticsearch
11
+ from elasticsearch.helpers import bulk
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class ElasticsearchManager:
17
+ """
18
+ Utility class for managing Elasticsearch indices and documents.
19
+
20
+ This class provides methods for creating indices, indexing documents,
21
+ and performing other Elasticsearch management tasks.
22
+ """
23
+
24
+ def __init__(
25
+ self,
26
+ hosts: List[str] = ["http://localhost:9200"],
27
+ username: Optional[str] = None,
28
+ password: Optional[str] = None,
29
+ api_key: Optional[str] = None,
30
+ cloud_id: Optional[str] = None,
31
+ ):
32
+ """
33
+ Initialize the Elasticsearch manager.
34
+
35
+ Args:
36
+ hosts: List of Elasticsearch hosts
37
+ username: Optional username for authentication
38
+ password: Optional password for authentication
39
+ api_key: Optional API key for authentication
40
+ cloud_id: Optional Elastic Cloud ID
41
+ """
42
+ # Initialize the Elasticsearch client
43
+ es_args = {}
44
+
45
+ # Basic authentication
46
+ if username and password:
47
+ es_args["basic_auth"] = (username, password)
48
+
49
+ # API key authentication
50
+ if api_key:
51
+ es_args["api_key"] = api_key
52
+
53
+ # Cloud ID for Elastic Cloud
54
+ if cloud_id:
55
+ es_args["cloud_id"] = cloud_id
56
+
57
+ # Connect to Elasticsearch
58
+ self.client = Elasticsearch(hosts, **es_args)
59
+
60
+ # Verify connection
61
+ try:
62
+ info = self.client.info()
63
+ logger.info(f"Connected to Elasticsearch cluster: {info.get('cluster_name')}")
64
+ logger.info(f"Elasticsearch version: {info.get('version', {}).get('number')}")
65
+ except Exception as e:
66
+ logger.error(f"Failed to connect to Elasticsearch: {str(e)}")
67
+ raise ConnectionError(f"Could not connect to Elasticsearch: {str(e)}")
68
+
69
+ def create_index(
70
+ self,
71
+ index_name: str,
72
+ mappings: Optional[Dict[str, Any]] = None,
73
+ settings: Optional[Dict[str, Any]] = None,
74
+ ) -> bool:
75
+ """
76
+ Create an Elasticsearch index with optional mappings and settings.
77
+
78
+ Args:
79
+ index_name: Name of the index to create
80
+ mappings: Optional mappings for the index fields
81
+ settings: Optional settings for the index
82
+
83
+ Returns:
84
+ bool: True if successful, False otherwise
85
+ """
86
+ try:
87
+ # Check if index already exists
88
+ if self.client.indices.exists(index=index_name):
89
+ logger.warning(f"Index '{index_name}' already exists - skipping creation")
90
+ return True
91
+
92
+ # Default mappings for better text search if none provided
93
+ if mappings is None:
94
+ mappings = {
95
+ "properties": {
96
+ "title": {
97
+ "type": "text",
98
+ "analyzer": "standard",
99
+ "fields": {
100
+ "keyword": {
101
+ "type": "keyword",
102
+ "ignore_above": 256
103
+ }
104
+ }
105
+ },
106
+ "content": {
107
+ "type": "text",
108
+ "analyzer": "standard"
109
+ },
110
+ "url": {
111
+ "type": "keyword"
112
+ },
113
+ "source": {
114
+ "type": "keyword"
115
+ },
116
+ "timestamp": {
117
+ "type": "date"
118
+ },
119
+ "metadata": {
120
+ "type": "object",
121
+ "enabled": True
122
+ }
123
+ }
124
+ }
125
+
126
+ # Default settings if none provided
127
+ if settings is None:
128
+ settings = {
129
+ "number_of_shards": 1,
130
+ "number_of_replicas": 0,
131
+ "analysis": {
132
+ "analyzer": {
133
+ "standard": {
134
+ "type": "standard"
135
+ }
136
+ }
137
+ }
138
+ }
139
+
140
+ # Create the index with mappings and settings
141
+ create_response = self.client.indices.create(
142
+ index=index_name,
143
+ mappings=mappings,
144
+ settings=settings,
145
+ )
146
+
147
+ logger.info(f"Created index '{index_name}': {create_response}")
148
+ return True
149
+
150
+ except Exception as e:
151
+ logger.error(f"Error creating index '{index_name}': {str(e)}")
152
+ return False
153
+
154
+ def delete_index(self, index_name: str) -> bool:
155
+ """
156
+ Delete an Elasticsearch index.
157
+
158
+ Args:
159
+ index_name: Name of the index to delete
160
+
161
+ Returns:
162
+ bool: True if successful, False otherwise
163
+ """
164
+ try:
165
+ # Check if index exists
166
+ if not self.client.indices.exists(index=index_name):
167
+ logger.warning(f"Index '{index_name}' does not exist - skipping deletion")
168
+ return True
169
+
170
+ # Delete the index
171
+ delete_response = self.client.indices.delete(index=index_name)
172
+ logger.info(f"Deleted index '{index_name}': {delete_response}")
173
+ return True
174
+
175
+ except Exception as e:
176
+ logger.error(f"Error deleting index '{index_name}': {str(e)}")
177
+ return False
178
+
179
+ def index_document(
180
+ self,
181
+ index_name: str,
182
+ document: Dict[str, Any],
183
+ document_id: Optional[str] = None,
184
+ refresh: bool = False,
185
+ ) -> Optional[str]:
186
+ """
187
+ Index a single document in Elasticsearch.
188
+
189
+ Args:
190
+ index_name: Name of the index to add the document to
191
+ document: The document to index
192
+ document_id: Optional document ID (will be generated if not provided)
193
+ refresh: Whether to refresh the index after indexing
194
+
195
+ Returns:
196
+ str: Document ID if successful, None otherwise
197
+ """
198
+ try:
199
+ # Index the document
200
+ response = self.client.index(
201
+ index=index_name,
202
+ document=document,
203
+ id=document_id,
204
+ refresh=refresh,
205
+ )
206
+
207
+ logger.info(f"Indexed document in '{index_name}' with ID: {response['_id']}")
208
+ return response["_id"]
209
+
210
+ except Exception as e:
211
+ logger.error(f"Error indexing document in '{index_name}': {str(e)}")
212
+ return None
213
+
214
+ def bulk_index_documents(
215
+ self,
216
+ index_name: str,
217
+ documents: List[Dict[str, Any]],
218
+ id_field: Optional[str] = None,
219
+ refresh: bool = False,
220
+ ) -> int:
221
+ """
222
+ Bulk index multiple documents in Elasticsearch.
223
+
224
+ Args:
225
+ index_name: Name of the index to add the documents to
226
+ documents: List of documents to index
227
+ id_field: Optional field in the documents to use as the document ID
228
+ refresh: Whether to refresh the index after indexing
229
+
230
+ Returns:
231
+ int: Number of successfully indexed documents
232
+ """
233
+ try:
234
+ # Prepare the bulk actions
235
+ actions = []
236
+ for doc in documents:
237
+ action = {
238
+ "_index": index_name,
239
+ "_source": doc,
240
+ }
241
+
242
+ # Use the specified field as the document ID if provided
243
+ if id_field and id_field in doc:
244
+ action["_id"] = doc[id_field]
245
+
246
+ actions.append(action)
247
+
248
+ # Execute the bulk indexing
249
+ success, failed = bulk(
250
+ self.client,
251
+ actions,
252
+ refresh=refresh,
253
+ stats_only=True,
254
+ )
255
+
256
+ logger.info(f"Bulk indexed {success} documents in '{index_name}', failed: {failed}")
257
+ return success
258
+
259
+ except Exception as e:
260
+ logger.error(f"Error bulk indexing documents in '{index_name}': {str(e)}")
261
+ return 0
262
+
263
+ def index_file(
264
+ self,
265
+ index_name: str,
266
+ file_path: str,
267
+ content_field: str = "content",
268
+ title_field: Optional[str] = "title",
269
+ extract_metadata: bool = True,
270
+ refresh: bool = False,
271
+ ) -> Optional[str]:
272
+ """
273
+ Index a file in Elasticsearch, extracting text content and metadata.
274
+
275
+ Args:
276
+ index_name: Name of the index to add the document to
277
+ file_path: Path to the file to index
278
+ content_field: Field name to store the file content
279
+ title_field: Field name to store the file title (filename if not specified)
280
+ extract_metadata: Whether to extract file metadata
281
+ refresh: Whether to refresh the index after indexing
282
+
283
+ Returns:
284
+ str: Document ID if successful, None otherwise
285
+ """
286
+ try:
287
+ from langchain_community.document_loaders import UnstructuredFileLoader
288
+
289
+ # Extract file content and metadata
290
+ loader = UnstructuredFileLoader(file_path)
291
+ documents = loader.load()
292
+
293
+ # Combine all content from the documents
294
+ content = "\n\n".join([doc.page_content for doc in documents])
295
+
296
+ # Get the filename for the title
297
+ filename = os.path.basename(file_path)
298
+ title = filename
299
+
300
+ # Prepare the document
301
+ document = {
302
+ content_field: content,
303
+ }
304
+
305
+ # Add title if requested
306
+ if title_field:
307
+ document[title_field] = title
308
+
309
+ # Add metadata if requested
310
+ if extract_metadata and documents:
311
+ # Include metadata from the first document
312
+ document["metadata"] = documents[0].metadata
313
+
314
+ # Add file-specific metadata
315
+ document["source"] = file_path
316
+ document["file_extension"] = os.path.splitext(filename)[1].lstrip(".")
317
+ document["filename"] = filename
318
+
319
+ # Index the document
320
+ return self.index_document(index_name, document, refresh=refresh)
321
+
322
+ except ImportError:
323
+ logger.error("UnstructuredFileLoader not available. Please install the 'unstructured' package.")
324
+ return None
325
+ except Exception as e:
326
+ logger.error(f"Error indexing file '{file_path}': {str(e)}")
327
+ return None
328
+
329
+ def index_directory(
330
+ self,
331
+ index_name: str,
332
+ directory_path: str,
333
+ file_patterns: List[str] = ["*.txt", "*.pdf", "*.docx", "*.md"],
334
+ content_field: str = "content",
335
+ title_field: str = "title",
336
+ extract_metadata: bool = True,
337
+ refresh: bool = False,
338
+ ) -> int:
339
+ """
340
+ Index all matching files in a directory in Elasticsearch.
341
+
342
+ Args:
343
+ index_name: Name of the index to add the documents to
344
+ directory_path: Path to the directory containing files to index
345
+ file_patterns: List of file patterns to match (glob patterns)
346
+ content_field: Field name to store the file content
347
+ title_field: Field name to store the file title
348
+ extract_metadata: Whether to extract file metadata
349
+ refresh: Whether to refresh the index after indexing
350
+
351
+ Returns:
352
+ int: Number of successfully indexed files
353
+ """
354
+ try:
355
+ import glob
356
+
357
+ # Find all matching files
358
+ all_files = []
359
+ for pattern in file_patterns:
360
+ pattern_path = os.path.join(directory_path, pattern)
361
+ matching_files = glob.glob(pattern_path)
362
+ all_files.extend(matching_files)
363
+
364
+ logger.info(f"Found {len(all_files)} files matching patterns {file_patterns} in {directory_path}")
365
+
366
+ # Index each file
367
+ successful_count = 0
368
+ for file_path in all_files:
369
+ logger.info(f"Indexing file: {file_path}")
370
+ doc_id = self.index_file(
371
+ index_name=index_name,
372
+ file_path=file_path,
373
+ content_field=content_field,
374
+ title_field=title_field,
375
+ extract_metadata=extract_metadata,
376
+ refresh=refresh,
377
+ )
378
+
379
+ if doc_id:
380
+ successful_count += 1
381
+
382
+ logger.info(f"Successfully indexed {successful_count} files out of {len(all_files)}")
383
+ return successful_count
384
+
385
+ except Exception as e:
386
+ logger.error(f"Error indexing directory '{directory_path}': {str(e)}")
387
+ return 0
388
+
389
+ def search(
390
+ self,
391
+ index_name: str,
392
+ query: str,
393
+ fields: List[str] = ["content", "title"],
394
+ size: int = 10,
395
+ highlight: bool = True,
396
+ ) -> Dict[str, Any]:
397
+ """
398
+ Search for documents in Elasticsearch.
399
+
400
+ Args:
401
+ index_name: Name of the index to search
402
+ query: Search query
403
+ fields: Fields to search in
404
+ size: Maximum number of results to return
405
+ highlight: Whether to include highlighted excerpts in results
406
+
407
+ Returns:
408
+ Dict: Elasticsearch search response
409
+ """
410
+ try:
411
+ search_query = {
412
+ "query": {
413
+ "multi_match": {
414
+ "query": query,
415
+ "fields": fields,
416
+ "type": "best_fields",
417
+ "tie_breaker": 0.3,
418
+ }
419
+ },
420
+ "size": size,
421
+ }
422
+
423
+ # Add highlighting if requested
424
+ if highlight:
425
+ search_query["highlight"] = {
426
+ "fields": {field: {} for field in fields},
427
+ "pre_tags": ["<em>"],
428
+ "post_tags": ["</em>"],
429
+ }
430
+
431
+ # Execute the search
432
+ response = self.client.search(
433
+ index=index_name,
434
+ body=search_query,
435
+ )
436
+
437
+ return response
438
+
439
+ except Exception as e:
440
+ logger.error(f"Error searching index '{index_name}': {str(e)}")
441
+ return {"error": str(e)}