local-deep-research 0.3.12__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +1 -0
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/filters/base_filter.py +2 -3
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +4 -5
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +298 -0
- local_deep_research/advanced_search_system/findings/repository.py +0 -3
- local_deep_research/advanced_search_system/strategies/base_strategy.py +1 -2
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +14 -18
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +4 -8
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +5 -6
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -2
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +9 -7
- local_deep_research/api/benchmark_functions.py +288 -0
- local_deep_research/api/research_functions.py +8 -4
- local_deep_research/benchmarks/README.md +162 -0
- local_deep_research/benchmarks/__init__.py +51 -0
- local_deep_research/benchmarks/benchmark_functions.py +353 -0
- local_deep_research/benchmarks/cli/__init__.py +16 -0
- local_deep_research/benchmarks/cli/benchmark_commands.py +338 -0
- local_deep_research/benchmarks/cli.py +347 -0
- local_deep_research/benchmarks/comparison/__init__.py +12 -0
- local_deep_research/benchmarks/comparison/evaluator.py +768 -0
- local_deep_research/benchmarks/datasets/__init__.py +53 -0
- local_deep_research/benchmarks/datasets/base.py +295 -0
- local_deep_research/benchmarks/datasets/browsecomp.py +116 -0
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +98 -0
- local_deep_research/benchmarks/datasets/simpleqa.py +74 -0
- local_deep_research/benchmarks/datasets/utils.py +116 -0
- local_deep_research/benchmarks/datasets.py +31 -0
- local_deep_research/benchmarks/efficiency/__init__.py +14 -0
- local_deep_research/benchmarks/efficiency/resource_monitor.py +367 -0
- local_deep_research/benchmarks/efficiency/speed_profiler.py +214 -0
- local_deep_research/benchmarks/evaluators/__init__.py +18 -0
- local_deep_research/benchmarks/evaluators/base.py +74 -0
- local_deep_research/benchmarks/evaluators/browsecomp.py +83 -0
- local_deep_research/benchmarks/evaluators/composite.py +121 -0
- local_deep_research/benchmarks/evaluators/simpleqa.py +271 -0
- local_deep_research/benchmarks/graders.py +410 -0
- local_deep_research/benchmarks/metrics/README.md +80 -0
- local_deep_research/benchmarks/metrics/__init__.py +24 -0
- local_deep_research/benchmarks/metrics/calculation.py +385 -0
- local_deep_research/benchmarks/metrics/reporting.py +155 -0
- local_deep_research/benchmarks/metrics/visualization.py +205 -0
- local_deep_research/benchmarks/metrics.py +11 -0
- local_deep_research/benchmarks/optimization/__init__.py +32 -0
- local_deep_research/benchmarks/optimization/api.py +274 -0
- local_deep_research/benchmarks/optimization/metrics.py +20 -0
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +1163 -0
- local_deep_research/benchmarks/runners.py +434 -0
- local_deep_research/benchmarks/templates.py +65 -0
- local_deep_research/config/llm_config.py +26 -23
- local_deep_research/config/search_config.py +1 -5
- local_deep_research/defaults/default_settings.json +108 -7
- local_deep_research/search_system.py +16 -8
- local_deep_research/utilities/db_utils.py +3 -6
- local_deep_research/utilities/es_utils.py +441 -0
- local_deep_research/utilities/log_utils.py +36 -0
- local_deep_research/utilities/search_utilities.py +8 -9
- local_deep_research/web/app.py +15 -10
- local_deep_research/web/app_factory.py +9 -12
- local_deep_research/web/database/migrations.py +8 -5
- local_deep_research/web/database/models.py +20 -0
- local_deep_research/web/database/schema_upgrade.py +5 -8
- local_deep_research/web/models/database.py +15 -18
- local_deep_research/web/routes/benchmark_routes.py +427 -0
- local_deep_research/web/routes/research_routes.py +13 -17
- local_deep_research/web/routes/settings_routes.py +264 -67
- local_deep_research/web/services/research_service.py +58 -73
- local_deep_research/web/services/settings_manager.py +1 -4
- local_deep_research/web/services/settings_service.py +4 -6
- local_deep_research/web/static/css/styles.css +12 -0
- local_deep_research/web/static/js/components/logpanel.js +164 -155
- local_deep_research/web/static/js/components/research.js +44 -3
- local_deep_research/web/static/js/components/settings.js +27 -0
- local_deep_research/web/static/js/services/socket.js +47 -0
- local_deep_research/web_search_engines/default_search_engines.py +38 -0
- local_deep_research/web_search_engines/engines/meta_search_engine.py +100 -33
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +31 -17
- local_deep_research/web_search_engines/engines/search_engine_brave.py +8 -3
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +343 -0
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +14 -6
- local_deep_research/web_search_engines/engines/search_engine_local.py +19 -23
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +9 -12
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +12 -17
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +8 -4
- local_deep_research/web_search_engines/search_engine_base.py +22 -5
- local_deep_research/web_search_engines/search_engine_factory.py +30 -11
- local_deep_research/web_search_engines/search_engines_config.py +14 -1
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/METADATA +10 -2
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/RECORD +93 -51
- local_deep_research/app.py +0 -8
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/WHEEL +0 -0
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/licenses/LICENSE +0 -0
local_deep_research/defaults/default_settings.json

```diff
@@ -194,6 +194,20 @@
     "value": 30000,
     "visible": true
   },
+  "llm.context_window_size": {
+    "category": "llm_parameters",
+    "description": "Maximum context window size in tokens for the LLM",
+    "editable": true,
+    "max_value": 20000000.0,
+    "min_value": 512.0,
+    "name": "Context Window Size",
+    "options": null,
+    "step": null,
+    "type": "LLM",
+    "ui_element": "number",
+    "value": 128000,
+    "visible": true
+  },
   "llm.supports_max_tokens": {
     "category": "llm_parameters",
     "description": "Whether the LLM API supports the 'max_tokens' option.",
```
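The headline LLM change is this configurable context window. A minimal sketch of how a caller might read it, assuming the `get_db_setting` helper from `local_deep_research.utilities.db_utils` (the same fallback-default pattern the `search_system.py` hunks below use); the `fits_in_window` guard is illustrative only:

```python
from local_deep_research.utilities.db_utils import get_db_setting

# Read the configured window, falling back to the shipped default of
# 128000 tokens when no value is stored in the settings database.
context_window = int(get_db_setting("llm.context_window_size", 128000))


def fits_in_window(prompt_tokens: int) -> bool:
    # Hypothetical guard: callers could truncate history once a prompt
    # would exceed the configured window.
    return prompt_tokens <= context_window
```

The next hunk in the same file adds the journal-quality filter settings.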
```diff
@@ -624,6 +638,75 @@
     "value": false,
     "visible": true
   },
+  "search.journal_reputation.threshold": {
+    "category": "journal_quality_filter_parameters",
+    "description": "If enabled, journals with quality scores (scale from 1-10) below this threshold will be filtered out.",
+    "editable": true,
+    "max_value": 10,
+    "min_value": 1,
+    "name": "Journal Quality Threshold",
+    "options": null,
+    "step": 1,
+    "type": "SEARCH",
+    "ui_element": "range",
+    "value": 4,
+    "visible": true
+  },
+  "search.journal_reputation.max_context": {
+    "category": "journal_quality_filter_parameters",
+    "description": "Maximum number of characters to include in the prompt for journal quality checking.",
+    "editable": true,
+    "max_value": 1000000,
+    "min_value": 500,
+    "name": "Journal Quality Context Size",
+    "options": null,
+    "step": null,
+    "type": "SEARCH",
+    "ui_element": "number",
+    "value": 3000,
+    "visible": true
+  },
+  "search.journal_reputation.exclude_non_published": {
+    "category": "journal_quality_filter_parameters",
+    "description": "If true, quality filtering will exclude results that do not have a published journal reference.",
+    "editable": true,
+    "max_value": null,
+    "min_value": null,
+    "name": "Exclude Non-Published Results",
+    "options": null,
+    "step": null,
+    "type": "SEARCH",
+    "ui_element": "checkbox",
+    "value": false,
+    "visible": true
+  },
+  "search.journal_reputation.reanalysis_period": {
+    "category": "journal_quality_filter_parameters",
+    "description": "Period at which to re-check the quality of journals.",
+    "editable": true,
+    "max_value": null,
+    "min_value": null,
+    "name": "Quality Reanalysis Period",
+    "options": [
+      {
+        "label": "Yearly",
+        "value": "365"
+      },
+      {
+        "label": "Every 6 Months",
+        "value": "182"
+      },
+      {
+        "label": "Every Month",
+        "value": "30"
+      }
+    ],
+    "step": null,
+    "type": "SEARCH",
+    "ui_element": "select",
+    "value": "265",
+    "visible": true
+  },
   "search.snippets_only": {
     "category": "search_parameters",
     "description": "Only retrieve snippets instead of full search results",
```
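These four `search.journal_reputation.*` keys parameterize the new filter in `advanced_search_system/filters/journal_reputation_filter.py` (+298 lines above). Note that the shipped `reanalysis_period` default of `"265"` matches none of its own options (`"365"`, `"182"`, `"30"`) and looks like a typo for `"365"`. A hypothetical sketch of the threshold check these settings imply; `score_journal` and the `result` dict shape are illustrative, not the filter's actual API:

```python
from local_deep_research.utilities.db_utils import get_db_setting

# Journal scores are on a 1-10 scale; the shipped threshold is 4.
THRESHOLD = int(get_db_setting("search.journal_reputation.threshold", 4))
EXCLUDE_UNPUBLISHED = bool(
    get_db_setting("search.journal_reputation.exclude_non_published", False)
)


def keep_result(result: dict, score_journal) -> bool:
    journal = result.get("journal")
    if journal is None:
        # Result has no published journal reference.
        return not EXCLUDE_UNPUBLISHED
    return score_journal(journal) >= THRESHOLD
```

The remaining hunks in `default_settings.json` enable the filter for the arXiv engine by default, raise a SearXNG score from 0 to 1.0, and fill in the previously blank SearXNG strengths list.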
```diff
@@ -778,6 +861,20 @@
     "value": "ArXivSearchEngine",
     "visible": true
   },
+  "search.engine.web.arxiv.journal_reputation.enabled": {
+    "category": "arxiv",
+    "description": "Enable journal quality filtering for this search engine.",
+    "editable": true,
+    "max_value": null,
+    "min_value": null,
+    "name": "Filter Low-Quality Journals",
+    "options": null,
+    "step": null,
+    "type": "SEARCH",
+    "ui_element": "checkbox",
+    "value": true,
+    "visible": true
+  },
   "search.engine.web.arxiv.default_params.max_results": {
     "category": "arxiv",
     "description": "Setting for arxiv.default_params.max_results",
```
```diff
@@ -3107,7 +3204,7 @@
     "step": 0.05,
     "type": "SEARCH",
     "ui_element": "range",
-    "value": 0,
+    "value": 1.0,
     "visible": true
   },
   "search.engine.web.searxng.requires_api_key": {
```
```diff
@@ -3136,12 +3233,16 @@
     "type": "SEARCH",
     "ui_element": "text",
     "value": [
-      "
-      "
-      "
-      "
-      "
-      "
+      "comprehensive general information",
+      "current events and news",
+      "technical documentation",
+      "factual queries",
+      "historical information",
+      "consumer products",
+      "educational content",
+      "multi-source aggregation",
+      "real-time results",
+      "combined results from major search engines"
     ],
     "visible": true
   },
```
local_deep_research/search_system.py

```diff
@@ -1,8 +1,8 @@
 # src/local_deep_research/search_system/search_system.py
-import logging
 from typing import Callable, Dict
 
 from langchain_core.language_models import BaseChatModel
+from loguru import logger
 
 from .advanced_search_system.findings.repository import FindingsRepository
 from .advanced_search_system.questions.standard_question import (
@@ -23,8 +23,6 @@ from .config.search_config import get_search
 from .utilities.db_utils import get_db_setting
 from .web_search_engines.search_engine_base import BaseSearchEngine
 
-logger = logging.getLogger(__name__)
-
 
 class AdvancedSearchSystem:
     """
@@ -38,6 +36,8 @@ class AdvancedSearchSystem:
         use_cross_engine_filter: bool = True,
         llm: BaseChatModel | None = None,
         search: BaseSearchEngine | None = None,
+        max_iterations: int | None = None,
+        questions_per_iteration: int | None = None,
     ):
         """Initialize the advanced search system.
 
@@ -49,6 +49,11 @@ class AdvancedSearchSystem:
             llm: LLM to use. If not provided, it will use the default one.
             search: Search engine to use. If not provided, it will use the
                 default one.
+            max_iterations: The maximum number of search iterations to
+                perform. Will be read from the settings if not specified.
+            questions_per_iteration: The number of questions to include in
+                each iteration. Will be read from the settings if not specified.
+
         """
         # Get configuration
         self.model = llm
@@ -59,11 +64,14 @@ class AdvancedSearchSystem:
             self.search = get_search(llm_instance=self.model)
 
         # Get iterations setting
-        self.max_iterations =
-
-
-
-
+        self.max_iterations = max_iterations
+        if self.max_iterations is None:
+            self.max_iterations = get_db_setting("search.iterations", 1)
+        self.questions_per_iteration = questions_per_iteration
+        if self.questions_per_iteration is None:
+            self.questions_per_iteration = get_db_setting(
+                "search.questions_per_iteration", 3
+            )
 
         # Log the strategy name that's being used
         logger.info(
```
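With the two new keyword arguments, callers can pin iteration behavior per instance instead of relying on stored settings. A short usage sketch, assuming the constructor's remaining parameters keep their defaults:

```python
from local_deep_research.search_system import AdvancedSearchSystem

# Explicit values take precedence over the database-backed settings
# "search.iterations" and "search.questions_per_iteration".
system = AdvancedSearchSystem(max_iterations=2, questions_per_iteration=5)

# Omitting them keeps the old behavior: both values are read from the
# settings database via get_db_setting(), with defaults of 1 and 3.
default_system = AdvancedSearchSystem()
```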
local_deep_research/utilities/db_utils.py

```diff
@@ -1,16 +1,13 @@
-import logging
 import os
 from functools import cache
 from typing import Any, Dict
 
+from loguru import logger
 from sqlalchemy import create_engine
 from sqlalchemy.orm import Session, sessionmaker
 
 from ..web.services.settings_manager import SettingsManager
 
-logger = logging.getLogger(__name__)
-
-
 # Database path.
 DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "data"))
 DB_PATH = os.path.join(DATA_DIR, "ldr.db")
@@ -57,8 +54,8 @@ def get_db_setting(
 
         if value is not None:
             return value
-    except Exception
-        logger.
+    except Exception:
+        logger.exception(f"Error getting setting {key} from database")
 
     logger.warning(f"Could not find setting '{key}' in the database.")
     return default_value
```
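The same Loguru migration happens here, and the failure path now calls `logger.exception`, which records the active traceback alongside the message. A standalone sketch of that pattern, with a stand-in for the failing database lookup:

```python
from loguru import logger


def read_setting(key: str, default=None):
    try:
        raise KeyError(key)  # stand-in for a failing settings lookup
    except Exception:
        # logger.exception() logs at ERROR level and appends the current
        # traceback, so the bare "except Exception:" loses no detail.
        logger.exception(f"Error getting setting {key} from database")
    return default
```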
local_deep_research/utilities/es_utils.py (new file)

```diff
@@ -0,0 +1,441 @@
+"""
+Elasticsearch utilities for indexing and managing documents.
+"""
+
+import json
+import logging
+import os
+from typing import Any, Dict, List, Optional, Union
+
+from elasticsearch import Elasticsearch
+from elasticsearch.helpers import bulk
+
+logger = logging.getLogger(__name__)
+
+
+class ElasticsearchManager:
+    """
+    Utility class for managing Elasticsearch indices and documents.
+
+    This class provides methods for creating indices, indexing documents,
+    and performing other Elasticsearch management tasks.
+    """
+
+    def __init__(
+        self,
+        hosts: List[str] = ["http://localhost:9200"],
+        username: Optional[str] = None,
+        password: Optional[str] = None,
+        api_key: Optional[str] = None,
+        cloud_id: Optional[str] = None,
+    ):
+        """
+        Initialize the Elasticsearch manager.
+
+        Args:
+            hosts: List of Elasticsearch hosts
+            username: Optional username for authentication
+            password: Optional password for authentication
+            api_key: Optional API key for authentication
+            cloud_id: Optional Elastic Cloud ID
+        """
+        # Initialize the Elasticsearch client
+        es_args = {}
+
+        # Basic authentication
+        if username and password:
+            es_args["basic_auth"] = (username, password)
+
+        # API key authentication
+        if api_key:
+            es_args["api_key"] = api_key
+
+        # Cloud ID for Elastic Cloud
+        if cloud_id:
+            es_args["cloud_id"] = cloud_id
+
+        # Connect to Elasticsearch
+        self.client = Elasticsearch(hosts, **es_args)
+
+        # Verify connection
+        try:
+            info = self.client.info()
+            logger.info(f"Connected to Elasticsearch cluster: {info.get('cluster_name')}")
+            logger.info(f"Elasticsearch version: {info.get('version', {}).get('number')}")
+        except Exception as e:
+            logger.error(f"Failed to connect to Elasticsearch: {str(e)}")
+            raise ConnectionError(f"Could not connect to Elasticsearch: {str(e)}")
+
+    def create_index(
+        self,
+        index_name: str,
+        mappings: Optional[Dict[str, Any]] = None,
+        settings: Optional[Dict[str, Any]] = None,
+    ) -> bool:
+        """
+        Create an Elasticsearch index with optional mappings and settings.
+
+        Args:
+            index_name: Name of the index to create
+            mappings: Optional mappings for the index fields
+            settings: Optional settings for the index
+
+        Returns:
+            bool: True if successful, False otherwise
+        """
+        try:
+            # Check if index already exists
+            if self.client.indices.exists(index=index_name):
+                logger.warning(f"Index '{index_name}' already exists - skipping creation")
+                return True
+
+            # Default mappings for better text search if none provided
+            if mappings is None:
+                mappings = {
+                    "properties": {
+                        "title": {
+                            "type": "text",
+                            "analyzer": "standard",
+                            "fields": {
+                                "keyword": {
+                                    "type": "keyword",
+                                    "ignore_above": 256
+                                }
+                            }
+                        },
+                        "content": {
+                            "type": "text",
+                            "analyzer": "standard"
+                        },
+                        "url": {
+                            "type": "keyword"
+                        },
+                        "source": {
+                            "type": "keyword"
+                        },
+                        "timestamp": {
+                            "type": "date"
+                        },
+                        "metadata": {
+                            "type": "object",
+                            "enabled": True
+                        }
+                    }
+                }
+
+            # Default settings if none provided
+            if settings is None:
+                settings = {
+                    "number_of_shards": 1,
+                    "number_of_replicas": 0,
+                    "analysis": {
+                        "analyzer": {
+                            "standard": {
+                                "type": "standard"
+                            }
+                        }
+                    }
+                }
+
+            # Create the index with mappings and settings
+            create_response = self.client.indices.create(
+                index=index_name,
+                mappings=mappings,
+                settings=settings,
+            )
+
+            logger.info(f"Created index '{index_name}': {create_response}")
+            return True
+
+        except Exception as e:
+            logger.error(f"Error creating index '{index_name}': {str(e)}")
+            return False
+
+    def delete_index(self, index_name: str) -> bool:
+        """
+        Delete an Elasticsearch index.
+
+        Args:
+            index_name: Name of the index to delete
+
+        Returns:
+            bool: True if successful, False otherwise
+        """
+        try:
+            # Check if index exists
+            if not self.client.indices.exists(index=index_name):
+                logger.warning(f"Index '{index_name}' does not exist - skipping deletion")
+                return True
+
+            # Delete the index
+            delete_response = self.client.indices.delete(index=index_name)
+            logger.info(f"Deleted index '{index_name}': {delete_response}")
+            return True
+
+        except Exception as e:
+            logger.error(f"Error deleting index '{index_name}': {str(e)}")
+            return False
+
+    def index_document(
+        self,
+        index_name: str,
+        document: Dict[str, Any],
+        document_id: Optional[str] = None,
+        refresh: bool = False,
+    ) -> Optional[str]:
+        """
+        Index a single document in Elasticsearch.
+
+        Args:
+            index_name: Name of the index to add the document to
+            document: The document to index
+            document_id: Optional document ID (will be generated if not provided)
+            refresh: Whether to refresh the index after indexing
+
+        Returns:
+            str: Document ID if successful, None otherwise
+        """
+        try:
+            # Index the document
+            response = self.client.index(
+                index=index_name,
+                document=document,
+                id=document_id,
+                refresh=refresh,
+            )
+
+            logger.info(f"Indexed document in '{index_name}' with ID: {response['_id']}")
+            return response["_id"]
+
+        except Exception as e:
+            logger.error(f"Error indexing document in '{index_name}': {str(e)}")
+            return None
+
+    def bulk_index_documents(
+        self,
+        index_name: str,
+        documents: List[Dict[str, Any]],
+        id_field: Optional[str] = None,
+        refresh: bool = False,
+    ) -> int:
+        """
+        Bulk index multiple documents in Elasticsearch.
+
+        Args:
+            index_name: Name of the index to add the documents to
+            documents: List of documents to index
+            id_field: Optional field in the documents to use as the document ID
+            refresh: Whether to refresh the index after indexing
+
+        Returns:
+            int: Number of successfully indexed documents
+        """
+        try:
+            # Prepare the bulk actions
+            actions = []
+            for doc in documents:
+                action = {
+                    "_index": index_name,
+                    "_source": doc,
+                }
+
+                # Use the specified field as the document ID if provided
+                if id_field and id_field in doc:
+                    action["_id"] = doc[id_field]
+
+                actions.append(action)
+
+            # Execute the bulk indexing
+            success, failed = bulk(
+                self.client,
+                actions,
+                refresh=refresh,
+                stats_only=True,
+            )
+
+            logger.info(f"Bulk indexed {success} documents in '{index_name}', failed: {failed}")
+            return success
+
+        except Exception as e:
+            logger.error(f"Error bulk indexing documents in '{index_name}': {str(e)}")
+            return 0
+
+    def index_file(
+        self,
+        index_name: str,
+        file_path: str,
+        content_field: str = "content",
+        title_field: Optional[str] = "title",
+        extract_metadata: bool = True,
+        refresh: bool = False,
+    ) -> Optional[str]:
+        """
+        Index a file in Elasticsearch, extracting text content and metadata.
+
+        Args:
+            index_name: Name of the index to add the document to
+            file_path: Path to the file to index
+            content_field: Field name to store the file content
+            title_field: Field name to store the file title (filename if not specified)
+            extract_metadata: Whether to extract file metadata
+            refresh: Whether to refresh the index after indexing
+
+        Returns:
+            str: Document ID if successful, None otherwise
+        """
+        try:
+            from langchain_community.document_loaders import UnstructuredFileLoader
+
+            # Extract file content and metadata
+            loader = UnstructuredFileLoader(file_path)
+            documents = loader.load()
+
+            # Combine all content from the documents
+            content = "\n\n".join([doc.page_content for doc in documents])
+
+            # Get the filename for the title
+            filename = os.path.basename(file_path)
+            title = filename
+
+            # Prepare the document
+            document = {
+                content_field: content,
+            }
+
+            # Add title if requested
+            if title_field:
+                document[title_field] = title
+
+            # Add metadata if requested
+            if extract_metadata and documents:
+                # Include metadata from the first document
+                document["metadata"] = documents[0].metadata
+
+                # Add file-specific metadata
+                document["source"] = file_path
+                document["file_extension"] = os.path.splitext(filename)[1].lstrip(".")
+                document["filename"] = filename
+
+            # Index the document
+            return self.index_document(index_name, document, refresh=refresh)
+
+        except ImportError:
+            logger.error("UnstructuredFileLoader not available. Please install the 'unstructured' package.")
+            return None
+        except Exception as e:
+            logger.error(f"Error indexing file '{file_path}': {str(e)}")
+            return None
+
+    def index_directory(
+        self,
+        index_name: str,
+        directory_path: str,
+        file_patterns: List[str] = ["*.txt", "*.pdf", "*.docx", "*.md"],
+        content_field: str = "content",
+        title_field: str = "title",
+        extract_metadata: bool = True,
+        refresh: bool = False,
+    ) -> int:
+        """
+        Index all matching files in a directory in Elasticsearch.
+
+        Args:
+            index_name: Name of the index to add the documents to
+            directory_path: Path to the directory containing files to index
+            file_patterns: List of file patterns to match (glob patterns)
+            content_field: Field name to store the file content
+            title_field: Field name to store the file title
+            extract_metadata: Whether to extract file metadata
+            refresh: Whether to refresh the index after indexing
+
+        Returns:
+            int: Number of successfully indexed files
+        """
+        try:
+            import glob
+
+            # Find all matching files
+            all_files = []
+            for pattern in file_patterns:
+                pattern_path = os.path.join(directory_path, pattern)
+                matching_files = glob.glob(pattern_path)
+                all_files.extend(matching_files)
+
+            logger.info(f"Found {len(all_files)} files matching patterns {file_patterns} in {directory_path}")
+
+            # Index each file
+            successful_count = 0
+            for file_path in all_files:
+                logger.info(f"Indexing file: {file_path}")
+                doc_id = self.index_file(
+                    index_name=index_name,
+                    file_path=file_path,
+                    content_field=content_field,
+                    title_field=title_field,
+                    extract_metadata=extract_metadata,
+                    refresh=refresh,
+                )
+
+                if doc_id:
+                    successful_count += 1
+
+            logger.info(f"Successfully indexed {successful_count} files out of {len(all_files)}")
+            return successful_count
+
+        except Exception as e:
+            logger.error(f"Error indexing directory '{directory_path}': {str(e)}")
+            return 0
+
+    def search(
+        self,
+        index_name: str,
+        query: str,
+        fields: List[str] = ["content", "title"],
+        size: int = 10,
+        highlight: bool = True,
+    ) -> Dict[str, Any]:
+        """
+        Search for documents in Elasticsearch.
+
+        Args:
+            index_name: Name of the index to search
+            query: Search query
+            fields: Fields to search in
+            size: Maximum number of results to return
+            highlight: Whether to include highlighted excerpts in results
+
+        Returns:
+            Dict: Elasticsearch search response
+        """
+        try:
+            search_query = {
+                "query": {
+                    "multi_match": {
+                        "query": query,
+                        "fields": fields,
+                        "type": "best_fields",
+                        "tie_breaker": 0.3,
+                    }
+                },
+                "size": size,
+            }
+
+            # Add highlighting if requested
+            if highlight:
+                search_query["highlight"] = {
+                    "fields": {field: {} for field in fields},
+                    "pre_tags": ["<em>"],
+                    "post_tags": ["</em>"],
+                }
+
+            # Execute the search
+            response = self.client.search(
+                index=index_name,
+                body=search_query,
+            )
+
+            return response
+
+        except Exception as e:
+            logger.error(f"Error searching index '{index_name}': {str(e)}")
+            return {"error": str(e)}
```
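Taken together with the new `search_engine_elasticsearch.py` engine, `ElasticsearchManager` covers index creation, document and file ingestion, and querying. A usage sketch against a local node (the index name and documents are illustrative; it assumes the `elasticsearch` package is installed and a cluster is reachable at localhost:9200):

```python
from local_deep_research.utilities.es_utils import ElasticsearchManager

manager = ElasticsearchManager(hosts=["http://localhost:9200"])

# Create an index with the built-in default mappings
# (title/content/url/source/timestamp/metadata).
manager.create_index("research_docs")

# Bulk index documents, reusing each document's "url" as its ID so
# re-indexing the same URL overwrites rather than duplicates.
docs = [
    {"title": "Paper A", "content": "Notes on topic A.", "url": "https://example.org/a"},
    {"title": "Paper B", "content": "Notes on topic B.", "url": "https://example.org/b"},
]
manager.bulk_index_documents("research_docs", docs, id_field="url", refresh=True)

# Multi-field search; highlighting is on by default.
response = manager.search("research_docs", "notes on topic", size=5)
for hit in response["hits"]["hits"]:
    print(hit["_source"]["title"], hit["_score"])
```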