local-deep-research 0.3.12__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. local_deep_research/__version__.py +1 -1
  2. local_deep_research/advanced_search_system/filters/base_filter.py +2 -3
  3. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +4 -5
  4. local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +298 -0
  5. local_deep_research/advanced_search_system/findings/repository.py +0 -3
  6. local_deep_research/advanced_search_system/strategies/base_strategy.py +1 -2
  7. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +14 -18
  8. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +4 -8
  9. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +5 -6
  10. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -2
  11. local_deep_research/advanced_search_system/strategies/standard_strategy.py +9 -7
  12. local_deep_research/api/benchmark_functions.py +288 -0
  13. local_deep_research/api/research_functions.py +8 -4
  14. local_deep_research/benchmarks/README.md +162 -0
  15. local_deep_research/benchmarks/__init__.py +51 -0
  16. local_deep_research/benchmarks/benchmark_functions.py +353 -0
  17. local_deep_research/benchmarks/cli/__init__.py +16 -0
  18. local_deep_research/benchmarks/cli/benchmark_commands.py +338 -0
  19. local_deep_research/benchmarks/cli.py +347 -0
  20. local_deep_research/benchmarks/comparison/__init__.py +12 -0
  21. local_deep_research/benchmarks/comparison/evaluator.py +768 -0
  22. local_deep_research/benchmarks/datasets/__init__.py +53 -0
  23. local_deep_research/benchmarks/datasets/base.py +295 -0
  24. local_deep_research/benchmarks/datasets/browsecomp.py +116 -0
  25. local_deep_research/benchmarks/datasets/custom_dataset_template.py +98 -0
  26. local_deep_research/benchmarks/datasets/simpleqa.py +74 -0
  27. local_deep_research/benchmarks/datasets/utils.py +116 -0
  28. local_deep_research/benchmarks/datasets.py +31 -0
  29. local_deep_research/benchmarks/efficiency/__init__.py +14 -0
  30. local_deep_research/benchmarks/efficiency/resource_monitor.py +367 -0
  31. local_deep_research/benchmarks/efficiency/speed_profiler.py +214 -0
  32. local_deep_research/benchmarks/evaluators/__init__.py +18 -0
  33. local_deep_research/benchmarks/evaluators/base.py +74 -0
  34. local_deep_research/benchmarks/evaluators/browsecomp.py +83 -0
  35. local_deep_research/benchmarks/evaluators/composite.py +121 -0
  36. local_deep_research/benchmarks/evaluators/simpleqa.py +271 -0
  37. local_deep_research/benchmarks/graders.py +410 -0
  38. local_deep_research/benchmarks/metrics/README.md +80 -0
  39. local_deep_research/benchmarks/metrics/__init__.py +24 -0
  40. local_deep_research/benchmarks/metrics/calculation.py +385 -0
  41. local_deep_research/benchmarks/metrics/reporting.py +155 -0
  42. local_deep_research/benchmarks/metrics/visualization.py +205 -0
  43. local_deep_research/benchmarks/metrics.py +11 -0
  44. local_deep_research/benchmarks/optimization/__init__.py +32 -0
  45. local_deep_research/benchmarks/optimization/api.py +274 -0
  46. local_deep_research/benchmarks/optimization/metrics.py +20 -0
  47. local_deep_research/benchmarks/optimization/optuna_optimizer.py +1163 -0
  48. local_deep_research/benchmarks/runners.py +434 -0
  49. local_deep_research/benchmarks/templates.py +65 -0
  50. local_deep_research/config/llm_config.py +26 -23
  51. local_deep_research/config/search_config.py +1 -5
  52. local_deep_research/defaults/default_settings.json +108 -7
  53. local_deep_research/search_system.py +16 -8
  54. local_deep_research/utilities/db_utils.py +3 -6
  55. local_deep_research/utilities/es_utils.py +441 -0
  56. local_deep_research/utilities/log_utils.py +36 -0
  57. local_deep_research/utilities/search_utilities.py +8 -9
  58. local_deep_research/web/app.py +7 -9
  59. local_deep_research/web/app_factory.py +9 -12
  60. local_deep_research/web/database/migrations.py +8 -5
  61. local_deep_research/web/database/models.py +20 -0
  62. local_deep_research/web/database/schema_upgrade.py +5 -8
  63. local_deep_research/web/models/database.py +15 -18
  64. local_deep_research/web/routes/benchmark_routes.py +427 -0
  65. local_deep_research/web/routes/research_routes.py +13 -17
  66. local_deep_research/web/routes/settings_routes.py +264 -67
  67. local_deep_research/web/services/research_service.py +47 -57
  68. local_deep_research/web/services/settings_manager.py +1 -4
  69. local_deep_research/web/services/settings_service.py +4 -6
  70. local_deep_research/web/static/css/styles.css +12 -0
  71. local_deep_research/web/static/js/components/logpanel.js +164 -155
  72. local_deep_research/web/static/js/components/research.js +44 -3
  73. local_deep_research/web/static/js/components/settings.js +27 -0
  74. local_deep_research/web/static/js/services/socket.js +47 -0
  75. local_deep_research/web_search_engines/default_search_engines.py +38 -0
  76. local_deep_research/web_search_engines/engines/meta_search_engine.py +100 -33
  77. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +31 -17
  78. local_deep_research/web_search_engines/engines/search_engine_brave.py +8 -3
  79. local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +343 -0
  80. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +14 -6
  81. local_deep_research/web_search_engines/engines/search_engine_local.py +19 -23
  82. local_deep_research/web_search_engines/engines/search_engine_local_all.py +9 -12
  83. local_deep_research/web_search_engines/engines/search_engine_searxng.py +12 -17
  84. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +8 -4
  85. local_deep_research/web_search_engines/search_engine_base.py +22 -5
  86. local_deep_research/web_search_engines/search_engine_factory.py +32 -11
  87. local_deep_research/web_search_engines/search_engines_config.py +14 -1
  88. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/METADATA +10 -2
  89. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/RECORD +92 -49
  90. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/WHEEL +0 -0
  91. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/entry_points.txt +0 -0
  92. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -194,6 +194,20 @@
194
194
  "value": 30000,
195
195
  "visible": true
196
196
  },
197
+ "llm.context_window_size": {
198
+ "category": "llm_parameters",
199
+ "description": "Maximum context window size in tokens for the LLM",
200
+ "editable": true,
201
+ "max_value": 20000000.0,
202
+ "min_value": 512.0,
203
+ "name": "Context Window Size",
204
+ "options": null,
205
+ "step": null,
206
+ "type": "LLM",
207
+ "ui_element": "number",
208
+ "value": 128000,
209
+ "visible": true
210
+ },
197
211
  "llm.supports_max_tokens": {
198
212
  "category": "llm_parameters",
199
213
  "description": "Whether the LLM API supports the 'max_tokens' option.",
@@ -624,6 +638,75 @@
624
638
  "value": false,
625
639
  "visible": true
626
640
  },
641
+ "search.journal_reputation.threshold": {
642
+ "category": "journal_quality_filter_parameters",
643
+ "description": "If enabled, journals with quality scores (scale from 1-10) below this threshold will be filtered out.",
644
+ "editable": true,
645
+ "max_value": 10,
646
+ "min_value": 1,
647
+ "name": "Journal Quality Threshold",
648
+ "options": null,
649
+ "step": 1,
650
+ "type": "SEARCH",
651
+ "ui_element": "range",
652
+ "value": 4,
653
+ "visible": true
654
+ },
655
+ "search.journal_reputation.max_context": {
656
+ "category": "journal_quality_filter_parameters",
657
+ "description": "Maximum number of characters to include in the prompt for journal quality checking.",
658
+ "editable": true,
659
+ "max_value": 1000000,
660
+ "min_value": 500,
661
+ "name": "Journal Quality Context Size",
662
+ "options": null,
663
+ "step": null,
664
+ "type": "SEARCH",
665
+ "ui_element": "number",
666
+ "value": 3000,
667
+ "visible": true
668
+ },
669
+ "search.journal_reputation.exclude_non_published": {
670
+ "category": "journal_quality_filter_parameters",
671
+ "description": "If true, quality filtering will exclude results that do not have a published journal reference.",
672
+ "editable": true,
673
+ "max_value": null,
674
+ "min_value": null,
675
+ "name": "Exclude Non-Published Results",
676
+ "options": null,
677
+ "step": null,
678
+ "type": "SEARCH",
679
+ "ui_element": "checkbox",
680
+ "value": false,
681
+ "visible": true
682
+ },
683
+ "search.journal_reputation.reanalysis_period": {
684
+ "category": "journal_quality_filter_parameters",
685
+ "description": "Period at which to re-check the quality of journals.",
686
+ "editable": true,
687
+ "max_value": null,
688
+ "min_value": null,
689
+ "name": "Quality Reanalysis Period",
690
+ "options": [
691
+ {
692
+ "label": "Yearly",
693
+ "value": "365"
694
+ },
695
+ {
696
+ "label": "Every 6 Months",
697
+ "value": "182"
698
+ },
699
+ {
700
+ "label": "Every Month",
701
+ "value": "30"
702
+ }
703
+ ],
704
+ "step": null,
705
+ "type": "SEARCH",
706
+ "ui_element": "select",
707
+ "value": "365",
708
+ "visible": true
709
+ },
627
710
  "search.snippets_only": {
628
711
  "category": "search_parameters",
629
712
  "description": "Only retrieve snippets instead of full search results",
@@ -778,6 +861,20 @@
778
861
  "value": "ArXivSearchEngine",
779
862
  "visible": true
780
863
  },
864
+ "search.engine.web.arxiv.journal_reputation.enabled": {
865
+ "category": "arxiv",
866
+ "description": "Enable journal quality filtering for this search engine.",
867
+ "editable": true,
868
+ "max_value": null,
869
+ "min_value": null,
870
+ "name": "Filter Low-Quality Journals",
871
+ "options": null,
872
+ "step": null,
873
+ "type": "SEARCH",
874
+ "ui_element": "checkbox",
875
+ "value": true,
876
+ "visible": true
877
+ },
781
878
  "search.engine.web.arxiv.default_params.max_results": {
782
879
  "category": "arxiv",
783
880
  "description": "Setting for arxiv.default_params.max_results",
@@ -3107,7 +3204,7 @@
3107
3204
  "step": 0.05,
3108
3205
  "type": "SEARCH",
3109
3206
  "ui_element": "range",
3110
- "value": 0.9,
3207
+ "value": 1.0,
3111
3208
  "visible": true
3112
3209
  },
3113
3210
  "search.engine.web.searxng.requires_api_key": {
@@ -3136,12 +3233,16 @@
3136
3233
  "type": "SEARCH",
3137
3234
  "ui_element": "text",
3138
3235
  "value": [
3139
- "privacy-focused",
3140
- "metasearch engine",
3141
- "self-hosted",
3142
- "no tracking",
3143
- "configurable",
3144
- "multiple engines in one"
3236
+ "comprehensive general information",
3237
+ "current events and news",
3238
+ "technical documentation",
3239
+ "factual queries",
3240
+ "historical information",
3241
+ "consumer products",
3242
+ "educational content",
3243
+ "multi-source aggregation",
3244
+ "real-time results",
3245
+ "combined results from major search engines"
3145
3246
  ],
3146
3247
  "visible": true
3147
3248
  },
@@ -1,8 +1,8 @@
1
1
  # src/local_deep_research/search_system/search_system.py
2
- import logging
3
2
  from typing import Callable, Dict
4
3
 
5
4
  from langchain_core.language_models import BaseChatModel
5
+ from loguru import logger
6
6
 
7
7
  from .advanced_search_system.findings.repository import FindingsRepository
8
8
  from .advanced_search_system.questions.standard_question import (
@@ -23,8 +23,6 @@ from .config.search_config import get_search
23
23
  from .utilities.db_utils import get_db_setting
24
24
  from .web_search_engines.search_engine_base import BaseSearchEngine
25
25
 
26
- logger = logging.getLogger(__name__)
27
-
28
26
 
29
27
  class AdvancedSearchSystem:
30
28
  """
@@ -38,6 +36,8 @@ class AdvancedSearchSystem:
38
36
  use_cross_engine_filter: bool = True,
39
37
  llm: BaseChatModel | None = None,
40
38
  search: BaseSearchEngine | None = None,
39
+ max_iterations: int | None = None,
40
+ questions_per_iteration: int | None = None,
41
41
  ):
42
42
  """Initialize the advanced search system.
43
43
 
@@ -49,6 +49,11 @@ class AdvancedSearchSystem:
49
49
  llm: LLM to use. If not provided, it will use the default one.
50
50
  search: Search engine to use. If not provided, it will use the
51
51
  default one.
52
+ max_iterations: The maximum number of search iterations to
53
+ perform. Will be read from the settings if not specified.
54
+ questions_per_iteration: The number of questions to include in
55
+ each iteration. Will be read from the settings if not specified.
56
+
52
57
  """
53
58
  # Get configuration
54
59
  self.model = llm
@@ -59,11 +64,14 @@ class AdvancedSearchSystem:
59
64
  self.search = get_search(llm_instance=self.model)
60
65
 
61
66
  # Get iterations setting
62
- self.max_iterations = get_db_setting("search.iterations", 1)
63
-
64
- self.questions_per_iteration = get_db_setting(
65
- "search.questions_per_iteration", 3
66
- )
67
+ self.max_iterations = max_iterations
68
+ if self.max_iterations is None:
69
+ self.max_iterations = get_db_setting("search.iterations", 1)
70
+ self.questions_per_iteration = questions_per_iteration
71
+ if self.questions_per_iteration is None:
72
+ self.questions_per_iteration = get_db_setting(
73
+ "search.questions_per_iteration", 3
74
+ )
67
75
 
68
76
  # Log the strategy name that's being used
69
77
  logger.info(
@@ -1,16 +1,13 @@
1
- import logging
2
1
  import os
3
2
  from functools import cache
4
3
  from typing import Any, Dict
5
4
 
5
+ from loguru import logger
6
6
  from sqlalchemy import create_engine
7
7
  from sqlalchemy.orm import Session, sessionmaker
8
8
 
9
9
  from ..web.services.settings_manager import SettingsManager
10
10
 
11
- logger = logging.getLogger(__name__)
12
-
13
-
14
11
  # Database path.
15
12
  DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "data"))
16
13
  DB_PATH = os.path.join(DATA_DIR, "ldr.db")
@@ -57,8 +54,8 @@ def get_db_setting(
57
54
 
58
55
  if value is not None:
59
56
  return value
60
- except Exception as e:
61
- logger.error(f"Error getting setting {key} from database: {e}")
57
+ except Exception:
58
+ logger.exception(f"Error getting setting {key} from database")
62
59
 
63
60
  logger.warning(f"Could not find setting '{key}' in the database.")
64
61
  return default_value
@@ -0,0 +1,441 @@
1
+ """
2
+ Elasticsearch utilities for indexing and managing documents.
3
+ """
4
+
5
+ import json
6
+ import logging
7
+ import os
8
+ from typing import Any, Dict, List, Optional, Union
9
+
10
+ from elasticsearch import Elasticsearch
11
+ from elasticsearch.helpers import bulk
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class ElasticsearchManager:
17
+ """
18
+ Utility class for managing Elasticsearch indices and documents.
19
+
20
+ This class provides methods for creating indices, indexing documents,
21
+ and performing other Elasticsearch management tasks.
22
+ """
23
+
24
+ def __init__(
25
+ self,
26
+ hosts: List[str] = ["http://localhost:9200"],
27
+ username: Optional[str] = None,
28
+ password: Optional[str] = None,
29
+ api_key: Optional[str] = None,
30
+ cloud_id: Optional[str] = None,
31
+ ):
32
+ """
33
+ Initialize the Elasticsearch manager.
34
+
35
+ Args:
36
+ hosts: List of Elasticsearch hosts
37
+ username: Optional username for authentication
38
+ password: Optional password for authentication
39
+ api_key: Optional API key for authentication
40
+ cloud_id: Optional Elastic Cloud ID
41
+ """
42
+ # Initialize the Elasticsearch client
43
+ es_args = {}
44
+
45
+ # Basic authentication
46
+ if username and password:
47
+ es_args["basic_auth"] = (username, password)
48
+
49
+ # API key authentication
50
+ if api_key:
51
+ es_args["api_key"] = api_key
52
+
53
+ # Cloud ID for Elastic Cloud
54
+ if cloud_id:
55
+ es_args["cloud_id"] = cloud_id
56
+
57
+ # Connect to Elasticsearch
58
+ self.client = Elasticsearch(hosts, **es_args)
59
+
60
+ # Verify connection
61
+ try:
62
+ info = self.client.info()
63
+ logger.info(f"Connected to Elasticsearch cluster: {info.get('cluster_name')}")
64
+ logger.info(f"Elasticsearch version: {info.get('version', {}).get('number')}")
65
+ except Exception as e:
66
+ logger.error(f"Failed to connect to Elasticsearch: {str(e)}")
67
+ raise ConnectionError(f"Could not connect to Elasticsearch: {str(e)}")
68
+
69
+ def create_index(
70
+ self,
71
+ index_name: str,
72
+ mappings: Optional[Dict[str, Any]] = None,
73
+ settings: Optional[Dict[str, Any]] = None,
74
+ ) -> bool:
75
+ """
76
+ Create an Elasticsearch index with optional mappings and settings.
77
+
78
+ Args:
79
+ index_name: Name of the index to create
80
+ mappings: Optional mappings for the index fields
81
+ settings: Optional settings for the index
82
+
83
+ Returns:
84
+ bool: True if successful, False otherwise
85
+ """
86
+ try:
87
+ # Check if index already exists
88
+ if self.client.indices.exists(index=index_name):
89
+ logger.warning(f"Index '{index_name}' already exists - skipping creation")
90
+ return True
91
+
92
+ # Default mappings for better text search if none provided
93
+ if mappings is None:
94
+ mappings = {
95
+ "properties": {
96
+ "title": {
97
+ "type": "text",
98
+ "analyzer": "standard",
99
+ "fields": {
100
+ "keyword": {
101
+ "type": "keyword",
102
+ "ignore_above": 256
103
+ }
104
+ }
105
+ },
106
+ "content": {
107
+ "type": "text",
108
+ "analyzer": "standard"
109
+ },
110
+ "url": {
111
+ "type": "keyword"
112
+ },
113
+ "source": {
114
+ "type": "keyword"
115
+ },
116
+ "timestamp": {
117
+ "type": "date"
118
+ },
119
+ "metadata": {
120
+ "type": "object",
121
+ "enabled": True
122
+ }
123
+ }
124
+ }
125
+
126
+ # Default settings if none provided
127
+ if settings is None:
128
+ settings = {
129
+ "number_of_shards": 1,
130
+ "number_of_replicas": 0,
131
+ "analysis": {
132
+ "analyzer": {
133
+ "standard": {
134
+ "type": "standard"
135
+ }
136
+ }
137
+ }
138
+ }
139
+
140
+ # Create the index with mappings and settings
141
+ create_response = self.client.indices.create(
142
+ index=index_name,
143
+ mappings=mappings,
144
+ settings=settings,
145
+ )
146
+
147
+ logger.info(f"Created index '{index_name}': {create_response}")
148
+ return True
149
+
150
+ except Exception as e:
151
+ logger.error(f"Error creating index '{index_name}': {str(e)}")
152
+ return False
153
+
154
+ def delete_index(self, index_name: str) -> bool:
155
+ """
156
+ Delete an Elasticsearch index.
157
+
158
+ Args:
159
+ index_name: Name of the index to delete
160
+
161
+ Returns:
162
+ bool: True if successful, False otherwise
163
+ """
164
+ try:
165
+ # Check if index exists
166
+ if not self.client.indices.exists(index=index_name):
167
+ logger.warning(f"Index '{index_name}' does not exist - skipping deletion")
168
+ return True
169
+
170
+ # Delete the index
171
+ delete_response = self.client.indices.delete(index=index_name)
172
+ logger.info(f"Deleted index '{index_name}': {delete_response}")
173
+ return True
174
+
175
+ except Exception as e:
176
+ logger.error(f"Error deleting index '{index_name}': {str(e)}")
177
+ return False
178
+
179
+ def index_document(
180
+ self,
181
+ index_name: str,
182
+ document: Dict[str, Any],
183
+ document_id: Optional[str] = None,
184
+ refresh: bool = False,
185
+ ) -> Optional[str]:
186
+ """
187
+ Index a single document in Elasticsearch.
188
+
189
+ Args:
190
+ index_name: Name of the index to add the document to
191
+ document: The document to index
192
+ document_id: Optional document ID (will be generated if not provided)
193
+ refresh: Whether to refresh the index after indexing
194
+
195
+ Returns:
196
+ str: Document ID if successful, None otherwise
197
+ """
198
+ try:
199
+ # Index the document
200
+ response = self.client.index(
201
+ index=index_name,
202
+ document=document,
203
+ id=document_id,
204
+ refresh=refresh,
205
+ )
206
+
207
+ logger.info(f"Indexed document in '{index_name}' with ID: {response['_id']}")
208
+ return response["_id"]
209
+
210
+ except Exception as e:
211
+ logger.error(f"Error indexing document in '{index_name}': {str(e)}")
212
+ return None
213
+
214
+ def bulk_index_documents(
215
+ self,
216
+ index_name: str,
217
+ documents: List[Dict[str, Any]],
218
+ id_field: Optional[str] = None,
219
+ refresh: bool = False,
220
+ ) -> int:
221
+ """
222
+ Bulk index multiple documents in Elasticsearch.
223
+
224
+ Args:
225
+ index_name: Name of the index to add the documents to
226
+ documents: List of documents to index
227
+ id_field: Optional field in the documents to use as the document ID
228
+ refresh: Whether to refresh the index after indexing
229
+
230
+ Returns:
231
+ int: Number of successfully indexed documents
232
+ """
233
+ try:
234
+ # Prepare the bulk actions
235
+ actions = []
236
+ for doc in documents:
237
+ action = {
238
+ "_index": index_name,
239
+ "_source": doc,
240
+ }
241
+
242
+ # Use the specified field as the document ID if provided
243
+ if id_field and id_field in doc:
244
+ action["_id"] = doc[id_field]
245
+
246
+ actions.append(action)
247
+
248
+ # Execute the bulk indexing
249
+ success, failed = bulk(
250
+ self.client,
251
+ actions,
252
+ refresh=refresh,
253
+ stats_only=True,
254
+ )
255
+
256
+ logger.info(f"Bulk indexed {success} documents in '{index_name}', failed: {failed}")
257
+ return success
258
+
259
+ except Exception as e:
260
+ logger.error(f"Error bulk indexing documents in '{index_name}': {str(e)}")
261
+ return 0
262
+
263
+ def index_file(
264
+ self,
265
+ index_name: str,
266
+ file_path: str,
267
+ content_field: str = "content",
268
+ title_field: Optional[str] = "title",
269
+ extract_metadata: bool = True,
270
+ refresh: bool = False,
271
+ ) -> Optional[str]:
272
+ """
273
+ Index a file in Elasticsearch, extracting text content and metadata.
274
+
275
+ Args:
276
+ index_name: Name of the index to add the document to
277
+ file_path: Path to the file to index
278
+ content_field: Field name to store the file content
279
+ title_field: Field name to store the file title (filename if not specified)
280
+ extract_metadata: Whether to extract file metadata
281
+ refresh: Whether to refresh the index after indexing
282
+
283
+ Returns:
284
+ str: Document ID if successful, None otherwise
285
+ """
286
+ try:
287
+ from langchain_community.document_loaders import UnstructuredFileLoader
288
+
289
+ # Extract file content and metadata
290
+ loader = UnstructuredFileLoader(file_path)
291
+ documents = loader.load()
292
+
293
+ # Combine all content from the documents
294
+ content = "\n\n".join([doc.page_content for doc in documents])
295
+
296
+ # Get the filename for the title
297
+ filename = os.path.basename(file_path)
298
+ title = filename
299
+
300
+ # Prepare the document
301
+ document = {
302
+ content_field: content,
303
+ }
304
+
305
+ # Add title if requested
306
+ if title_field:
307
+ document[title_field] = title
308
+
309
+ # Add metadata if requested
310
+ if extract_metadata and documents:
311
+ # Include metadata from the first document
312
+ document["metadata"] = documents[0].metadata
313
+
314
+ # Add file-specific metadata
315
+ document["source"] = file_path
316
+ document["file_extension"] = os.path.splitext(filename)[1].lstrip(".")
317
+ document["filename"] = filename
318
+
319
+ # Index the document
320
+ return self.index_document(index_name, document, refresh=refresh)
321
+
322
+ except ImportError:
323
+ logger.error("UnstructuredFileLoader not available. Please install the 'unstructured' package.")
324
+ return None
325
+ except Exception as e:
326
+ logger.error(f"Error indexing file '{file_path}': {str(e)}")
327
+ return None
328
+
329
+ def index_directory(
330
+ self,
331
+ index_name: str,
332
+ directory_path: str,
333
+ file_patterns: List[str] = ["*.txt", "*.pdf", "*.docx", "*.md"],
334
+ content_field: str = "content",
335
+ title_field: str = "title",
336
+ extract_metadata: bool = True,
337
+ refresh: bool = False,
338
+ ) -> int:
339
+ """
340
+ Index all matching files in a directory in Elasticsearch.
341
+
342
+ Args:
343
+ index_name: Name of the index to add the documents to
344
+ directory_path: Path to the directory containing files to index
345
+ file_patterns: List of file patterns to match (glob patterns)
346
+ content_field: Field name to store the file content
347
+ title_field: Field name to store the file title
348
+ extract_metadata: Whether to extract file metadata
349
+ refresh: Whether to refresh the index after indexing
350
+
351
+ Returns:
352
+ int: Number of successfully indexed files
353
+ """
354
+ try:
355
+ import glob
356
+
357
+ # Find all matching files
358
+ all_files = []
359
+ for pattern in file_patterns:
360
+ pattern_path = os.path.join(directory_path, pattern)
361
+ matching_files = glob.glob(pattern_path)
362
+ all_files.extend(matching_files)
363
+
364
+ logger.info(f"Found {len(all_files)} files matching patterns {file_patterns} in {directory_path}")
365
+
366
+ # Index each file
367
+ successful_count = 0
368
+ for file_path in all_files:
369
+ logger.info(f"Indexing file: {file_path}")
370
+ doc_id = self.index_file(
371
+ index_name=index_name,
372
+ file_path=file_path,
373
+ content_field=content_field,
374
+ title_field=title_field,
375
+ extract_metadata=extract_metadata,
376
+ refresh=refresh,
377
+ )
378
+
379
+ if doc_id:
380
+ successful_count += 1
381
+
382
+ logger.info(f"Successfully indexed {successful_count} files out of {len(all_files)}")
383
+ return successful_count
384
+
385
+ except Exception as e:
386
+ logger.error(f"Error indexing directory '{directory_path}': {str(e)}")
387
+ return 0
388
+
389
+ def search(
390
+ self,
391
+ index_name: str,
392
+ query: str,
393
+ fields: List[str] = ["content", "title"],
394
+ size: int = 10,
395
+ highlight: bool = True,
396
+ ) -> Dict[str, Any]:
397
+ """
398
+ Search for documents in Elasticsearch.
399
+
400
+ Args:
401
+ index_name: Name of the index to search
402
+ query: Search query
403
+ fields: Fields to search in
404
+ size: Maximum number of results to return
405
+ highlight: Whether to include highlighted excerpts in results
406
+
407
+ Returns:
408
+ Dict: Elasticsearch search response
409
+ """
410
+ try:
411
+ search_query = {
412
+ "query": {
413
+ "multi_match": {
414
+ "query": query,
415
+ "fields": fields,
416
+ "type": "best_fields",
417
+ "tie_breaker": 0.3,
418
+ }
419
+ },
420
+ "size": size,
421
+ }
422
+
423
+ # Add highlighting if requested
424
+ if highlight:
425
+ search_query["highlight"] = {
426
+ "fields": {field: {} for field in fields},
427
+ "pre_tags": ["<em>"],
428
+ "post_tags": ["</em>"],
429
+ }
430
+
431
+ # Execute the search
432
+ response = self.client.search(
433
+ index=index_name,
434
+ body=search_query,
435
+ )
436
+
437
+ return response
438
+
439
+ except Exception as e:
440
+ logger.error(f"Error searching index '{index_name}': {str(e)}")
441
+ return {"error": str(e)}