local-deep-research 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +7 -0
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
- local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
- local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
- local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
- local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
- local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
- local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
- local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
- local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
- local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
- local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
- local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
- local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
- local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
- local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
- local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
- local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
- local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
- local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
- local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
- local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
- local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
- local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
- local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
- local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
- local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
- local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
- local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
- local_deep_research/advanced_search_system/findings/repository.py +54 -17
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
- local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
- local_deep_research/advanced_search_system/questions/__init__.py +16 -0
- local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
- local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
- local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
- local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
- local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
- local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
- local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
- local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
- local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
- local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
- local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
- local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
- local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
- local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
- local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
- local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
- local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
- local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
- local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
- local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
- local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
- local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
- local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
- local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
- local_deep_research/api/benchmark_functions.py +6 -2
- local_deep_research/api/research_functions.py +10 -4
- local_deep_research/benchmarks/__init__.py +9 -7
- local_deep_research/benchmarks/benchmark_functions.py +6 -2
- local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
- local_deep_research/benchmarks/cli.py +38 -13
- local_deep_research/benchmarks/comparison/__init__.py +4 -2
- local_deep_research/benchmarks/comparison/evaluator.py +316 -239
- local_deep_research/benchmarks/datasets/__init__.py +1 -1
- local_deep_research/benchmarks/datasets/base.py +91 -72
- local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
- local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
- local_deep_research/benchmarks/datasets/utils.py +48 -29
- local_deep_research/benchmarks/datasets.py +4 -11
- local_deep_research/benchmarks/efficiency/__init__.py +8 -4
- local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
- local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
- local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
- local_deep_research/benchmarks/evaluators/composite.py +6 -2
- local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
- local_deep_research/benchmarks/graders.py +32 -10
- local_deep_research/benchmarks/metrics/README.md +1 -1
- local_deep_research/benchmarks/metrics/calculation.py +25 -10
- local_deep_research/benchmarks/metrics/reporting.py +7 -3
- local_deep_research/benchmarks/metrics/visualization.py +42 -23
- local_deep_research/benchmarks/metrics.py +1 -1
- local_deep_research/benchmarks/optimization/__init__.py +3 -1
- local_deep_research/benchmarks/optimization/api.py +7 -1
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
- local_deep_research/benchmarks/runners.py +48 -15
- local_deep_research/citation_handler.py +65 -92
- local_deep_research/citation_handlers/__init__.py +15 -0
- local_deep_research/citation_handlers/base_citation_handler.py +70 -0
- local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
- local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
- local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
- local_deep_research/config/llm_config.py +271 -169
- local_deep_research/config/search_config.py +14 -5
- local_deep_research/defaults/__init__.py +0 -1
- local_deep_research/metrics/__init__.py +13 -0
- local_deep_research/metrics/database.py +58 -0
- local_deep_research/metrics/db_models.py +115 -0
- local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
- local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
- local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
- local_deep_research/metrics/migrate_research_ratings.py +31 -0
- local_deep_research/metrics/models.py +61 -0
- local_deep_research/metrics/pricing/__init__.py +12 -0
- local_deep_research/metrics/pricing/cost_calculator.py +237 -0
- local_deep_research/metrics/pricing/pricing_cache.py +143 -0
- local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
- local_deep_research/metrics/query_utils.py +51 -0
- local_deep_research/metrics/search_tracker.py +380 -0
- local_deep_research/metrics/token_counter.py +1078 -0
- local_deep_research/migrate_db.py +3 -1
- local_deep_research/report_generator.py +22 -8
- local_deep_research/search_system.py +390 -9
- local_deep_research/test_migration.py +15 -5
- local_deep_research/utilities/db_utils.py +7 -4
- local_deep_research/utilities/es_utils.py +115 -104
- local_deep_research/utilities/llm_utils.py +15 -5
- local_deep_research/utilities/log_utils.py +151 -0
- local_deep_research/utilities/search_cache.py +387 -0
- local_deep_research/utilities/search_utilities.py +14 -6
- local_deep_research/utilities/threading_utils.py +92 -0
- local_deep_research/utilities/url_utils.py +6 -0
- local_deep_research/web/api.py +347 -0
- local_deep_research/web/app.py +13 -17
- local_deep_research/web/app_factory.py +71 -66
- local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
- local_deep_research/web/database/migrations.py +20 -3
- local_deep_research/web/database/models.py +74 -25
- local_deep_research/web/database/schema_upgrade.py +49 -29
- local_deep_research/web/models/database.py +63 -83
- local_deep_research/web/routes/api_routes.py +56 -22
- local_deep_research/web/routes/benchmark_routes.py +4 -1
- local_deep_research/web/routes/globals.py +22 -0
- local_deep_research/web/routes/history_routes.py +71 -46
- local_deep_research/web/routes/metrics_routes.py +1155 -0
- local_deep_research/web/routes/research_routes.py +192 -54
- local_deep_research/web/routes/settings_routes.py +156 -55
- local_deep_research/web/services/research_service.py +412 -251
- local_deep_research/web/services/resource_service.py +36 -11
- local_deep_research/web/services/settings_manager.py +55 -17
- local_deep_research/web/services/settings_service.py +12 -4
- local_deep_research/web/services/socket_service.py +295 -188
- local_deep_research/web/static/css/custom_dropdown.css +180 -0
- local_deep_research/web/static/css/styles.css +39 -1
- local_deep_research/web/static/js/components/detail.js +633 -267
- local_deep_research/web/static/js/components/details.js +751 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
- local_deep_research/web/static/js/components/fallback/ui.js +23 -23
- local_deep_research/web/static/js/components/history.js +76 -76
- local_deep_research/web/static/js/components/logpanel.js +61 -13
- local_deep_research/web/static/js/components/progress.js +13 -2
- local_deep_research/web/static/js/components/research.js +99 -12
- local_deep_research/web/static/js/components/results.js +239 -106
- local_deep_research/web/static/js/main.js +40 -40
- local_deep_research/web/static/js/services/audio.js +1 -1
- local_deep_research/web/static/js/services/formatting.js +11 -11
- local_deep_research/web/static/js/services/keyboard.js +157 -0
- local_deep_research/web/static/js/services/pdf.js +80 -80
- local_deep_research/web/static/sounds/README.md +1 -1
- local_deep_research/web/templates/base.html +1 -0
- local_deep_research/web/templates/components/log_panel.html +7 -1
- local_deep_research/web/templates/components/mobile_nav.html +1 -1
- local_deep_research/web/templates/components/sidebar.html +3 -0
- local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
- local_deep_research/web/templates/pages/details.html +325 -24
- local_deep_research/web/templates/pages/history.html +1 -1
- local_deep_research/web/templates/pages/metrics.html +1929 -0
- local_deep_research/web/templates/pages/progress.html +2 -2
- local_deep_research/web/templates/pages/research.html +53 -17
- local_deep_research/web/templates/pages/results.html +12 -1
- local_deep_research/web/templates/pages/star_reviews.html +803 -0
- local_deep_research/web/utils/formatters.py +9 -3
- local_deep_research/web_search_engines/default_search_engines.py +5 -3
- local_deep_research/web_search_engines/engines/full_search.py +8 -2
- local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
- local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
- local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
- local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
- local_deep_research/web_search_engines/search_engine_base.py +83 -35
- local_deep_research/web_search_engines/search_engine_factory.py +25 -8
- local_deep_research/web_search_engines/search_engines_config.py +9 -3
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/METADATA +7 -1
- local_deep_research-0.5.2.dist-info/RECORD +265 -0
- local_deep_research-0.4.4.dist-info/RECORD +0 -177
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/WHEEL +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/licenses/LICENSE +0 -0
@@ -2,10 +2,9 @@
|
|
2
2
|
Elasticsearch utilities for indexing and managing documents.
|
3
3
|
"""
|
4
4
|
|
5
|
-
import json
|
6
5
|
import logging
|
7
6
|
import os
|
8
|
-
from typing import Any, Dict, List, Optional
|
7
|
+
from typing import Any, Dict, List, Optional
|
9
8
|
|
10
9
|
from elasticsearch import Elasticsearch
|
11
10
|
from elasticsearch.helpers import bulk
|
@@ -16,11 +15,11 @@ logger = logging.getLogger(__name__)
|
|
16
15
|
class ElasticsearchManager:
|
17
16
|
"""
|
18
17
|
Utility class for managing Elasticsearch indices and documents.
|
19
|
-
|
18
|
+
|
20
19
|
This class provides methods for creating indices, indexing documents,
|
21
20
|
and performing other Elasticsearch management tasks.
|
22
21
|
"""
|
23
|
-
|
22
|
+
|
24
23
|
def __init__(
|
25
24
|
self,
|
26
25
|
hosts: List[str] = ["http://localhost:9200"],
|
@@ -31,7 +30,7 @@ class ElasticsearchManager:
|
|
31
30
|
):
|
32
31
|
"""
|
33
32
|
Initialize the Elasticsearch manager.
|
34
|
-
|
33
|
+
|
35
34
|
Args:
|
36
35
|
hosts: List of Elasticsearch hosts
|
37
36
|
username: Optional username for authentication
|
@@ -41,31 +40,37 @@ class ElasticsearchManager:
|
|
41
40
|
"""
|
42
41
|
# Initialize the Elasticsearch client
|
43
42
|
es_args = {}
|
44
|
-
|
43
|
+
|
45
44
|
# Basic authentication
|
46
45
|
if username and password:
|
47
46
|
es_args["basic_auth"] = (username, password)
|
48
|
-
|
47
|
+
|
49
48
|
# API key authentication
|
50
49
|
if api_key:
|
51
50
|
es_args["api_key"] = api_key
|
52
|
-
|
51
|
+
|
53
52
|
# Cloud ID for Elastic Cloud
|
54
53
|
if cloud_id:
|
55
54
|
es_args["cloud_id"] = cloud_id
|
56
|
-
|
55
|
+
|
57
56
|
# Connect to Elasticsearch
|
58
57
|
self.client = Elasticsearch(hosts, **es_args)
|
59
|
-
|
58
|
+
|
60
59
|
# Verify connection
|
61
60
|
try:
|
62
61
|
info = self.client.info()
|
63
|
-
logger.info(
|
64
|
-
|
62
|
+
logger.info(
|
63
|
+
f"Connected to Elasticsearch cluster: {info.get('cluster_name')}"
|
64
|
+
)
|
65
|
+
logger.info(
|
66
|
+
f"Elasticsearch version: {info.get('version', {}).get('number')}"
|
67
|
+
)
|
65
68
|
except Exception as e:
|
66
69
|
logger.error(f"Failed to connect to Elasticsearch: {str(e)}")
|
67
|
-
raise ConnectionError(
|
68
|
-
|
70
|
+
raise ConnectionError(
|
71
|
+
f"Could not connect to Elasticsearch: {str(e)}"
|
72
|
+
)
|
73
|
+
|
69
74
|
def create_index(
|
70
75
|
self,
|
71
76
|
index_name: str,
|
@@ -74,21 +79,23 @@ class ElasticsearchManager:
|
|
74
79
|
) -> bool:
|
75
80
|
"""
|
76
81
|
Create an Elasticsearch index with optional mappings and settings.
|
77
|
-
|
82
|
+
|
78
83
|
Args:
|
79
84
|
index_name: Name of the index to create
|
80
85
|
mappings: Optional mappings for the index fields
|
81
86
|
settings: Optional settings for the index
|
82
|
-
|
87
|
+
|
83
88
|
Returns:
|
84
89
|
bool: True if successful, False otherwise
|
85
90
|
"""
|
86
91
|
try:
|
87
92
|
# Check if index already exists
|
88
93
|
if self.client.indices.exists(index=index_name):
|
89
|
-
logger.warning(
|
94
|
+
logger.warning(
|
95
|
+
f"Index '{index_name}' already exists - skipping creation"
|
96
|
+
)
|
90
97
|
return True
|
91
|
-
|
98
|
+
|
92
99
|
# Default mappings for better text search if none provided
|
93
100
|
if mappings is None:
|
94
101
|
mappings = {
|
@@ -99,83 +106,69 @@ class ElasticsearchManager:
|
|
99
106
|
"fields": {
|
100
107
|
"keyword": {
|
101
108
|
"type": "keyword",
|
102
|
-
"ignore_above": 256
|
109
|
+
"ignore_above": 256,
|
103
110
|
}
|
104
|
-
}
|
105
|
-
},
|
106
|
-
"content": {
|
107
|
-
"type": "text",
|
108
|
-
"analyzer": "standard"
|
109
|
-
},
|
110
|
-
"url": {
|
111
|
-
"type": "keyword"
|
112
|
-
},
|
113
|
-
"source": {
|
114
|
-
"type": "keyword"
|
115
|
-
},
|
116
|
-
"timestamp": {
|
117
|
-
"type": "date"
|
111
|
+
},
|
118
112
|
},
|
119
|
-
"
|
120
|
-
|
121
|
-
|
122
|
-
}
|
113
|
+
"content": {"type": "text", "analyzer": "standard"},
|
114
|
+
"url": {"type": "keyword"},
|
115
|
+
"source": {"type": "keyword"},
|
116
|
+
"timestamp": {"type": "date"},
|
117
|
+
"metadata": {"type": "object", "enabled": True},
|
123
118
|
}
|
124
119
|
}
|
125
|
-
|
120
|
+
|
126
121
|
# Default settings if none provided
|
127
122
|
if settings is None:
|
128
123
|
settings = {
|
129
124
|
"number_of_shards": 1,
|
130
125
|
"number_of_replicas": 0,
|
131
126
|
"analysis": {
|
132
|
-
"analyzer": {
|
133
|
-
|
134
|
-
"type": "standard"
|
135
|
-
}
|
136
|
-
}
|
137
|
-
}
|
127
|
+
"analyzer": {"standard": {"type": "standard"}}
|
128
|
+
},
|
138
129
|
}
|
139
|
-
|
130
|
+
|
140
131
|
# Create the index with mappings and settings
|
141
132
|
create_response = self.client.indices.create(
|
142
133
|
index=index_name,
|
143
134
|
mappings=mappings,
|
144
135
|
settings=settings,
|
145
136
|
)
|
146
|
-
|
137
|
+
|
147
138
|
logger.info(f"Created index '{index_name}': {create_response}")
|
148
139
|
return True
|
149
|
-
|
140
|
+
|
150
141
|
except Exception as e:
|
151
142
|
logger.error(f"Error creating index '{index_name}': {str(e)}")
|
152
143
|
return False
|
153
|
-
|
144
|
+
|
154
145
|
def delete_index(self, index_name: str) -> bool:
|
155
146
|
"""
|
156
147
|
Delete an Elasticsearch index.
|
157
|
-
|
148
|
+
|
158
149
|
Args:
|
159
150
|
index_name: Name of the index to delete
|
160
|
-
|
151
|
+
|
161
152
|
Returns:
|
162
153
|
bool: True if successful, False otherwise
|
163
154
|
"""
|
164
155
|
try:
|
165
156
|
# Check if index exists
|
166
157
|
if not self.client.indices.exists(index=index_name):
|
167
|
-
logger.warning(
|
158
|
+
logger.warning(
|
159
|
+
f"Index '{index_name}' does not exist - skipping deletion"
|
160
|
+
)
|
168
161
|
return True
|
169
|
-
|
162
|
+
|
170
163
|
# Delete the index
|
171
164
|
delete_response = self.client.indices.delete(index=index_name)
|
172
165
|
logger.info(f"Deleted index '{index_name}': {delete_response}")
|
173
166
|
return True
|
174
|
-
|
167
|
+
|
175
168
|
except Exception as e:
|
176
169
|
logger.error(f"Error deleting index '{index_name}': {str(e)}")
|
177
170
|
return False
|
178
|
-
|
171
|
+
|
179
172
|
def index_document(
|
180
173
|
self,
|
181
174
|
index_name: str,
|
@@ -185,13 +178,13 @@ class ElasticsearchManager:
|
|
185
178
|
) -> Optional[str]:
|
186
179
|
"""
|
187
180
|
Index a single document in Elasticsearch.
|
188
|
-
|
181
|
+
|
189
182
|
Args:
|
190
183
|
index_name: Name of the index to add the document to
|
191
184
|
document: The document to index
|
192
185
|
document_id: Optional document ID (will be generated if not provided)
|
193
186
|
refresh: Whether to refresh the index after indexing
|
194
|
-
|
187
|
+
|
195
188
|
Returns:
|
196
189
|
str: Document ID if successful, None otherwise
|
197
190
|
"""
|
@@ -203,14 +196,16 @@ class ElasticsearchManager:
|
|
203
196
|
id=document_id,
|
204
197
|
refresh=refresh,
|
205
198
|
)
|
206
|
-
|
207
|
-
logger.info(
|
199
|
+
|
200
|
+
logger.info(
|
201
|
+
f"Indexed document in '{index_name}' with ID: {response['_id']}"
|
202
|
+
)
|
208
203
|
return response["_id"]
|
209
|
-
|
204
|
+
|
210
205
|
except Exception as e:
|
211
206
|
logger.error(f"Error indexing document in '{index_name}': {str(e)}")
|
212
207
|
return None
|
213
|
-
|
208
|
+
|
214
209
|
def bulk_index_documents(
|
215
210
|
self,
|
216
211
|
index_name: str,
|
@@ -220,13 +215,13 @@ class ElasticsearchManager:
|
|
220
215
|
) -> int:
|
221
216
|
"""
|
222
217
|
Bulk index multiple documents in Elasticsearch.
|
223
|
-
|
218
|
+
|
224
219
|
Args:
|
225
220
|
index_name: Name of the index to add the documents to
|
226
221
|
documents: List of documents to index
|
227
222
|
id_field: Optional field in the documents to use as the document ID
|
228
223
|
refresh: Whether to refresh the index after indexing
|
229
|
-
|
224
|
+
|
230
225
|
Returns:
|
231
226
|
int: Number of successfully indexed documents
|
232
227
|
"""
|
@@ -238,13 +233,13 @@ class ElasticsearchManager:
|
|
238
233
|
"_index": index_name,
|
239
234
|
"_source": doc,
|
240
235
|
}
|
241
|
-
|
236
|
+
|
242
237
|
# Use the specified field as the document ID if provided
|
243
238
|
if id_field and id_field in doc:
|
244
239
|
action["_id"] = doc[id_field]
|
245
|
-
|
240
|
+
|
246
241
|
actions.append(action)
|
247
|
-
|
242
|
+
|
248
243
|
# Execute the bulk indexing
|
249
244
|
success, failed = bulk(
|
250
245
|
self.client,
|
@@ -252,14 +247,18 @@ class ElasticsearchManager:
|
|
252
247
|
refresh=refresh,
|
253
248
|
stats_only=True,
|
254
249
|
)
|
255
|
-
|
256
|
-
logger.info(
|
250
|
+
|
251
|
+
logger.info(
|
252
|
+
f"Bulk indexed {success} documents in '{index_name}', failed: {failed}"
|
253
|
+
)
|
257
254
|
return success
|
258
|
-
|
255
|
+
|
259
256
|
except Exception as e:
|
260
|
-
logger.error(
|
257
|
+
logger.error(
|
258
|
+
f"Error bulk indexing documents in '{index_name}': {str(e)}"
|
259
|
+
)
|
261
260
|
return 0
|
262
|
-
|
261
|
+
|
263
262
|
def index_file(
|
264
263
|
self,
|
265
264
|
index_name: str,
|
@@ -271,7 +270,7 @@ class ElasticsearchManager:
|
|
271
270
|
) -> Optional[str]:
|
272
271
|
"""
|
273
272
|
Index a file in Elasticsearch, extracting text content and metadata.
|
274
|
-
|
273
|
+
|
275
274
|
Args:
|
276
275
|
index_name: Name of the index to add the document to
|
277
276
|
file_path: Path to the file to index
|
@@ -279,53 +278,59 @@ class ElasticsearchManager:
|
|
279
278
|
title_field: Field name to store the file title (filename if not specified)
|
280
279
|
extract_metadata: Whether to extract file metadata
|
281
280
|
refresh: Whether to refresh the index after indexing
|
282
|
-
|
281
|
+
|
283
282
|
Returns:
|
284
283
|
str: Document ID if successful, None otherwise
|
285
284
|
"""
|
286
285
|
try:
|
287
|
-
from langchain_community.document_loaders import
|
288
|
-
|
286
|
+
from langchain_community.document_loaders import (
|
287
|
+
UnstructuredFileLoader,
|
288
|
+
)
|
289
|
+
|
289
290
|
# Extract file content and metadata
|
290
291
|
loader = UnstructuredFileLoader(file_path)
|
291
292
|
documents = loader.load()
|
292
|
-
|
293
|
+
|
293
294
|
# Combine all content from the documents
|
294
295
|
content = "\n\n".join([doc.page_content for doc in documents])
|
295
|
-
|
296
|
+
|
296
297
|
# Get the filename for the title
|
297
298
|
filename = os.path.basename(file_path)
|
298
299
|
title = filename
|
299
|
-
|
300
|
+
|
300
301
|
# Prepare the document
|
301
302
|
document = {
|
302
303
|
content_field: content,
|
303
304
|
}
|
304
|
-
|
305
|
+
|
305
306
|
# Add title if requested
|
306
307
|
if title_field:
|
307
308
|
document[title_field] = title
|
308
|
-
|
309
|
+
|
309
310
|
# Add metadata if requested
|
310
311
|
if extract_metadata and documents:
|
311
312
|
# Include metadata from the first document
|
312
313
|
document["metadata"] = documents[0].metadata
|
313
|
-
|
314
|
+
|
314
315
|
# Add file-specific metadata
|
315
316
|
document["source"] = file_path
|
316
|
-
document["file_extension"] = os.path.splitext(filename)[
|
317
|
+
document["file_extension"] = os.path.splitext(filename)[
|
318
|
+
1
|
319
|
+
].lstrip(".")
|
317
320
|
document["filename"] = filename
|
318
|
-
|
321
|
+
|
319
322
|
# Index the document
|
320
323
|
return self.index_document(index_name, document, refresh=refresh)
|
321
|
-
|
324
|
+
|
322
325
|
except ImportError:
|
323
|
-
logger.error(
|
326
|
+
logger.error(
|
327
|
+
"UnstructuredFileLoader not available. Please install the 'unstructured' package."
|
328
|
+
)
|
324
329
|
return None
|
325
330
|
except Exception as e:
|
326
331
|
logger.error(f"Error indexing file '{file_path}': {str(e)}")
|
327
332
|
return None
|
328
|
-
|
333
|
+
|
329
334
|
def index_directory(
|
330
335
|
self,
|
331
336
|
index_name: str,
|
@@ -338,7 +343,7 @@ class ElasticsearchManager:
|
|
338
343
|
) -> int:
|
339
344
|
"""
|
340
345
|
Index all matching files in a directory in Elasticsearch.
|
341
|
-
|
346
|
+
|
342
347
|
Args:
|
343
348
|
index_name: Name of the index to add the documents to
|
344
349
|
directory_path: Path to the directory containing files to index
|
@@ -347,22 +352,24 @@ class ElasticsearchManager:
|
|
347
352
|
title_field: Field name to store the file title
|
348
353
|
extract_metadata: Whether to extract file metadata
|
349
354
|
refresh: Whether to refresh the index after indexing
|
350
|
-
|
355
|
+
|
351
356
|
Returns:
|
352
357
|
int: Number of successfully indexed files
|
353
358
|
"""
|
354
359
|
try:
|
355
360
|
import glob
|
356
|
-
|
361
|
+
|
357
362
|
# Find all matching files
|
358
363
|
all_files = []
|
359
364
|
for pattern in file_patterns:
|
360
365
|
pattern_path = os.path.join(directory_path, pattern)
|
361
366
|
matching_files = glob.glob(pattern_path)
|
362
367
|
all_files.extend(matching_files)
|
363
|
-
|
364
|
-
logger.info(
|
365
|
-
|
368
|
+
|
369
|
+
logger.info(
|
370
|
+
f"Found {len(all_files)} files matching patterns {file_patterns} in {directory_path}"
|
371
|
+
)
|
372
|
+
|
366
373
|
# Index each file
|
367
374
|
successful_count = 0
|
368
375
|
for file_path in all_files:
|
@@ -375,17 +382,21 @@ class ElasticsearchManager:
|
|
375
382
|
extract_metadata=extract_metadata,
|
376
383
|
refresh=refresh,
|
377
384
|
)
|
378
|
-
|
385
|
+
|
379
386
|
if doc_id:
|
380
387
|
successful_count += 1
|
381
|
-
|
382
|
-
logger.info(
|
388
|
+
|
389
|
+
logger.info(
|
390
|
+
f"Successfully indexed {successful_count} files out of {len(all_files)}"
|
391
|
+
)
|
383
392
|
return successful_count
|
384
|
-
|
393
|
+
|
385
394
|
except Exception as e:
|
386
|
-
logger.error(
|
395
|
+
logger.error(
|
396
|
+
f"Error indexing directory '{directory_path}': {str(e)}"
|
397
|
+
)
|
387
398
|
return 0
|
388
|
-
|
399
|
+
|
389
400
|
def search(
|
390
401
|
self,
|
391
402
|
index_name: str,
|
@@ -396,14 +407,14 @@ class ElasticsearchManager:
|
|
396
407
|
) -> Dict[str, Any]:
|
397
408
|
"""
|
398
409
|
Search for documents in Elasticsearch.
|
399
|
-
|
410
|
+
|
400
411
|
Args:
|
401
412
|
index_name: Name of the index to search
|
402
413
|
query: Search query
|
403
414
|
fields: Fields to search in
|
404
415
|
size: Maximum number of results to return
|
405
416
|
highlight: Whether to include highlighted excerpts in results
|
406
|
-
|
417
|
+
|
407
418
|
Returns:
|
408
419
|
Dict: Elasticsearch search response
|
409
420
|
"""
|
@@ -419,7 +430,7 @@ class ElasticsearchManager:
|
|
419
430
|
},
|
420
431
|
"size": size,
|
421
432
|
}
|
422
|
-
|
433
|
+
|
423
434
|
# Add highlighting if requested
|
424
435
|
if highlight:
|
425
436
|
search_query["highlight"] = {
|
@@ -427,15 +438,15 @@ class ElasticsearchManager:
|
|
427
438
|
"pre_tags": ["<em>"],
|
428
439
|
"post_tags": ["</em>"],
|
429
440
|
}
|
430
|
-
|
441
|
+
|
431
442
|
# Execute the search
|
432
443
|
response = self.client.search(
|
433
444
|
index=index_name,
|
434
445
|
body=search_query,
|
435
446
|
)
|
436
|
-
|
447
|
+
|
437
448
|
return response
|
438
|
-
|
449
|
+
|
439
450
|
except Exception as e:
|
440
451
|
logger.error(f"Error searching index '{index_name}': {str(e)}")
|
441
|
-
return {"error": str(e)}
|
452
|
+
return {"error": str(e)}
|
@@ -78,7 +78,9 @@ def get_model(
|
|
78
78
|
api_key = os.getenv("OPENAI_API_KEY")
|
79
79
|
if not api_key:
|
80
80
|
raise ValueError("OPENAI_API_KEY environment variable not set")
|
81
|
-
return ChatOpenAI(
|
81
|
+
return ChatOpenAI(
|
82
|
+
model=model_name, api_key=api_key, **common_params
|
83
|
+
)
|
82
84
|
except ImportError:
|
83
85
|
logger.error("langchain_openai not available")
|
84
86
|
raise
|
@@ -89,7 +91,9 @@ def get_model(
|
|
89
91
|
|
90
92
|
api_key = os.getenv("ANTHROPIC_API_KEY")
|
91
93
|
if not api_key:
|
92
|
-
raise ValueError(
|
94
|
+
raise ValueError(
|
95
|
+
"ANTHROPIC_API_KEY environment variable not set"
|
96
|
+
)
|
93
97
|
return ChatAnthropic(
|
94
98
|
model=model_name, anthropic_api_key=api_key, **common_params
|
95
99
|
)
|
@@ -103,7 +107,9 @@ def get_model(
|
|
103
107
|
|
104
108
|
api_key = os.getenv("OPENAI_ENDPOINT_API_KEY")
|
105
109
|
if not api_key:
|
106
|
-
raise ValueError(
|
110
|
+
raise ValueError(
|
111
|
+
"OPENAI_ENDPOINT_API_KEY environment variable not set"
|
112
|
+
)
|
107
113
|
|
108
114
|
endpoint_url = kwargs.get(
|
109
115
|
"OPENAI_ENDPOINT_URL", "https://openrouter.ai/api/v1"
|
@@ -113,7 +119,9 @@ def get_model(
|
|
113
119
|
"OPENAI_ENDPOINT_REQUIRES_MODEL", True
|
114
120
|
):
|
115
121
|
return ChatOpenAI(
|
116
|
-
api_key=api_key,
|
122
|
+
api_key=api_key,
|
123
|
+
openai_api_base=endpoint_url,
|
124
|
+
**common_params,
|
117
125
|
)
|
118
126
|
else:
|
119
127
|
return ChatOpenAI(
|
@@ -130,7 +138,9 @@ def get_model(
|
|
130
138
|
try:
|
131
139
|
from langchain_ollama import ChatOllama
|
132
140
|
|
133
|
-
logger.warning(
|
141
|
+
logger.warning(
|
142
|
+
f"Unknown model type '{model_type}', defaulting to Ollama"
|
143
|
+
)
|
134
144
|
return ChatOllama(model=model_name, **common_params)
|
135
145
|
except (ImportError, Exception) as e:
|
136
146
|
logger.error(f"Failed to load any model: {e}")
|