local-deep-research 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +7 -0
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
- local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
- local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
- local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
- local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
- local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
- local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
- local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
- local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
- local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
- local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
- local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
- local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
- local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
- local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
- local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
- local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
- local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
- local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
- local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
- local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
- local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
- local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
- local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
- local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
- local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
- local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
- local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
- local_deep_research/advanced_search_system/findings/repository.py +54 -17
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
- local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
- local_deep_research/advanced_search_system/questions/__init__.py +16 -0
- local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
- local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
- local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
- local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
- local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
- local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
- local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
- local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
- local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
- local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
- local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
- local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
- local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
- local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
- local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
- local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
- local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
- local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
- local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
- local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
- local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
- local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
- local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
- local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
- local_deep_research/api/benchmark_functions.py +6 -2
- local_deep_research/api/research_functions.py +10 -4
- local_deep_research/benchmarks/__init__.py +9 -7
- local_deep_research/benchmarks/benchmark_functions.py +6 -2
- local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
- local_deep_research/benchmarks/cli.py +38 -13
- local_deep_research/benchmarks/comparison/__init__.py +4 -2
- local_deep_research/benchmarks/comparison/evaluator.py +316 -239
- local_deep_research/benchmarks/datasets/__init__.py +1 -1
- local_deep_research/benchmarks/datasets/base.py +91 -72
- local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
- local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
- local_deep_research/benchmarks/datasets/utils.py +48 -29
- local_deep_research/benchmarks/datasets.py +4 -11
- local_deep_research/benchmarks/efficiency/__init__.py +8 -4
- local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
- local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
- local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
- local_deep_research/benchmarks/evaluators/composite.py +6 -2
- local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
- local_deep_research/benchmarks/graders.py +32 -10
- local_deep_research/benchmarks/metrics/README.md +1 -1
- local_deep_research/benchmarks/metrics/calculation.py +25 -10
- local_deep_research/benchmarks/metrics/reporting.py +7 -3
- local_deep_research/benchmarks/metrics/visualization.py +42 -23
- local_deep_research/benchmarks/metrics.py +1 -1
- local_deep_research/benchmarks/optimization/__init__.py +3 -1
- local_deep_research/benchmarks/optimization/api.py +7 -1
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
- local_deep_research/benchmarks/runners.py +48 -15
- local_deep_research/citation_handler.py +65 -92
- local_deep_research/citation_handlers/__init__.py +15 -0
- local_deep_research/citation_handlers/base_citation_handler.py +70 -0
- local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
- local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
- local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
- local_deep_research/config/llm_config.py +271 -169
- local_deep_research/config/search_config.py +14 -5
- local_deep_research/defaults/__init__.py +0 -1
- local_deep_research/metrics/__init__.py +13 -0
- local_deep_research/metrics/database.py +58 -0
- local_deep_research/metrics/db_models.py +115 -0
- local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
- local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
- local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
- local_deep_research/metrics/migrate_research_ratings.py +31 -0
- local_deep_research/metrics/models.py +61 -0
- local_deep_research/metrics/pricing/__init__.py +12 -0
- local_deep_research/metrics/pricing/cost_calculator.py +237 -0
- local_deep_research/metrics/pricing/pricing_cache.py +143 -0
- local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
- local_deep_research/metrics/query_utils.py +51 -0
- local_deep_research/metrics/search_tracker.py +380 -0
- local_deep_research/metrics/token_counter.py +1078 -0
- local_deep_research/migrate_db.py +3 -1
- local_deep_research/report_generator.py +22 -8
- local_deep_research/search_system.py +390 -9
- local_deep_research/test_migration.py +15 -5
- local_deep_research/utilities/db_utils.py +7 -4
- local_deep_research/utilities/es_utils.py +115 -104
- local_deep_research/utilities/llm_utils.py +15 -5
- local_deep_research/utilities/log_utils.py +151 -0
- local_deep_research/utilities/search_cache.py +387 -0
- local_deep_research/utilities/search_utilities.py +14 -6
- local_deep_research/utilities/threading_utils.py +92 -0
- local_deep_research/utilities/url_utils.py +6 -0
- local_deep_research/web/api.py +347 -0
- local_deep_research/web/app.py +13 -17
- local_deep_research/web/app_factory.py +71 -66
- local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
- local_deep_research/web/database/migrations.py +20 -3
- local_deep_research/web/database/models.py +74 -25
- local_deep_research/web/database/schema_upgrade.py +49 -29
- local_deep_research/web/models/database.py +63 -83
- local_deep_research/web/routes/api_routes.py +56 -22
- local_deep_research/web/routes/benchmark_routes.py +4 -1
- local_deep_research/web/routes/globals.py +22 -0
- local_deep_research/web/routes/history_routes.py +71 -46
- local_deep_research/web/routes/metrics_routes.py +1155 -0
- local_deep_research/web/routes/research_routes.py +192 -54
- local_deep_research/web/routes/settings_routes.py +156 -55
- local_deep_research/web/services/research_service.py +412 -251
- local_deep_research/web/services/resource_service.py +36 -11
- local_deep_research/web/services/settings_manager.py +55 -17
- local_deep_research/web/services/settings_service.py +12 -4
- local_deep_research/web/services/socket_service.py +295 -188
- local_deep_research/web/static/css/custom_dropdown.css +180 -0
- local_deep_research/web/static/css/styles.css +39 -1
- local_deep_research/web/static/js/components/detail.js +633 -267
- local_deep_research/web/static/js/components/details.js +751 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
- local_deep_research/web/static/js/components/fallback/ui.js +23 -23
- local_deep_research/web/static/js/components/history.js +76 -76
- local_deep_research/web/static/js/components/logpanel.js +61 -13
- local_deep_research/web/static/js/components/progress.js +13 -2
- local_deep_research/web/static/js/components/research.js +99 -12
- local_deep_research/web/static/js/components/results.js +239 -106
- local_deep_research/web/static/js/main.js +40 -40
- local_deep_research/web/static/js/services/audio.js +1 -1
- local_deep_research/web/static/js/services/formatting.js +11 -11
- local_deep_research/web/static/js/services/keyboard.js +157 -0
- local_deep_research/web/static/js/services/pdf.js +80 -80
- local_deep_research/web/static/sounds/README.md +1 -1
- local_deep_research/web/templates/base.html +1 -0
- local_deep_research/web/templates/components/log_panel.html +7 -1
- local_deep_research/web/templates/components/mobile_nav.html +1 -1
- local_deep_research/web/templates/components/sidebar.html +3 -0
- local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
- local_deep_research/web/templates/pages/details.html +325 -24
- local_deep_research/web/templates/pages/history.html +1 -1
- local_deep_research/web/templates/pages/metrics.html +1929 -0
- local_deep_research/web/templates/pages/progress.html +2 -2
- local_deep_research/web/templates/pages/research.html +53 -17
- local_deep_research/web/templates/pages/results.html +12 -1
- local_deep_research/web/templates/pages/star_reviews.html +803 -0
- local_deep_research/web/utils/formatters.py +9 -3
- local_deep_research/web_search_engines/default_search_engines.py +5 -3
- local_deep_research/web_search_engines/engines/full_search.py +8 -2
- local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
- local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
- local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
- local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
- local_deep_research/web_search_engines/search_engine_base.py +83 -35
- local_deep_research/web_search_engines/search_engine_factory.py +25 -8
- local_deep_research/web_search_engines/search_engines_config.py +9 -3
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/METADATA +7 -1
- local_deep_research-0.5.2.dist-info/RECORD +265 -0
- local_deep_research-0.4.4.dist-info/RECORD +0 -177
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/WHEEL +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/licenses/LICENSE +0 -0
@@ -2,10 +2,30 @@
|
|
2
2
|
Utilities for logging.
|
3
3
|
"""
|
4
4
|
|
5
|
+
# Needed for loguru annotations
|
6
|
+
from __future__ import annotations
|
7
|
+
|
5
8
|
import inspect
|
6
9
|
import logging
|
10
|
+
import sys
|
11
|
+
from functools import wraps
|
12
|
+
from pathlib import Path
|
13
|
+
from typing import Any, Callable
|
7
14
|
|
15
|
+
import loguru
|
16
|
+
from flask import g, has_app_context
|
8
17
|
from loguru import logger
|
18
|
+
from sqlalchemy.exc import OperationalError
|
19
|
+
|
20
|
+
from ..web.database.models import ResearchLog
|
21
|
+
from ..web.services.socket_service import SocketIOService
|
22
|
+
from .db_utils import get_db_session
|
23
|
+
|
24
|
+
_LOG_DIR = Path(__file__).parents[2] / "data" / "logs"
|
25
|
+
_LOG_DIR.mkdir(exist_ok=True)
|
26
|
+
"""
|
27
|
+
Default log directory to use.
|
28
|
+
"""
|
9
29
|
|
10
30
|
|
11
31
|
class InterceptHandler(logging.Handler):
|
@@ -34,3 +54,134 @@ class InterceptHandler(logging.Handler):
|
|
34
54
|
logger.opt(depth=depth, exception=record.exc_info).log(
|
35
55
|
level, record.getMessage()
|
36
56
|
)
|
57
|
+
|
58
|
+
|
59
|
+
def log_for_research(
    to_wrap: Callable[..., Any],
) -> Callable[..., Any]:
    """
    Decorator for a function that's part of the research process.

    It expects the wrapped function to take the research ID as the first
    parameter, and configures all log messages made during the call to
    include that research ID (stored on the Flask application context).

    Args:
        to_wrap: The function to wrap. Should take the research ID as the
            first parameter.

    Returns:
        The wrapped function.

    """

    @wraps(to_wrap)
    def wrapped(research_id: int, *args: Any, **kwargs: Any) -> Any:
        g.research_id = research_id
        try:
            # Propagate the wrapped function's return value (it was
            # previously discarded, making the -> Any annotation a lie).
            return to_wrap(research_id, *args, **kwargs)
        finally:
            # Always clear the context, even if the call raises, so a
            # stale research ID never leaks into subsequent log records.
            g.pop("research_id")

    return wrapped
|
82
|
+
|
83
|
+
|
84
|
+
def _get_research_id() -> int | None:
    """
    Gets the current research ID, if present.

    Returns:
        The current research ID, or None if it does not exist.

    """
    # Outside an app context there is no per-request state to read.
    if not has_app_context():
        return None
    return g.get("research_id")
|
96
|
+
|
97
|
+
|
98
|
+
def database_sink(message: loguru.Message) -> None:
    """
    Sink that saves messages to the database.

    Args:
        message: The log message to save.

    """
    record = message.record

    # Build the ORM row up-front; the research ID is attached only when the
    # current context has one set.
    entry = ResearchLog(
        timestamp=record["time"],
        message=str(message),
        module=record["name"],
        function=record["function"],
        line_no=int(record["line"]),
        level=record["level"].name,
        research_id=_get_research_id(),
    )

    session = get_db_session()
    try:
        session.add(entry)
        session.commit()
    except OperationalError:
        # Something else is probably using the DB and we can't write to it
        # right now. Drop this log line rather than crash the log pipeline.
        session.rollback()
|
130
|
+
|
131
|
+
|
132
|
+
def frontend_progress_sink(message: loguru.Message) -> None:
    """
    Sink that sends messages to the frontend.

    Args:
        message: The log message to send.

    """
    research_id = _get_research_id()
    if research_id is None:
        # Without a research ID there is no subscriber to route to.
        return

    record = message.record
    payload = {
        "log_entry": {
            "message": record["message"],
            "type": record["level"].name,
            "time": record["time"].isoformat(),
        },
    }
    SocketIOService().emit_to_subscribers(
        "progress", research_id, payload, enable_logging=False
    )
|
156
|
+
|
157
|
+
|
158
|
+
def config_logger(name: str) -> None:
    """
    Configures the default logger.

    Args:
        name: The name to use for the log file.

    """
    logger.enable("local_deep_research")
    # Drop loguru's default handler before installing our own sinks.
    logger.remove()

    # Console only gets the more important messages.
    logger.add(sys.stderr, level="INFO")

    # Full debug output goes to a daily-rotated, compressed log file.
    file_sink_options = dict(
        level="DEBUG",
        enqueue=True,
        rotation="00:00",
        retention="30 days",
        compression="zip",
    )
    logger.add(_LOG_DIR / f"{name}.log", **file_sink_options)

    # Mirror records into the database and to connected frontends.
    logger.add(database_sink)
    logger.add(frontend_progress_sink)

    # Add a special log level for milestones (between INFO=20 and WARNING=30).
    try:
        logger.level("milestone", no=26, color="<magenta><bold>")
    except ValueError:
        # Level already exists, that's fine
        pass
|
@@ -0,0 +1,387 @@
|
|
1
|
+
"""
|
2
|
+
Search Cache Utility
|
3
|
+
Provides intelligent caching for search results to avoid repeated queries.
|
4
|
+
Includes TTL, LRU eviction, and query normalization.
|
5
|
+
"""
|
6
|
+
|
7
|
+
import hashlib
|
8
|
+
import json
|
9
|
+
import os
|
10
|
+
import sqlite3
|
11
|
+
import time
|
12
|
+
from functools import lru_cache
|
13
|
+
from typing import Any, Dict, List, Optional
|
14
|
+
|
15
|
+
from loguru import logger
|
16
|
+
|
17
|
+
|
18
|
+
class SearchCache:
    """
    Persistent cache for search results with TTL and LRU eviction.

    Stores results in SQLite for persistence across sessions, with a small
    in-memory dict in front of it for frequently accessed queries.
    """

    def __init__(
        self,
        cache_dir: Optional[str] = None,
        max_memory_items: int = 1000,
        default_ttl: int = 3600,
    ):
        """
        Initialize search cache.

        Args:
            cache_dir: Directory for cache database. Defaults to data/__CACHE_DIR__
            max_memory_items: Maximum items in memory cache
            default_ttl: Default time-to-live in seconds (1 hour default)
        """
        self.max_memory_items = max_memory_items
        self.default_ttl = default_ttl

        # Setup cache directory
        if cache_dir is None:
            cache_dir = os.path.join(
                os.getcwd(), "data", "__CACHE_DIR__", "search_cache"
            )

        os.makedirs(cache_dir, exist_ok=True)
        self.db_path = os.path.join(cache_dir, "search_cache.db")

        # Initialize database
        self._init_db()

        # In-memory cache for frequently accessed items
        self._memory_cache: Dict[str, Dict[str, Any]] = {}
        self._access_times: Dict[str, int] = {}

        # Reclaim space from entries that expired while the process was
        # offline. (This helper previously existed but was never invoked,
        # so the database grew without bound.)
        self._cleanup_expired()

    def _init_db(self) -> None:
        """Initialize SQLite database for persistent cache."""
        try:
            with sqlite3.connect(self.db_path) as conn:
                conn.execute(
                    """
                    CREATE TABLE IF NOT EXISTS search_cache (
                        query_hash TEXT PRIMARY KEY,
                        query_text TEXT NOT NULL,
                        results TEXT NOT NULL,
                        created_at INTEGER NOT NULL,
                        expires_at INTEGER NOT NULL,
                        access_count INTEGER DEFAULT 1,
                        last_accessed INTEGER NOT NULL
                    )
                    """
                )
                # Indexes support expiry sweeps and LRU-style queries.
                conn.execute(
                    """
                    CREATE INDEX IF NOT EXISTS idx_expires_at ON search_cache(expires_at)
                    """
                )
                conn.execute(
                    """
                    CREATE INDEX IF NOT EXISTS idx_last_accessed ON search_cache(last_accessed)
                    """
                )
                conn.commit()
        except Exception as e:
            logger.error(f"Failed to initialize search cache database: {e}")

    def _normalize_query(self, query: str) -> str:
        """Normalize query for consistent caching (case, whitespace, quotes)."""
        # Convert to lowercase and remove extra whitespace
        normalized = " ".join(query.lower().strip().split())

        # Remove common punctuation that doesn't affect search
        normalized = normalized.replace('"', "").replace("'", "")

        return normalized

    def _get_query_hash(
        self, query: str, search_engine: str = "default"
    ) -> str:
        """Generate hash for query + search engine combination.

        MD5 is used purely as a fast, stable cache key — not for security.
        """
        normalized_query = self._normalize_query(query)
        cache_key = f"{search_engine}:{normalized_query}"
        return hashlib.md5(cache_key.encode()).hexdigest()

    def _cleanup_expired(self) -> None:
        """Remove expired entries from the database."""
        try:
            current_time = int(time.time())
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()
                cursor.execute(
                    "DELETE FROM search_cache WHERE expires_at < ?",
                    (current_time,),
                )
                deleted = cursor.rowcount
                conn.commit()
                if deleted > 0:
                    logger.debug(f"Cleaned up {deleted} expired cache entries")
        except Exception as e:
            logger.error(f"Failed to cleanup expired cache entries: {e}")

    def _evict_lru_memory(self) -> None:
        """Evict least recently used items from the memory cache."""
        if len(self._memory_cache) <= self.max_memory_items:
            return

        # Sort by access time and remove oldest; evict a batch of 100 extra
        # entries so we do not re-sort on every single insertion.
        sorted_items = sorted(self._access_times.items(), key=lambda x: x[1])
        items_to_remove = (
            len(self._memory_cache) - self.max_memory_items + 100
        )

        for query_hash, _ in sorted_items[:items_to_remove]:
            self._memory_cache.pop(query_hash, None)
            self._access_times.pop(query_hash, None)

    def get(
        self, query: str, search_engine: str = "default"
    ) -> Optional[List[Dict[str, Any]]]:
        """
        Get cached search results for a query.

        Args:
            query: Search query
            search_engine: Search engine identifier for cache partitioning

        Returns:
            Cached results or None if not found/expired
        """
        query_hash = self._get_query_hash(query, search_engine)
        current_time = int(time.time())

        # Check memory cache first
        if query_hash in self._memory_cache:
            entry = self._memory_cache[query_hash]
            if entry["expires_at"] > current_time:
                self._access_times[query_hash] = current_time
                logger.debug(f"Cache hit (memory) for query: {query[:50]}...")
                return entry["results"]
            else:
                # Expired, remove from memory
                self._memory_cache.pop(query_hash, None)
                self._access_times.pop(query_hash, None)

        # Check database cache
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()
                cursor.execute(
                    """
                    SELECT results, expires_at FROM search_cache
                    WHERE query_hash = ? AND expires_at > ?
                    """,
                    (query_hash, current_time),
                )

                row = cursor.fetchone()
                if row:
                    results_json, expires_at = row
                    results = json.loads(results_json)

                    # Update access statistics
                    cursor.execute(
                        """
                        UPDATE search_cache
                        SET access_count = access_count + 1, last_accessed = ?
                        WHERE query_hash = ?
                        """,
                        (current_time, query_hash),
                    )
                    conn.commit()

                    # Promote to the memory cache for subsequent lookups.
                    self._memory_cache[query_hash] = {
                        "results": results,
                        "expires_at": expires_at,
                    }
                    self._access_times[query_hash] = current_time
                    self._evict_lru_memory()

                    logger.debug(
                        f"Cache hit (database) for query: {query[:50]}..."
                    )
                    return results

        except Exception as e:
            logger.error(f"Failed to retrieve from search cache: {e}")

        logger.debug(f"Cache miss for query: {query[:50]}...")
        return None

    def put(
        self,
        query: str,
        results: List[Dict[str, Any]],
        search_engine: str = "default",
        ttl: Optional[int] = None,
    ) -> bool:
        """
        Store search results in cache.

        Args:
            query: Search query
            results: Search results to cache
            search_engine: Search engine identifier
            ttl: Time-to-live in seconds (uses default if None)

        Returns:
            True if successfully cached
        """
        if not results:  # Don't cache empty results
            return False

        query_hash = self._get_query_hash(query, search_engine)
        current_time = int(time.time())
        expires_at = current_time + (ttl or self.default_ttl)

        try:
            results_json = json.dumps(results)

            # Store in database
            with sqlite3.connect(self.db_path) as conn:
                conn.execute(
                    """
                    INSERT OR REPLACE INTO search_cache
                    (query_hash, query_text, results, created_at, expires_at, access_count, last_accessed)
                    VALUES (?, ?, ?, ?, ?, 1, ?)
                    """,
                    (
                        query_hash,
                        self._normalize_query(query),
                        results_json,
                        current_time,
                        expires_at,
                        current_time,
                    ),
                )
                conn.commit()

            # Store in memory cache
            self._memory_cache[query_hash] = {
                "results": results,
                "expires_at": expires_at,
            }
            self._access_times[query_hash] = current_time
            self._evict_lru_memory()

            logger.debug(f"Cached results for query: {query[:50]}...")
            return True

        except Exception as e:
            logger.error(f"Failed to store in search cache: {e}")
            return False

    def invalidate(self, query: str, search_engine: str = "default") -> bool:
        """Invalidate cached results for a specific query.

        Returns:
            True if a database row was actually deleted.
        """
        query_hash = self._get_query_hash(query, search_engine)

        try:
            # Remove from memory
            self._memory_cache.pop(query_hash, None)
            self._access_times.pop(query_hash, None)

            # Remove from database
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()
                cursor.execute(
                    "DELETE FROM search_cache WHERE query_hash = ?",
                    (query_hash,),
                )
                deleted = cursor.rowcount
                conn.commit()

            logger.debug(f"Invalidated cache for query: {query[:50]}...")
            return deleted > 0

        except Exception as e:
            logger.error(f"Failed to invalidate cache: {e}")
            return False

    def clear_all(self) -> bool:
        """Clear all cached results (memory and database)."""
        try:
            self._memory_cache.clear()
            self._access_times.clear()

            with sqlite3.connect(self.db_path) as conn:
                conn.execute("DELETE FROM search_cache")
                conn.commit()

            logger.info("Cleared all search cache")
            return True

        except Exception as e:
            logger.error(f"Failed to clear search cache: {e}")
            return False

    def get_stats(self) -> Dict[str, Any]:
        """Get cache statistics.

        Returns:
            A dict of counters, or ``{"error": ...}`` on failure.
        """
        try:
            current_time = int(time.time())
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                # Total entries
                cursor.execute(
                    "SELECT COUNT(*) FROM search_cache WHERE expires_at > ?",
                    (current_time,),
                )
                total_entries = cursor.fetchone()[0]

                # Total expired entries
                cursor.execute(
                    "SELECT COUNT(*) FROM search_cache WHERE expires_at <= ?",
                    (current_time,),
                )
                expired_entries = cursor.fetchone()[0]

                # Average access count
                cursor.execute(
                    "SELECT AVG(access_count) FROM search_cache WHERE expires_at > ?",
                    (current_time,),
                )
                avg_access = cursor.fetchone()[0] or 0

                return {
                    "total_valid_entries": total_entries,
                    "expired_entries": expired_entries,
                    "memory_cache_size": len(self._memory_cache),
                    "average_access_count": round(avg_access, 2),
                    # NOTE(review): rough heuristic, not a measured hit rate —
                    # it only reflects how many valid entries exist.
                    "cache_hit_potential": (
                        f"{(total_entries / (total_entries + 1)) * 100:.1f}%"
                        if total_entries > 0
                        else "0%"
                    ),
                }

        except Exception as e:
            logger.error(f"Failed to get cache stats: {e}")
            return {"error": str(e)}
|
362
|
+
|
363
|
+
|
364
|
+
# Global cache instance
|
365
|
+
_global_cache = None
|
366
|
+
|
367
|
+
|
368
|
+
def get_search_cache() -> SearchCache:
|
369
|
+
"""Get global search cache instance."""
|
370
|
+
global _global_cache
|
371
|
+
if _global_cache is None:
|
372
|
+
_global_cache = SearchCache()
|
373
|
+
return _global_cache
|
374
|
+
|
375
|
+
|
376
|
+
@lru_cache(maxsize=100)
|
377
|
+
def normalize_entity_query(entity: str, constraint: str) -> str:
|
378
|
+
"""
|
379
|
+
Normalize entity + constraint combination for consistent caching.
|
380
|
+
Uses LRU cache for frequent normalizations.
|
381
|
+
"""
|
382
|
+
# Remove quotes and normalize whitespace
|
383
|
+
entity_clean = " ".join(entity.strip().lower().split())
|
384
|
+
constraint_clean = " ".join(constraint.strip().lower().split())
|
385
|
+
|
386
|
+
# Create canonical form
|
387
|
+
return f"{entity_clean} {constraint_clean}"
|
@@ -47,7 +47,6 @@ def format_links_to_markdown(all_links: List[Dict]) -> str:
|
|
47
47
|
logger.info(f"Formatting {len(all_links)} links to markdown...")
|
48
48
|
|
49
49
|
if all_links:
|
50
|
-
|
51
50
|
# Group links by URL and collect all their indices
|
52
51
|
url_to_indices = {}
|
53
52
|
for link in all_links:
|
@@ -57,7 +56,6 @@ def format_links_to_markdown(all_links: List[Dict]) -> str:
|
|
57
56
|
index = link.get("index", "")
|
58
57
|
# logger.info(f"URL \n {str(url)} ")
|
59
58
|
if url:
|
60
|
-
|
61
59
|
if url not in url_to_indices:
|
62
60
|
url_to_indices[url] = []
|
63
61
|
url_to_indices[url].append(index)
|
@@ -139,7 +137,9 @@ def format_findings(
|
|
139
137
|
logger.info(f"Formatting {len(findings_list)} detailed finding items.")
|
140
138
|
|
141
139
|
for idx, finding in enumerate(findings_list):
|
142
|
-
logger.debug(
|
140
|
+
logger.debug(
|
141
|
+
f"Formatting finding item {idx}. Keys: {list(finding.keys())}"
|
142
|
+
)
|
143
143
|
# Use .get() for safety
|
144
144
|
phase = finding.get("phase", "Unknown Phase")
|
145
145
|
content = finding.get("content", "No content available.")
|
@@ -201,8 +201,14 @@ def format_findings(
|
|
201
201
|
)
|
202
202
|
|
203
203
|
# If the question is in the finding itself, display it
|
204
|
-
if
|
205
|
-
|
204
|
+
if (
|
205
|
+
not question_displayed
|
206
|
+
and "question" in finding
|
207
|
+
and finding["question"]
|
208
|
+
):
|
209
|
+
formatted_text += (
|
210
|
+
f"### SEARCH QUESTION:\n{finding['question']}\n\n"
|
211
|
+
)
|
206
212
|
|
207
213
|
# Content
|
208
214
|
formatted_text += f"\n\n{content}\n\n"
|
@@ -213,7 +219,9 @@ def format_findings(
|
|
213
219
|
links = extract_links_from_search_results(search_results)
|
214
220
|
if links:
|
215
221
|
formatted_text += "### SOURCES USED IN THIS SECTION:\n"
|
216
|
-
formatted_text +=
|
222
|
+
formatted_text += (
|
223
|
+
format_links_to_markdown(links) + "\n\n"
|
224
|
+
)
|
217
225
|
except Exception:
|
218
226
|
logger.exception(
|
219
227
|
f"Error processing search results/links for finding {idx}"
|
@@ -0,0 +1,92 @@
|
|
1
|
+
import threading
|
2
|
+
from functools import wraps
|
3
|
+
from typing import Any, Callable, Tuple
|
4
|
+
from loguru import logger
|
5
|
+
|
6
|
+
from cachetools import cached, keys
|
7
|
+
from flask import current_app, g
|
8
|
+
from flask.ctx import AppContext
|
9
|
+
|
10
|
+
|
11
|
+
def thread_specific_cache(*args: Any, **kwargs: Any) -> Callable:
|
12
|
+
"""
|
13
|
+
A version of `cached()` that is local to a single thread. In other words,
|
14
|
+
cache entries will only be valid in the thread where they were created.
|
15
|
+
|
16
|
+
Args:
|
17
|
+
*args: Will be forwarded to `cached()`.
|
18
|
+
**kwargs: Will be forwarded to `cached()`.
|
19
|
+
|
20
|
+
Returns:
|
21
|
+
The wrapped function.
|
22
|
+
|
23
|
+
"""
|
24
|
+
|
25
|
+
def _key_func(*args_: Any, **kwargs_: Any) -> Tuple[int, ...]:
|
26
|
+
base_hash = keys.hashkey(*args_, **kwargs_)
|
27
|
+
return (threading.get_ident(),) + base_hash
|
28
|
+
|
29
|
+
return cached(*args, **kwargs, key=_key_func)
|
30
|
+
|
31
|
+
|
32
|
+
def thread_with_app_context(to_wrap: Callable) -> Callable:
|
33
|
+
"""
|
34
|
+
Decorator that wraps the entry point to a thread and injects the current
|
35
|
+
app context from Flask. This is useful when we want to use multiple
|
36
|
+
threads to handle a single request.
|
37
|
+
|
38
|
+
When using this wrapped function, `current_app.app_context()` should be
|
39
|
+
passed as the first argument when initializing the thread.
|
40
|
+
|
41
|
+
Args:
|
42
|
+
to_wrap: The function to wrap.
|
43
|
+
|
44
|
+
Returns:
|
45
|
+
The wrapped function.
|
46
|
+
|
47
|
+
"""
|
48
|
+
|
49
|
+
@wraps(to_wrap)
|
50
|
+
def _run_with_context(
|
51
|
+
app_context: AppContext | None, *args: Any, **kwargs: Any
|
52
|
+
) -> Any:
|
53
|
+
if app_context is None:
|
54
|
+
# Do nothing.
|
55
|
+
return to_wrap(*args, **kwargs)
|
56
|
+
|
57
|
+
with app_context:
|
58
|
+
return to_wrap(*args, **kwargs)
|
59
|
+
|
60
|
+
return _run_with_context
|
61
|
+
|
62
|
+
|
63
|
+
def thread_context() -> AppContext | None:
|
64
|
+
"""
|
65
|
+
Pushes a new app context for a thread that is being spawned to handle the
|
66
|
+
current request. Will copy all the global data from the current context.
|
67
|
+
|
68
|
+
Returns:
|
69
|
+
The new context, or None if no context is active.
|
70
|
+
|
71
|
+
"""
|
72
|
+
# Copy global data.
|
73
|
+
global_data = {}
|
74
|
+
try:
|
75
|
+
for key in g:
|
76
|
+
global_data[key] = g.get(key)
|
77
|
+
except TypeError:
|
78
|
+
# Context is not initialized. Don't change anything.
|
79
|
+
pass
|
80
|
+
|
81
|
+
try:
|
82
|
+
context = current_app.app_context()
|
83
|
+
except RuntimeError:
|
84
|
+
# Context is not initialized.
|
85
|
+
logger.debug("No current app context, not passing to thread.")
|
86
|
+
return None
|
87
|
+
|
88
|
+
with context:
|
89
|
+
for key, value in global_data.items():
|
90
|
+
setattr(g, key, value)
|
91
|
+
|
92
|
+
return context
|