local-deep-research 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +7 -0
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
- local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
- local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
- local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
- local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
- local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
- local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
- local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
- local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
- local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
- local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
- local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
- local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
- local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
- local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
- local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
- local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
- local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
- local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
- local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
- local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
- local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
- local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
- local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
- local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
- local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
- local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
- local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
- local_deep_research/advanced_search_system/findings/repository.py +54 -17
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
- local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
- local_deep_research/advanced_search_system/questions/__init__.py +16 -0
- local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
- local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
- local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
- local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
- local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
- local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
- local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
- local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
- local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
- local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
- local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
- local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
- local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
- local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
- local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
- local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
- local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
- local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
- local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
- local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
- local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
- local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
- local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
- local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
- local_deep_research/api/benchmark_functions.py +6 -2
- local_deep_research/api/research_functions.py +10 -4
- local_deep_research/benchmarks/__init__.py +9 -7
- local_deep_research/benchmarks/benchmark_functions.py +6 -2
- local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
- local_deep_research/benchmarks/cli.py +38 -13
- local_deep_research/benchmarks/comparison/__init__.py +4 -2
- local_deep_research/benchmarks/comparison/evaluator.py +316 -239
- local_deep_research/benchmarks/datasets/__init__.py +1 -1
- local_deep_research/benchmarks/datasets/base.py +91 -72
- local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
- local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
- local_deep_research/benchmarks/datasets/utils.py +48 -29
- local_deep_research/benchmarks/datasets.py +4 -11
- local_deep_research/benchmarks/efficiency/__init__.py +8 -4
- local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
- local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
- local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
- local_deep_research/benchmarks/evaluators/composite.py +6 -2
- local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
- local_deep_research/benchmarks/graders.py +32 -10
- local_deep_research/benchmarks/metrics/README.md +1 -1
- local_deep_research/benchmarks/metrics/calculation.py +25 -10
- local_deep_research/benchmarks/metrics/reporting.py +7 -3
- local_deep_research/benchmarks/metrics/visualization.py +42 -23
- local_deep_research/benchmarks/metrics.py +1 -1
- local_deep_research/benchmarks/optimization/__init__.py +3 -1
- local_deep_research/benchmarks/optimization/api.py +7 -1
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
- local_deep_research/benchmarks/runners.py +48 -15
- local_deep_research/citation_handler.py +65 -92
- local_deep_research/citation_handlers/__init__.py +15 -0
- local_deep_research/citation_handlers/base_citation_handler.py +70 -0
- local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
- local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
- local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
- local_deep_research/config/llm_config.py +271 -169
- local_deep_research/config/search_config.py +14 -5
- local_deep_research/defaults/__init__.py +0 -1
- local_deep_research/metrics/__init__.py +13 -0
- local_deep_research/metrics/database.py +58 -0
- local_deep_research/metrics/db_models.py +115 -0
- local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
- local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
- local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
- local_deep_research/metrics/migrate_research_ratings.py +31 -0
- local_deep_research/metrics/models.py +61 -0
- local_deep_research/metrics/pricing/__init__.py +12 -0
- local_deep_research/metrics/pricing/cost_calculator.py +237 -0
- local_deep_research/metrics/pricing/pricing_cache.py +143 -0
- local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
- local_deep_research/metrics/query_utils.py +51 -0
- local_deep_research/metrics/search_tracker.py +380 -0
- local_deep_research/metrics/token_counter.py +1078 -0
- local_deep_research/migrate_db.py +3 -1
- local_deep_research/report_generator.py +22 -8
- local_deep_research/search_system.py +390 -9
- local_deep_research/test_migration.py +15 -5
- local_deep_research/utilities/db_utils.py +7 -4
- local_deep_research/utilities/es_utils.py +115 -104
- local_deep_research/utilities/llm_utils.py +15 -5
- local_deep_research/utilities/log_utils.py +151 -0
- local_deep_research/utilities/search_cache.py +387 -0
- local_deep_research/utilities/search_utilities.py +14 -6
- local_deep_research/utilities/threading_utils.py +92 -0
- local_deep_research/utilities/url_utils.py +6 -0
- local_deep_research/web/api.py +347 -0
- local_deep_research/web/app.py +13 -17
- local_deep_research/web/app_factory.py +71 -66
- local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
- local_deep_research/web/database/migrations.py +20 -3
- local_deep_research/web/database/models.py +74 -25
- local_deep_research/web/database/schema_upgrade.py +49 -29
- local_deep_research/web/models/database.py +63 -83
- local_deep_research/web/routes/api_routes.py +56 -22
- local_deep_research/web/routes/benchmark_routes.py +4 -1
- local_deep_research/web/routes/globals.py +22 -0
- local_deep_research/web/routes/history_routes.py +71 -46
- local_deep_research/web/routes/metrics_routes.py +1155 -0
- local_deep_research/web/routes/research_routes.py +192 -54
- local_deep_research/web/routes/settings_routes.py +156 -55
- local_deep_research/web/services/research_service.py +412 -251
- local_deep_research/web/services/resource_service.py +36 -11
- local_deep_research/web/services/settings_manager.py +55 -17
- local_deep_research/web/services/settings_service.py +12 -4
- local_deep_research/web/services/socket_service.py +295 -188
- local_deep_research/web/static/css/custom_dropdown.css +180 -0
- local_deep_research/web/static/css/styles.css +39 -1
- local_deep_research/web/static/js/components/detail.js +633 -267
- local_deep_research/web/static/js/components/details.js +751 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
- local_deep_research/web/static/js/components/fallback/ui.js +23 -23
- local_deep_research/web/static/js/components/history.js +76 -76
- local_deep_research/web/static/js/components/logpanel.js +61 -13
- local_deep_research/web/static/js/components/progress.js +13 -2
- local_deep_research/web/static/js/components/research.js +99 -12
- local_deep_research/web/static/js/components/results.js +239 -106
- local_deep_research/web/static/js/main.js +40 -40
- local_deep_research/web/static/js/services/audio.js +1 -1
- local_deep_research/web/static/js/services/formatting.js +11 -11
- local_deep_research/web/static/js/services/keyboard.js +157 -0
- local_deep_research/web/static/js/services/pdf.js +80 -80
- local_deep_research/web/static/sounds/README.md +1 -1
- local_deep_research/web/templates/base.html +1 -0
- local_deep_research/web/templates/components/log_panel.html +7 -1
- local_deep_research/web/templates/components/mobile_nav.html +1 -1
- local_deep_research/web/templates/components/sidebar.html +3 -0
- local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
- local_deep_research/web/templates/pages/details.html +325 -24
- local_deep_research/web/templates/pages/history.html +1 -1
- local_deep_research/web/templates/pages/metrics.html +1929 -0
- local_deep_research/web/templates/pages/progress.html +2 -2
- local_deep_research/web/templates/pages/research.html +53 -17
- local_deep_research/web/templates/pages/results.html +12 -1
- local_deep_research/web/templates/pages/star_reviews.html +803 -0
- local_deep_research/web/utils/formatters.py +9 -3
- local_deep_research/web_search_engines/default_search_engines.py +5 -3
- local_deep_research/web_search_engines/engines/full_search.py +8 -2
- local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
- local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
- local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
- local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
- local_deep_research/web_search_engines/search_engine_base.py +83 -35
- local_deep_research/web_search_engines/search_engine_factory.py +25 -8
- local_deep_research/web_search_engines/search_engines_config.py +9 -3
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/METADATA +7 -1
- local_deep_research-0.5.2.dist-info/RECORD +265 -0
- local_deep_research-0.4.4.dist-info/RECORD +0 -177
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/WHEEL +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,287 @@
|
|
1
|
+
"""
|
2
|
+
BrowseComp-specific question generation that creates progressive, entity-focused searches.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import logging
|
6
|
+
import re
|
7
|
+
from typing import Dict, List
|
8
|
+
|
9
|
+
from .base_question import BaseQuestionGenerator
|
10
|
+
|
11
|
+
logger = logging.getLogger(__name__)
|
12
|
+
|
13
|
+
|
14
|
+
class BrowseCompQuestionGenerator(BaseQuestionGenerator):
|
15
|
+
"""
|
16
|
+
Question generator optimized for BrowseComp-style queries.
|
17
|
+
|
18
|
+
Key features:
|
19
|
+
1. Extract concrete entities (dates, numbers, names, places)
|
20
|
+
2. Generate progressive search combinations
|
21
|
+
3. Start broad, then narrow systematically
|
22
|
+
4. Focus on verifiable facts
|
23
|
+
"""
|
24
|
+
|
25
|
+
def __init__(self, model):
    """
    Set up the generator around the given language model.

    Args:
        model: Passed straight to the base class initializer; the other
            methods of this class call ``self.model.invoke(prompt)``.
    """
    super().__init__(model)
    # Ordered record of the search progression; starts out empty.
    self.search_progression = []
    # Entities parsed from the query; filled in by generate_questions().
    self.extracted_entities = {}
|
29
|
+
|
30
|
+
def generate_questions(
    self,
    current_knowledge: str,
    query: str,
    questions_per_iteration: int = 5,
    questions_by_iteration: dict = None,
    iteration: int = 1,
) -> List[str]:
    """
    Produce the search queries for one research iteration.

    On the first iteration (or whenever no entities have been cached yet)
    the query is mined for entities and the initial broad searches are
    returned. Every later iteration delegates to the progressive
    refinement step, reusing the cached entities.

    Args:
        current_knowledge: Text gathered so far; only used for refinement.
        query: The original user query.
        questions_per_iteration: Maximum number of searches to return.
        questions_by_iteration: Searches issued in earlier iterations;
            a falsy value is treated as an empty dict.
        iteration: 1-based iteration counter.

    Returns:
        A list of search query strings.
    """
    history = questions_by_iteration if questions_by_iteration else {}

    needs_bootstrap = iteration == 1 or not self.extracted_entities
    if not needs_bootstrap:
        # Later rounds: narrow down using what has been learned so far.
        return self._generate_progressive_searches(
            query,
            current_knowledge,
            self.extracted_entities,
            history,
            questions_per_iteration,
            iteration,
        )

    # First round: (re)extract entities and start with broad searches.
    self.extracted_entities = self._extract_entities(query)
    return self._generate_initial_searches(
        query, self.extracted_entities, questions_per_iteration
    )
|
57
|
+
|
58
|
+
def _extract_entities(self, query: str) -> Dict[str, List[str]]:
    """
    Ask the model for the concrete, searchable entities in ``query``.

    The model reply is parsed line by line. A line contributes values
    when the text before its first colon, reduced to letters only and
    lower-cased, is one of the five known categories. This tolerates
    decorated labels such as ``1. TEMPORAL`` or ``**NAMES``. Values are
    comma-separated; surrounding whitespace, markdown asterisks and the
    square brackets used by the prompt's own template are stripped.

    Args:
        query: The original user query.

    Returns:
        Dict with keys ``temporal``, ``numerical``, ``names``,
        ``locations`` and ``descriptors``, each mapping to a list of
        strings. The ``temporal`` list is post-processed by
        ``_expand_temporal_ranges``.
    """
    prompt = f"""Extract ALL concrete, searchable entities from this query:

Query: {query}

Extract:
1. TEMPORAL: All years, dates, time periods (e.g., "2018", "between 1995 and 2006", "2023")
2. NUMERICAL: All numbers, statistics, counts (e.g., "300", "more than 3", "4-3", "84.5%")
3. NAMES: Partial names, name hints, proper nouns (e.g., "Dartmouth", "EMNLP", "Plastic Man")
4. LOCATIONS: Places, institutions, geographic features (e.g., "Pennsylvania", "Grand Canyon")
5. DESCRIPTORS: Key descriptive terms (e.g., "fourth wall", "ascetics", "decider game")

For TEMPORAL entities, if there's a range (e.g., "between 2018-2023"), list EACH individual year.

Format your response as:
TEMPORAL: [entity1], [entity2], ...
NUMERICAL: [entity1], [entity2], ...
NAMES: [entity1], [entity2], ...
LOCATIONS: [entity1], [entity2], ...
DESCRIPTORS: [entity1], [entity2], ...
"""

    response = self.model.invoke(prompt)
    # Accept both message-like objects (with .content) and plain strings.
    content = (
        response.content if hasattr(response, "content") else str(response)
    )

    entities = {
        "temporal": [],
        "numerical": [],
        "names": [],
        "locations": [],
        "descriptors": [],
    }

    for line in content.strip().split("\n"):
        label, colon, raw_values = line.strip().partition(":")
        if not colon:
            continue
        # Drop list numbering / markdown decoration around the label so
        # "1. TEMPORAL" and "**NAMES" still map to a known category.
        category = re.sub(r"[^a-z]", "", label.lower())
        if category not in entities:
            continue
        for value in raw_values.split(","):
            # The prompt template shows values as "[entity1]"; models may
            # echo those brackets (or markdown asterisks) back verbatim.
            value = value.strip(" \t*[]")
            if value:
                entities[category].append(value)

    # Expand temporal ranges
    entities["temporal"] = self._expand_temporal_ranges(
        entities["temporal"]
    )

    logger.info(f"Extracted entities: {entities}")
    return entities
|
112
|
+
|
113
|
+
def _expand_temporal_ranges(
|
114
|
+
self, temporal_entities: List[str]
|
115
|
+
) -> List[str]:
|
116
|
+
"""Expand year ranges into individual years."""
|
117
|
+
expanded = []
|
118
|
+
for entity in temporal_entities:
|
119
|
+
# Check for range patterns like "2018-2023" or "between 1995 and 2006"
|
120
|
+
range_match = re.search(
|
121
|
+
r"(\d{4})[-\s]+(?:to|and)?\s*(\d{4})", entity
|
122
|
+
)
|
123
|
+
if range_match:
|
124
|
+
start_year = int(range_match.group(1))
|
125
|
+
end_year = int(range_match.group(2))
|
126
|
+
for year in range(start_year, end_year + 1):
|
127
|
+
expanded.append(str(year))
|
128
|
+
else:
|
129
|
+
# Single year or other temporal entity
|
130
|
+
year_match = re.search(r"\d{4}", entity)
|
131
|
+
if year_match:
|
132
|
+
expanded.append(year_match.group())
|
133
|
+
else:
|
134
|
+
expanded.append(entity)
|
135
|
+
|
136
|
+
return list(set(expanded)) # Remove duplicates
|
137
|
+
|
138
|
+
def _generate_initial_searches(
|
139
|
+
self, query: str, entities: Dict[str, List[str]], num_questions: int
|
140
|
+
) -> List[str]:
|
141
|
+
"""Generate initial broad searches."""
|
142
|
+
searches = []
|
143
|
+
|
144
|
+
# 1. Original query (always include)
|
145
|
+
searches.append(query)
|
146
|
+
|
147
|
+
# 2. Domain exploration searches (combine key entities)
|
148
|
+
if entities["names"]:
|
149
|
+
for name in entities["names"][:2]: # Top 2 names
|
150
|
+
searches.append(f"{name}")
|
151
|
+
if entities["descriptors"]:
|
152
|
+
searches.append(f"{name} {entities['descriptors'][0]}")
|
153
|
+
|
154
|
+
# 3. Temporal searches if years are important
|
155
|
+
if entities["temporal"] and len(entities["temporal"]) <= 10:
|
156
|
+
# For small year ranges, search each year with a key term
|
157
|
+
key_term = (
|
158
|
+
entities["names"][0]
|
159
|
+
if entities["names"]
|
160
|
+
else entities["descriptors"][0]
|
161
|
+
if entities["descriptors"]
|
162
|
+
else ""
|
163
|
+
)
|
164
|
+
for year in entities["temporal"][:5]: # Limit to 5 years initially
|
165
|
+
if key_term:
|
166
|
+
searches.append(f"{key_term} {year}")
|
167
|
+
|
168
|
+
# 4. Location-based searches
|
169
|
+
if entities["locations"]:
|
170
|
+
for location in entities["locations"][:2]:
|
171
|
+
searches.append(f"{location}")
|
172
|
+
if entities["descriptors"]:
|
173
|
+
searches.append(f"{location} {entities['descriptors'][0]}")
|
174
|
+
|
175
|
+
# Remove duplicates and limit to requested number
|
176
|
+
seen = set()
|
177
|
+
unique_searches = []
|
178
|
+
for s in searches:
|
179
|
+
if s.lower() not in seen:
|
180
|
+
seen.add(s.lower())
|
181
|
+
unique_searches.append(s)
|
182
|
+
|
183
|
+
return unique_searches[:num_questions]
|
184
|
+
|
185
|
+
def _generate_progressive_searches(
|
186
|
+
self,
|
187
|
+
query: str,
|
188
|
+
current_knowledge: str,
|
189
|
+
entities: Dict[str, List[str]],
|
190
|
+
questions_by_iteration: dict,
|
191
|
+
num_questions: int,
|
192
|
+
iteration: int,
|
193
|
+
) -> List[str]:
|
194
|
+
"""Generate progressively more specific searches based on findings."""
|
195
|
+
|
196
|
+
# Analyze what we've found so far
|
197
|
+
prompt = f"""Based on our search progress, generate targeted follow-up searches.
|
198
|
+
|
199
|
+
Original Query: {query}
|
200
|
+
|
201
|
+
Entities Found:
|
202
|
+
- Names/Terms: {", ".join(entities["names"][:5])}
|
203
|
+
- Years: {", ".join(entities["temporal"][:5])}
|
204
|
+
- Locations: {", ".join(entities["locations"][:3])}
|
205
|
+
- Key Features: {", ".join(entities["descriptors"][:3])}
|
206
|
+
|
207
|
+
Current Knowledge Summary:
|
208
|
+
{current_knowledge[:1500]}
|
209
|
+
|
210
|
+
Previous Searches:
|
211
|
+
{self._format_previous_searches(questions_by_iteration)}
|
212
|
+
|
213
|
+
Generate {num_questions} NEW search queries that:
|
214
|
+
1. Combine 2-3 entities we haven't tried together
|
215
|
+
2. If we found candidate names, search for them with other constraints
|
216
|
+
3. For year ranges, systematically cover years we haven't searched
|
217
|
+
4. Use quotes for exact phrases when beneficial
|
218
|
+
|
219
|
+
Focus on finding the specific answer, not general information.
|
220
|
+
|
221
|
+
Format: One search per line
|
222
|
+
"""
|
223
|
+
|
224
|
+
response = self.model.invoke(prompt)
|
225
|
+
content = (
|
226
|
+
response.content if hasattr(response, "content") else str(response)
|
227
|
+
)
|
228
|
+
|
229
|
+
# Extract searches from response
|
230
|
+
searches = []
|
231
|
+
for line in content.strip().split("\n"):
|
232
|
+
line = line.strip()
|
233
|
+
if line and not line.endswith(":") and len(line) > 5:
|
234
|
+
# Clean up common prefixes
|
235
|
+
for prefix in ["Q:", "Search:", "-", "*", "•"]:
|
236
|
+
if line.startswith(prefix):
|
237
|
+
line = line[len(prefix) :].strip()
|
238
|
+
if line:
|
239
|
+
searches.append(line)
|
240
|
+
|
241
|
+
# Ensure we have enough searches
|
242
|
+
while len(searches) < num_questions:
|
243
|
+
# Generate combinations programmatically
|
244
|
+
if iteration <= 5 and entities["temporal"]:
|
245
|
+
# Continue with year-based searches
|
246
|
+
for year in entities["temporal"]:
|
247
|
+
if not self._was_searched(year, questions_by_iteration):
|
248
|
+
base_term = (
|
249
|
+
entities["names"][0] if entities["names"] else ""
|
250
|
+
)
|
251
|
+
searches.append(f"{base_term} {year}".strip())
|
252
|
+
if len(searches) >= num_questions:
|
253
|
+
break
|
254
|
+
else:
|
255
|
+
# Combine multiple constraints
|
256
|
+
if entities["names"] and entities["descriptors"]:
|
257
|
+
for name in entities["names"]:
|
258
|
+
for desc in entities["descriptors"]:
|
259
|
+
combo = f"{name} {desc}"
|
260
|
+
if not self._was_searched(
|
261
|
+
combo, questions_by_iteration
|
262
|
+
):
|
263
|
+
searches.append(combo)
|
264
|
+
if len(searches) >= num_questions:
|
265
|
+
break
|
266
|
+
|
267
|
+
return searches[:num_questions]
|
268
|
+
|
269
|
+
def _format_previous_searches(self, questions_by_iteration: dict) -> str:
|
270
|
+
"""Format previous searches for context."""
|
271
|
+
formatted = []
|
272
|
+
for iteration, questions in questions_by_iteration.items():
|
273
|
+
if isinstance(questions, list):
|
274
|
+
formatted.extend(
|
275
|
+
[f"Iteration {iteration}: {q}" for q in questions[:3]]
|
276
|
+
)
|
277
|
+
return "\n".join(formatted[-10:]) # Last 10 searches
|
278
|
+
|
279
|
+
def _was_searched(self, term: str, questions_by_iteration: dict) -> bool:
|
280
|
+
"""Check if a term was already searched."""
|
281
|
+
term_lower = term.lower()
|
282
|
+
for questions in questions_by_iteration.values():
|
283
|
+
if isinstance(questions, list):
|
284
|
+
for q in questions:
|
285
|
+
if term_lower in q.lower():
|
286
|
+
return True
|
287
|
+
return False
|
@@ -101,7 +101,9 @@ class DecompositionQuestionGenerator(BaseQuestionGenerator):
|
|
101
101
|
if subject.lower().startswith(article):
|
102
102
|
subject = subject[len(article) :].strip()
|
103
103
|
|
104
|
-
logger.info(
|
104
|
+
logger.info(
|
105
|
+
f"Original query: '{query}', Extracted subject: '{subject}'"
|
106
|
+
)
|
105
107
|
|
106
108
|
# Create a prompt to decompose the query into sub-questions
|
107
109
|
prompt = f"""Decompose the main research topic into 3-5 specific sub-queries that can be answered independently.
|
@@ -223,7 +225,9 @@ What are the security implications of X?
|
|
223
225
|
for conjunction in conjunctions:
|
224
226
|
if conjunction in topic_text.lower():
|
225
227
|
# Take only the part before the conjunction
|
226
|
-
topic_text = topic_text.split(conjunction)[
|
228
|
+
topic_text = topic_text.split(conjunction)[
|
229
|
+
0
|
230
|
+
].strip()
|
227
231
|
logger.info(
|
228
232
|
f"Simplified prompt: Split compound query at '{conjunction}', extracted: '{topic_text}'"
|
229
233
|
)
|
@@ -288,7 +292,9 @@ Sub-questions:
|
|
288
292
|
)
|
289
293
|
return self._generate_default_questions(query)
|
290
294
|
|
291
|
-
logger.info(
|
295
|
+
logger.info(
|
296
|
+
f"Generated {len(sub_queries)} sub-questions: {sub_queries}"
|
297
|
+
)
|
292
298
|
return sub_queries[: self.max_subqueries] # Limit to max_subqueries
|
293
299
|
|
294
300
|
except Exception as e:
|
@@ -380,7 +386,10 @@ Sub-questions:
|
|
380
386
|
)
|
381
387
|
|
382
388
|
# Special case for CSRF - if we've extracted just "csrf" from a longer query
|
383
|
-
if
|
389
|
+
if (
|
390
|
+
subject.lower() == "csrf"
|
391
|
+
or subject.lower() == "cross-site request forgery"
|
392
|
+
):
|
384
393
|
# CSRF-specific questions
|
385
394
|
default_questions = [
|
386
395
|
"What is Cross-Site Request Forgery (CSRF)?",
|
@@ -0,0 +1,184 @@
|
|
1
|
+
"""
|
2
|
+
Entity-aware question generation for improved entity identification.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import logging
|
6
|
+
from datetime import datetime
|
7
|
+
from typing import List
|
8
|
+
|
9
|
+
from .base_question import BaseQuestionGenerator
|
10
|
+
|
11
|
+
logger = logging.getLogger(__name__)
|
12
|
+
|
13
|
+
|
14
|
+
class EntityAwareQuestionGenerator(BaseQuestionGenerator):
|
15
|
+
"""Question generator that creates more targeted searches for entity identification."""
|
16
|
+
|
17
|
+
def generate_questions(
    self,
    current_knowledge: str,
    query: str,
    questions_per_iteration: int = 2,
    questions_by_iteration: dict = None,
) -> List[str]:
    """
    Generate questions with entity-aware search patterns.

    If the lower-cased query contains any of the entity keywords, the
    model is prompted for direct, constraint-combining search queries
    and its ``Q:``-prefixed reply lines are returned. Otherwise the call
    is forwarded unchanged to the base class implementation.

    Only the leading ``Q:`` marker of each reply line is removed, so a
    question that itself contains "Q:" (e.g. "FAQ: ...") is preserved,
    and lines that are empty after the marker are skipped.

    Args:
        current_knowledge: Findings so far; included in the prompt when
            earlier questions exist.
        query: The original user query.
        questions_per_iteration: Maximum number of questions to return.
        questions_by_iteration: Questions asked in earlier iterations;
            a falsy value is treated as an empty dict.

    Returns:
        At most ``questions_per_iteration`` question strings.
    """
    now = datetime.now()
    current_time = now.strftime("%Y-%m-%d")
    questions_by_iteration = questions_by_iteration or {}

    logger.info("Generating entity-aware follow-up questions...")

    # Detect if this is likely an entity identification query.
    # NOTE(review): this is a substring test, so e.g. "city" also matches
    # "electricity" and "who" matches "whole" - confirm that is intended.
    entity_keywords = [
        "who",
        "what",
        "which",
        "identify",
        "name",
        "character",
        "person",
        "place",
        "organization",
        "company",
        "author",
        "scientist",
        "inventor",
        "city",
        "country",
        "book",
        "movie",
    ]

    is_entity_query = any(
        keyword in query.lower() for keyword in entity_keywords
    )

    if not is_entity_query:
        # Fall back to standard question generation for non-entity queries
        return super().generate_questions(
            current_knowledge,
            query,
            questions_per_iteration,
            questions_by_iteration,
        )

    # Use more direct entity-focused prompt
    if questions_by_iteration:
        prompt = f"""Generate {questions_per_iteration} targeted search queries to identify the specific entity in the query.

Query: {query}
Today: {current_time}
Past questions: {str(questions_by_iteration)}
Current knowledge: {current_knowledge}

Create direct search queries that combine the key identifying features to find the specific name/entity.
Focus on:
1. Combining multiple constraints in a single search
2. Using quotation marks for exact phrases
3. Including specific details that narrow down results

Format: One question per line, e.g.
Q: "fictional character" "breaks fourth wall" "TV show" 1960s 1980s
Q: character name ascetics humor television fewer than 50 episodes
"""
    else:
        prompt = f"""Generate {questions_per_iteration} direct search queries to identify the specific entity in: {query}

Today: {current_time}

Create search queries that:
1. Combine multiple identifying features
2. Target the specific entity name/identification
3. Use variations of key terms

Format: One question per line, e.g.
Q: question1
Q: question2
"""

    response = self.model.invoke(prompt)

    # Handle both string responses and responses with .content attribute
    if hasattr(response, "content"):
        response_text = response.content
    else:
        response_text = str(response)

    questions = []
    for raw_line in response_text.split("\n"):
        stripped = raw_line.strip()
        if not stripped.startswith("Q:"):
            continue
        # Remove only the leading marker; str.replace would also eat any
        # "Q:" that occurs inside the question text.
        question = stripped[len("Q:"):].strip()
        if question:
            questions.append(question)
    questions = questions[:questions_per_iteration]

    logger.info(f"Generated {len(questions)} entity-aware questions")

    return questions
|
117
|
+
|
118
|
+
def generate_sub_questions(
    self, query: str, context: str = ""
) -> List[str]:
    """Generate sub-questions with entity focus when appropriate.

    If the query contains any of the entity keywords (checked as
    case-insensitive substrings), the model is prompted for sub-questions
    aimed at identifying a specific entity. Otherwise the call is
    delegated to the parent class implementation.

    Args:
        query: The main query to break down.
        context: Extra text inserted into the prompt below the query.

    Returns:
        The parsed sub-questions with their list markers removed. An
        empty list is returned when the model call or parsing fails.
    """
    # Local import: the file-level import block is outside this block.
    import re

    # Check if this is an entity identification query
    entity_keywords = [
        "who",
        "what",
        "which",
        "identify",
        "name",
        "character",
        "person",
        "place",
        "organization",
        "company",
    ]

    # Lower-case once instead of once per keyword.
    lowered_query = query.lower()
    is_entity_query = any(
        keyword in lowered_query for keyword in entity_keywords
    )

    if not is_entity_query:
        return super().generate_sub_questions(query, context)

    prompt = f"""Break down this entity identification query into targeted sub-questions.

Original Question: {query}
{context}

Generate 2-5 sub-questions that will help identify the specific entity.
Focus on:
1. Combining constraints to narrow down results
2. Finding the actual name/identity
3. Verifying the entity matches all criteria

Format your response as:
1. First sub-question
2. Second sub-question
...

Only provide the numbered sub-questions."""

    # Matches only a *leading* list marker: "1.", "2)", "3:", "4 " or "-",
    # plus any spaces/dashes that follow it. Anchoring at the start keeps
    # periods inside the question text (e.g. "U.S.", "Inc.") intact, which
    # splitting on the first "." did not.
    list_marker = re.compile(r"^(?:\d+(?:\s*[.):]|\s+)|-)[\s\-]*")

    try:
        response = self.model.invoke(prompt)

        # Handle both string responses and responses with .content attribute
        if hasattr(response, "content"):
            content = response.content
        else:
            content = str(response)

        # Extract numbered or bulleted questions
        questions = []
        for raw_line in content.strip().split("\n"):
            line = raw_line.strip()
            if not line:
                continue
            if not (line[0].isdigit() or line.startswith("-")):
                continue

            # Remove the number/bullet and clean up
            question = list_marker.sub("", line, count=1).strip()
            if question:
                questions.append(question)

        return questions

    except Exception as e:
        # Best-effort: a failed model call yields no sub-questions
        # rather than aborting the caller.
        logger.error(f"Error generating sub-questions: {str(e)}")
        return []
|
@@ -50,7 +50,7 @@ class StandardQuestionGenerator(BaseQuestionGenerator):
|
|
50
50
|
response_text = str(response)
|
51
51
|
|
52
52
|
questions = [
|
53
|
-
q.replace("Q:", "").strip()
|
53
|
+
q.replace("Q:", "").strip().strip("\"'")
|
54
54
|
for q in response_text.split("\n")
|
55
55
|
if q.strip().startswith("Q:")
|
56
56
|
][:questions_per_iteration]
|
@@ -59,7 +59,9 @@ class StandardQuestionGenerator(BaseQuestionGenerator):
|
|
59
59
|
|
60
60
|
return questions
|
61
61
|
|
62
|
-
def generate_sub_questions(
|
62
|
+
def generate_sub_questions(
|
63
|
+
self, query: str, context: str = ""
|
64
|
+
) -> List[str]:
|
63
65
|
"""
|
64
66
|
Generate sub-questions from a main query.
|
65
67
|
|
@@ -107,7 +109,11 @@ Only provide the numbered sub-questions, nothing else."""
|
|
107
109
|
line = line.strip()
|
108
110
|
if line and (line[0].isdigit() or line.startswith("-")):
|
109
111
|
# Extract sub-question from numbered or bulleted list
|
110
|
-
parts =
|
112
|
+
parts = (
|
113
|
+
line.split(".", 1)
|
114
|
+
if "." in line
|
115
|
+
else line.split(" ", 1)
|
116
|
+
)
|
111
117
|
if len(parts) > 1:
|
112
118
|
sub_question = parts[1].strip()
|
113
119
|
sub_questions.append(sub_question)
|