hindsight-api 0.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hindsight_api/__init__.py +38 -0
- hindsight_api/api/__init__.py +105 -0
- hindsight_api/api/http.py +1872 -0
- hindsight_api/api/mcp.py +157 -0
- hindsight_api/engine/__init__.py +47 -0
- hindsight_api/engine/cross_encoder.py +97 -0
- hindsight_api/engine/db_utils.py +93 -0
- hindsight_api/engine/embeddings.py +113 -0
- hindsight_api/engine/entity_resolver.py +575 -0
- hindsight_api/engine/llm_wrapper.py +269 -0
- hindsight_api/engine/memory_engine.py +3095 -0
- hindsight_api/engine/query_analyzer.py +519 -0
- hindsight_api/engine/response_models.py +222 -0
- hindsight_api/engine/retain/__init__.py +50 -0
- hindsight_api/engine/retain/bank_utils.py +423 -0
- hindsight_api/engine/retain/chunk_storage.py +82 -0
- hindsight_api/engine/retain/deduplication.py +104 -0
- hindsight_api/engine/retain/embedding_processing.py +62 -0
- hindsight_api/engine/retain/embedding_utils.py +54 -0
- hindsight_api/engine/retain/entity_processing.py +90 -0
- hindsight_api/engine/retain/fact_extraction.py +1027 -0
- hindsight_api/engine/retain/fact_storage.py +176 -0
- hindsight_api/engine/retain/link_creation.py +121 -0
- hindsight_api/engine/retain/link_utils.py +651 -0
- hindsight_api/engine/retain/orchestrator.py +405 -0
- hindsight_api/engine/retain/types.py +206 -0
- hindsight_api/engine/search/__init__.py +15 -0
- hindsight_api/engine/search/fusion.py +122 -0
- hindsight_api/engine/search/observation_utils.py +132 -0
- hindsight_api/engine/search/reranking.py +103 -0
- hindsight_api/engine/search/retrieval.py +503 -0
- hindsight_api/engine/search/scoring.py +161 -0
- hindsight_api/engine/search/temporal_extraction.py +64 -0
- hindsight_api/engine/search/think_utils.py +255 -0
- hindsight_api/engine/search/trace.py +215 -0
- hindsight_api/engine/search/tracer.py +447 -0
- hindsight_api/engine/search/types.py +160 -0
- hindsight_api/engine/task_backend.py +223 -0
- hindsight_api/engine/utils.py +203 -0
- hindsight_api/metrics.py +227 -0
- hindsight_api/migrations.py +163 -0
- hindsight_api/models.py +309 -0
- hindsight_api/pg0.py +425 -0
- hindsight_api/web/__init__.py +12 -0
- hindsight_api/web/server.py +143 -0
- hindsight_api-0.0.13.dist-info/METADATA +41 -0
- hindsight_api-0.0.13.dist-info/RECORD +48 -0
- hindsight_api-0.0.13.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,519 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Query analysis abstraction for the memory system.
|
|
3
|
+
|
|
4
|
+
Provides an interface for analyzing natural language queries to extract
|
|
5
|
+
structured information like temporal constraints.
|
|
6
|
+
"""
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from typing import Optional
|
|
9
|
+
from datetime import datetime, timedelta
|
|
10
|
+
import logging
|
|
11
|
+
import re
|
|
12
|
+
from pydantic import BaseModel, Field
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class TemporalConstraint(BaseModel):
    """
    Temporal constraint extracted from a query.

    Represents a time range with start and end dates.
    """
    # Lower bound of the range; matching timestamps may equal it.
    start_date: datetime = Field(description="Start of the time range (inclusive)")
    # Upper bound of the range; matching timestamps may equal it.
    end_date: datetime = Field(description="End of the time range (inclusive)")

    def __str__(self) -> str:
        # Render only the calendar day of each bound; the time of day is
        # dropped from the textual form.
        day_format = '%Y-%m-%d'
        first_day = self.start_date.strftime(day_format)
        last_day = self.end_date.strftime(day_format)
        return f"{first_day} to {last_day}"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class QueryAnalysis(BaseModel):
    """
    Result of analyzing a natural language query.

    Contains extracted structured information like temporal constraints.
    """
    # Stays None when the analyzer found no temporal expression in the query.
    temporal_constraint: Optional[TemporalConstraint] = Field(
        description="Extracted temporal constraint, if any",
        default=None,
    )
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class QueryAnalyzer(ABC):
    """
    Interface that every query analyzer implements.

    A concrete analyzer takes a natural language query and returns the
    structured information it could extract from it (temporal constraints,
    entities, etc.) wrapped in a QueryAnalysis.
    """

    @abstractmethod
    def load(self) -> None:
        """
        Prepare whatever backing model or library the analyzer needs.

        Intended to be called once during initialization so that the first
        analyze() call does not pay the cold start cost.
        """

    @abstractmethod
    def analyze(self, query: str, reference_date: Optional[datetime] = None) -> QueryAnalysis:
        """
        Extract structured information from a natural language query.

        Args:
            query: Natural language query to analyze
            reference_date: Anchor for relative expressions such as
                "yesterday" (defaults to now)

        Returns:
            QueryAnalysis holding whatever could be extracted
        """
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class DateparserQueryAnalyzer(QueryAnalyzer):
    """
    Query analyzer using the dateparser library.

    Two stages are applied in order:

    1. Hand-written regex rules for period expressions ("last week",
       "a few days ago", "June 2024", ...). These produce multi-day ranges
       and cover English, Spanish, Italian, French and German phrasings.
    2. dateparser's ``search_dates`` for everything else. The first date it
       finds is widened to the full calendar day.

    Performance (original author's note, not measured here):
        - ~10-50ms per query
        - No model loading required
    """

    # Words that dateparser may report as dates although they are usually
    # ordinary words in a query. A hit whose whole text is one of these is
    # discarded.
    _FALSE_POSITIVES = frozenset({
        'do', 'may', 'march', 'will', 'can',
        'sat', 'sun', 'mon', 'tue', 'wed', 'thu', 'fri',
    })

    # Imprecise "N ago" expressions, checked in this order.
    # Each entry is (regex, distance to range start, distance to range end),
    # both distances measured backwards from the reference date.
    _VAGUE_PERIODS = (
        # "a couple of days" ~ 2 days -> 1-3 days ago
        (r'\b(a\s+)?couple\s+(of\s+)?days?\s+ago\b', timedelta(days=3), timedelta(days=1)),
        # "a few days" ~ 3-4 days -> 2-5 days ago
        (r'\b(a\s+)?few\s+days?\s+ago\b', timedelta(days=5), timedelta(days=2)),
        # "a couple of weeks" ~ 2 weeks -> 1-3 weeks ago
        (r'\b(a\s+)?couple\s+(of\s+)?weeks?\s+ago\b', timedelta(weeks=3), timedelta(weeks=1)),
        # "a few weeks" ~ 3-4 weeks -> 2-5 weeks ago
        (r'\b(a\s+)?few\s+weeks?\s+ago\b', timedelta(weeks=5), timedelta(weeks=2)),
        # "a couple of months" ~ 2 months -> 1-3 months ago (30-day months)
        (r'\b(a\s+)?couple\s+(of\s+)?months?\s+ago\b', timedelta(days=90), timedelta(days=30)),
        # "a few months" ~ 3-4 months -> 2-5 months ago (30-day months)
        (r'\b(a\s+)?few\s+months?\s+ago\b', timedelta(days=150), timedelta(days=60)),
    )

    # Month-name alternations (English, Spanish, Italian, French, German)
    # mapped to the month number.
    _MONTH_PATTERNS = {
        'january|enero|gennaio|janvier|januar': 1,
        'february|febrero|febbraio|f[ée]vrier|februar': 2,
        'march|marzo|mars|m[äa]rz': 3,
        'april|abril|aprile|avril': 4,
        'may|mayo|maggio|mai': 5,
        'june|junio|giugno|juin|juni': 6,
        'july|julio|luglio|juillet|juli': 7,
        'august|agosto|ao[uû]t': 8,
        'september|septiembre|settembre|septembre': 9,
        'october|octubre|ottobre|octobre|oktober': 10,
        'november|noviembre|novembre': 11,
        'december|diciembre|dicembre|d[ée]cembre|dezember': 12,
    }

    def __init__(self):
        """Initialize dateparser query analyzer."""
        # Filled in by load(); holds dateparser.search.search_dates.
        self._search_dates = None

    def load(self) -> None:
        """Import dateparser on first use and cache its ``search_dates`` function."""
        if self._search_dates is None:
            from dateparser.search import search_dates
            self._search_dates = search_dates

    def analyze(
        self, query: str, reference_date: Optional[datetime] = None
    ) -> QueryAnalysis:
        """
        Analyze query using period rules first, then dateparser.

        Args:
            query: Natural language query
            reference_date: Reference date for relative terms (defaults to now)

        Returns:
            QueryAnalysis with temporal_constraint set when a temporal
            expression was found, otherwise with temporal_constraint=None.
        """
        self.load()

        if reference_date is None:
            reference_date = datetime.now()

        # Period expressions describe ranges, which dateparser would collapse
        # to a single date, so they are handled before dateparser runs.
        period_result = self._extract_period(query.lower(), reference_date)
        if period_result is not None:
            return QueryAnalysis(temporal_constraint=period_result)

        settings = {
            'RELATIVE_BASE': reference_date,
            'PREFER_DATES_FROM': 'past',
            'RETURN_AS_TIMEZONE_AWARE': False,
        }
        results = self._search_dates(query, settings=settings)
        if not results:
            return QueryAnalysis(temporal_constraint=None)

        # Drop hits whose matched text is just an ambiguous common word.
        # The previous condition also accepted any text longer than three
        # characters, which let the listed 'march' and 'will' through.
        valid_results = [
            (text, date) for text, date in results
            if text.strip().lower() not in self._FALSE_POSITIVES
        ]
        if not valid_results:
            return QueryAnalysis(temporal_constraint=None)

        # Only the first surviving date is used; it is widened to its whole day.
        _, parsed_date = valid_results[0]
        start_date = parsed_date.replace(hour=0, minute=0, second=0, microsecond=0)
        end_date = parsed_date.replace(hour=23, minute=59, second=59, microsecond=999999)

        return QueryAnalysis(
            temporal_constraint=TemporalConstraint(
                start_date=start_date,
                end_date=end_date
            )
        )

    def _extract_period(
        self, query: str, reference_date: datetime
    ) -> Optional[TemporalConstraint]:
        """
        Extract period-based temporal expressions (day, week, month, year, weekend).

        Rules are tried in a fixed order and the first match wins.

        Args:
            query: Query text (analyze() passes it lower-cased; matching is
                case-insensitive regardless)
            reference_date: Anchor for relative expressions

        Returns:
            TemporalConstraint spanning whole days, or None when no rule matches.
        """
        def constraint(start: datetime, end: datetime) -> TemporalConstraint:
            # Stretch the range from the first instant of `start`'s day to the
            # last representable instant of `end`'s day.
            return TemporalConstraint(
                start_date=start.replace(hour=0, minute=0, second=0, microsecond=0),
                end_date=end.replace(hour=23, minute=59, second=59, microsecond=999999)
            )

        # Yesterday patterns (English, Spanish, Italian, French, German)
        if re.search(r'\b(yesterday|ayer|ieri|hier|gestern)\b', query, re.IGNORECASE):
            d = reference_date - timedelta(days=1)
            return constraint(d, d)

        # Today patterns
        if re.search(r'\b(today|hoy|oggi|aujourd\'?hui|heute)\b', query, re.IGNORECASE):
            return constraint(reference_date, reference_date)

        # "a couple of ... ago" / "a few ... ago": imprecise, so a range is produced.
        for pattern, far, near in self._VAGUE_PERIODS:
            if re.search(pattern, query, re.IGNORECASE):
                return constraint(reference_date - far, reference_date - near)

        # Last week: Monday through Sunday of the week before the reference date.
        if re.search(r'\b(last\s+week|la\s+semana\s+pasada|la\s+settimana\s+scorsa|la\s+semaine\s+derni[eè]re|letzte\s+woche)\b', query, re.IGNORECASE):
            start = reference_date - timedelta(days=reference_date.weekday() + 7)
            return constraint(start, start + timedelta(days=6))

        # Last month: step back from the 1st of the current month to land in
        # the previous month, then snap to its first day.
        if re.search(r'\b(last\s+month|el\s+mes\s+pasado|il\s+mese\s+scorso|le\s+mois\s+dernier|letzten?\s+monat)\b', query, re.IGNORECASE):
            first = reference_date.replace(day=1)
            end = first - timedelta(days=1)
            start = end.replace(day=1)
            return constraint(start, end)

        # Last year: the full previous calendar year.
        if re.search(r'\b(last\s+year|el\s+a[ñn]o\s+pasado|l\'anno\s+scorso|l\'ann[ée]e\s+derni[eè]re|letztes?\s+jahr)\b', query, re.IGNORECASE):
            year = reference_date.year - 1
            return constraint(datetime(year, 1, 1), datetime(year, 12, 31))

        # Last weekend: the most recent Saturday strictly before the reference
        # date, plus the Sunday after it.
        if re.search(r'\b(last\s+weekend|el\s+fin\s+de\s+semana\s+pasado|lo\s+scorso\s+fine\s+settimana|le\s+week-?end\s+dernier|letztes?\s+wochenende)\b', query, re.IGNORECASE):
            days_since_sat = (reference_date.weekday() + 2) % 7
            if days_since_sat == 0:
                days_since_sat = 7
            sat = reference_date - timedelta(days=days_since_sat)
            return constraint(sat, sat + timedelta(days=1))

        # Month + Year patterns (e.g., "June 2024", "junio 2024", "giugno 2024")
        for pattern, month_num in self._MONTH_PATTERNS.items():
            match = re.search(rf'\b({pattern})\s+(\d{{4}})\b', query, re.IGNORECASE)
            if not match:
                continue
            year = int(match.group(2))
            if year < datetime.min.year:
                # "0000" satisfies \d{4} but datetime() rejects it with
                # ValueError; treat it as "no match" instead of crashing.
                continue
            start = datetime(year, month_num, 1)
            if month_num == 12:
                end = datetime(year, 12, 31)
            else:
                # Last day of the month = day before the 1st of the next month.
                end = datetime(year, month_num + 1, 1) - timedelta(days=1)
            return constraint(start, end)

        return None
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
class TransformerQueryAnalyzer(QueryAnalyzer):
    """
    Query analyzer using T5-based generative models.

    English rule-based patterns are tried first. Only when none of them
    matches is the T5 model prompted to turn the query into a
    "YYYY-MM-DD to YYYY-MM-DD" range.

    Performance (original author's note, not measured here):
        - ~30-80ms on CPU, ~5-15ms on GPU
        - Model size: ~80M params (~300MB download)
    """

    def __init__(
        self,
        model_name: str = "google/flan-t5-small",
        device: str = "cpu"
    ):
        """
        Initialize T5 query analyzer.

        Nothing is downloaded or loaded here; see load().

        Args:
            model_name: Name of the HuggingFace T5 model to use.
                       Default: google/flan-t5-small (~80M params, ~300MB download)
                       Alternative: google/flan-t5-base (~1GB, more accurate)
            device: Device to run model on ("cpu" or "cuda")
        """
        self.model_name = model_name
        self.device = device
        self._model = None
        self._tokenizer = None

    def load(self) -> None:
        """
        Load the T5 model and tokenizer for temporal extraction.

        Does nothing when the model is already loaded.

        Raises:
            ImportError: if the transformers package is not installed.
        """
        if self._model is not None:
            return

        try:
            from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
        except ImportError as exc:
            raise ImportError(
                "transformers is required for TransformerQueryAnalyzer. "
                "Install it with: pip install transformers"
            ) from exc

        logger.info("Loading query analyzer model: %s...", self.model_name)
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
        model.to(self.device)
        model.eval()
        # Publish only after every step succeeded. Otherwise a failed
        # .to(device) would leave self._model set and later load() calls
        # would return early with a half-initialised model.
        self._tokenizer = tokenizer
        self._model = model
        logger.info("Query analyzer model loaded")

    def _load_model(self):
        """Lazy load the T5 model for temporal extraction (calls load())."""
        self.load()

    @staticmethod
    def _last_weekday(reference_date: datetime, weekday: int) -> datetime:
        """Return the latest date strictly before reference_date falling on weekday (Monday=0)."""
        # A result of 0 means "today is that weekday", so go back a full week.
        days_ago = (reference_date.weekday() - weekday) % 7 or 7
        return reference_date - timedelta(days=days_ago)

    def _extract_with_rules(
        self, query: str, reference_date: datetime
    ) -> Optional[TemporalConstraint]:
        """
        Extract temporal expressions using rule-based English patterns.

        Rules are tried in a fixed order and the first match wins.

        Args:
            query: Natural language query
            reference_date: Anchor for relative expressions

        Returns:
            TemporalConstraint spanning whole days, or None when no rule
            matches (the caller then falls back to the model).
        """
        query_lower = query.lower()

        def constraint(start: datetime, end: datetime) -> TemporalConstraint:
            # Stretch the range from the first instant of `start`'s day to the
            # last representable instant of `end`'s day.
            return TemporalConstraint(
                start_date=start.replace(hour=0, minute=0, second=0, microsecond=0),
                end_date=end.replace(hour=23, minute=59, second=59, microsecond=999999)
            )

        # Yesterday
        if re.search(r'\byesterday\b', query_lower):
            d = reference_date - timedelta(days=1)
            return constraint(d, d)

        # Last week: Monday through Sunday of the week before the reference date.
        if re.search(r'\blast\s+week\b', query_lower):
            start = reference_date - timedelta(days=reference_date.weekday() + 7)
            return constraint(start, start + timedelta(days=6))

        # Last month: step back from the 1st of the current month to land in
        # the previous month, then snap to its first day.
        if re.search(r'\blast\s+month\b', query_lower):
            first = reference_date.replace(day=1)
            end = first - timedelta(days=1)
            start = end.replace(day=1)
            return constraint(start, end)

        # Last year: the full previous calendar year.
        if re.search(r'\blast\s+year\b', query_lower):
            y = reference_date.year - 1
            return constraint(datetime(y, 1, 1), datetime(y, 12, 31))

        # Last weekend: most recent Saturday before the reference date plus
        # the following Sunday.
        if re.search(r'\blast\s+weekend\b', query_lower):
            sat = self._last_weekday(reference_date, 5)
            return constraint(sat, sat + timedelta(days=1))

        # Last <weekday>
        weekdays = {'monday': 0, 'tuesday': 1, 'wednesday': 2, 'thursday': 3,
                    'friday': 4, 'saturday': 5, 'sunday': 6}
        for name, num in weekdays.items():
            if re.search(rf'\blast\s+{name}\b', query_lower):
                d = self._last_weekday(reference_date, num)
                return constraint(d, d)

        # Month + Year: "June 2024", "in March 2023"
        months = {'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5,
                  'june': 6, 'july': 7, 'august': 8, 'september': 9, 'october': 10,
                  'november': 11, 'december': 12}
        for name, num in months.items():
            match = re.search(rf'\b{name}\s+(\d{{4}})\b', query_lower)
            if not match:
                continue
            year = int(match.group(1))
            if year < datetime.min.year:
                # "0000" satisfies \d{4} but datetime() rejects it with
                # ValueError; treat it as "no match" instead of crashing.
                continue
            if num == 12:
                last_day = 31
            else:
                # Last day of the month = day before the 1st of the next month.
                last_day = (datetime(year, num + 1, 1) - timedelta(days=1)).day
            return constraint(datetime(year, num, 1), datetime(year, num, last_day))

        return None

    def analyze(
        self, query: str, reference_date: Optional[datetime] = None
    ) -> QueryAnalysis:
        """
        Analyze query for temporal expressions.

        Uses rule-based extraction for common patterns and falls back to the
        T5 model when no rule matches. The model is loaded on first fallback.

        Args:
            query: Natural language query
            reference_date: Reference date for relative terms (defaults to now)

        Returns:
            QueryAnalysis with temporal_constraint if found
        """
        if reference_date is None:
            reference_date = datetime.now()

        # Try rule-based extraction first; it needs no model.
        result = self._extract_with_rules(query, reference_date)
        if result is not None:
            return QueryAnalysis(temporal_constraint=result)

        # Fall back to T5 model for unusual patterns
        self._load_model()

        # Few-shot examples are computed relative to the reference date so the
        # model sees concrete dates consistent with "today".
        day_format = '%Y-%m-%d'
        today_str = reference_date.strftime(day_format)
        yesterday_str = (reference_date - timedelta(days=1)).strftime(day_format)
        saturday_str = self._last_weekday(reference_date, 5).strftime(day_format)

        # Build prompt for T5
        prompt = (
            f'Today is {today_str}. Extract date range or "none".\n'
            '\n'
            'June 2024 = 2024-06-01 to 2024-06-30\n'
            f'yesterday = {yesterday_str} to {yesterday_str}\n'
            f'last Saturday = {saturday_str} to {saturday_str}\n'
            'what is the weather = none\n'
            f'{query} ='
        )

        # Tokenize and generate
        inputs = self._tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with self._no_grad():
            outputs = self._model.generate(
                **inputs,
                max_new_tokens=30,
                num_beams=3,
                do_sample=False,
                temperature=1.0
            )

        result = self._tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

        # Parse the generated output
        temporal = self._parse_generated_output(result, reference_date)
        return QueryAnalysis(temporal_constraint=temporal)

    def _no_grad(self):
        """Return torch.no_grad(), or a no-op context manager when torch cannot be imported."""
        try:
            import torch
            return torch.no_grad()
        except ImportError:
            from contextlib import nullcontext
            return nullcontext()

    def _parse_generated_output(
        self, result: str, reference_date: datetime
    ) -> Optional[TemporalConstraint]:
        """
        Parse T5 generated output into TemporalConstraint.

        Expected format: "YYYY-MM-DD to YYYY-MM-DD" (may be embedded in other text)

        Args:
            result: Generated text from T5
            reference_date: Accepted for interface compatibility; not used
                by the current implementation

        Returns:
            TemporalConstraint covering the whole start and end days, or None
            when the text is empty, says "none"/"null"/"no", does not contain
            the expected pattern, holds impossible dates, or has end before
            start.
        """
        if not result or result.lower().strip() in ("none", "null", "no"):
            return None

        # Parse "YYYY-MM-DD to YYYY-MM-DD"
        match = re.search(
            r'(\d{4}-\d{2}-\d{2})\s+to\s+(\d{4}-\d{2}-\d{2})', result, re.IGNORECASE
        )
        if not match:
            return None

        try:
            start_date = datetime.strptime(match.group(1), "%Y-%m-%d")
            end_date = datetime.strptime(match.group(2), "%Y-%m-%d")
        except ValueError:
            # The digits matched the shape but not a real date (e.g. month 13).
            logger.debug("Discarding unparseable model output: %r", result)
            return None

        # strptime already yields midnight for the start; push the end to the
        # last representable instant of its day.
        end_date = end_date.replace(hour=23, minute=59, second=59, microsecond=999999)

        # Validation
        if end_date < start_date:
            logger.warning(f"Invalid date range: {start_date} to {end_date}")
            return None

        return TemporalConstraint(start_date=start_date, end_date=end_date)
|