nosible 0.1.9__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nosible/nosible_client.py CHANGED
@@ -2,18 +2,17 @@ import gzip
2
2
  import json
3
3
  import logging
4
4
  import os
5
+ import re
6
+ import sys
7
+ import textwrap
5
8
  import time
6
9
  import types
7
- import typing
8
10
  from collections.abc import Iterator
9
11
  from concurrent.futures import ThreadPoolExecutor
10
- from typing import Union
12
+ from datetime import datetime
13
+ from typing import Optional, Union
11
14
 
12
- import polars as pl
13
- import requests
14
- from cryptography.fernet import Fernet
15
- from openai import OpenAI
16
- from polars import SQLContext
15
+ import httpx
17
16
  from tenacity import (
18
17
  before_sleep_log,
19
18
  retry,
@@ -29,7 +28,6 @@ from nosible.classes.search_set import SearchSet
29
28
  from nosible.classes.snippet_set import SnippetSet
30
29
  from nosible.classes.web_page import WebPageData
31
30
  from nosible.utils.json_tools import json_loads
32
- from nosible.utils.question_builder import _get_question
33
31
  from nosible.utils.rate_limiter import PLAN_RATE_LIMITS, RateLimiter, _rate_limited
34
32
 
35
33
  # Set up a module‐level logger.
@@ -53,6 +51,8 @@ class Nosible:
53
51
  Base URL for the OpenAI-compatible LLM API. (default is OpenRouter's API endpoint)
54
52
  sentiment_model : str, optional
55
53
  Model to use for sentiment analysis (default is "openai/gpt-4o").
54
+ expansions_model : str, optional
55
+ Model to use for expansions (default is "openai/gpt-4o").
56
56
  timeout : int
57
57
  Request timeout for HTTP calls.
58
58
  retries : int,
@@ -91,7 +91,8 @@ class Nosible:
91
91
  - The `nosible_api_key` is required to access the Nosible Search API.
92
92
  - The `llm_api_key` is optional and used for LLM-based query expansions.
93
93
  - The `openai_base_url` defaults to OpenRouter's API endpoint.
94
- - The `sentiment_model` is used for generating query expansions and sentiment analysis.
94
+ - The `sentiment_model` is used for sentiment analysis.
95
+ - The `expansions_model` is used for generating query expansions.
95
96
  - The `timeout`, `retries`, and `concurrency` parameters control the behavior of HTTP requests.
96
97
 
97
98
  Examples
@@ -103,10 +104,11 @@ class Nosible:
103
104
 
104
105
  def __init__(
105
106
  self,
106
- nosible_api_key: str = None,
107
- llm_api_key: str = None,
107
+ nosible_api_key: Optional[str] = None,
108
+ llm_api_key: Optional[str] = None,
108
109
  openai_base_url: str = "https://openrouter.ai/api/v1",
109
110
  sentiment_model: str = "openai/gpt-4o",
111
+ expansions_model: str = "openai/gpt-4o",
110
112
  timeout: int = 30,
111
113
  retries: int = 5,
112
114
  concurrency: int = 10,
@@ -139,6 +141,7 @@ class Nosible:
139
141
  self.llm_api_key = llm_api_key or os.getenv("LLM_API_KEY")
140
142
  self.openai_base_url = openai_base_url
141
143
  self.sentiment_model = sentiment_model
144
+ self.expansions_model = expansions_model
142
145
  # Network parameters
143
146
  self.timeout = timeout
144
147
  self.retries = retries
@@ -159,7 +162,7 @@ class Nosible:
159
162
  reraise=True,
160
163
  stop=stop_after_attempt(self.retries) | stop_after_delay(self.timeout),
161
164
  wait=wait_exponential(multiplier=1, min=1, max=10),
162
- retry=retry_if_exception_type(requests.exceptions.RequestException),
165
+ retry=retry_if_exception_type(httpx.RequestError),
163
166
  before_sleep=before_sleep_log(self.logger, logging.WARNING),
164
167
  )(self._post)
165
168
 
@@ -168,12 +171,12 @@ class Nosible:
168
171
  reraise=True,
169
172
  stop=stop_after_attempt(self.retries) | stop_after_delay(self.timeout),
170
173
  wait=wait_exponential(multiplier=1, min=1, max=10),
171
- retry=retry_if_exception_type(Exception),
174
+ retry=retry_if_exception_type(httpx.RequestError),
172
175
  before_sleep=before_sleep_log(self.logger, logging.WARNING),
173
176
  )(self._generate_expansions)
174
177
 
175
178
  # Thread pool for parallel searches
176
- self._session = requests.Session()
179
+ self._session = httpx.Client(follow_redirects=True)
177
180
  self._executor = ThreadPoolExecutor(max_workers=self.concurrency)
178
181
 
179
182
  # Headers
@@ -198,7 +201,6 @@ class Nosible:
198
201
 
199
202
  def search(
200
203
  self,
201
- *,
202
204
  search: Search = None,
203
205
  question: str = None,
204
206
  expansions: list[str] = None,
@@ -207,6 +209,9 @@ class Nosible:
207
209
  n_probes: int = 30,
208
210
  n_contextify: int = 128,
209
211
  algorithm: str = "hybrid-2",
212
+ min_similarity: float = None,
213
+ must_include: list[str] = None,
214
+ must_exclude: list[str] = None,
210
215
  autogenerate_expansions: bool = False,
211
216
  publish_start: str = None,
212
217
  publish_end: str = None,
@@ -246,6 +251,12 @@ class Nosible:
246
251
  Context window size per result.
247
252
  algorithm : str
248
253
  Search algorithm type.
254
+ min_similarity : float
255
+ Results must have at least this similarity score.
256
+ must_include : list of str
257
+ Only results mentioning these strings will be included.
258
+ must_exclude : list of str
259
+ Any result mentioning these strings will be excluded.
249
260
  autogenerate_expansions : bool
250
261
  Do you want to generate expansions automatically using a LLM?
251
262
  publish_start : str, optional
@@ -335,6 +346,9 @@ class Nosible:
335
346
  n_probes=n_probes,
336
347
  n_contextify=n_contextify,
337
348
  algorithm=algorithm,
349
+ min_similarity=min_similarity,
350
+ must_include=must_include,
351
+ must_exclude=must_exclude,
338
352
  autogenerate_expansions=autogenerate_expansions,
339
353
  publish_start=publish_start,
340
354
  publish_end=publish_end,
@@ -372,6 +386,9 @@ class Nosible:
372
386
  n_probes: int = 30,
373
387
  n_contextify: int = 128,
374
388
  algorithm: str = "hybrid-2",
389
+ min_similarity: float = None,
390
+ must_include: list[str] = None,
391
+ must_exclude: list[str] = None,
375
392
  autogenerate_expansions: bool = False,
376
393
  publish_start: str = None,
377
394
  publish_end: str = None,
@@ -408,8 +425,14 @@ class Nosible:
408
425
  Context window size for the search.
409
426
  algorithm : str
410
427
  Search algorithm to use.
428
+ min_similarity : float
429
+ Results must have at least this similarity score.
430
+ must_include : list of str
431
+ Only results mentioning these strings will be included.
432
+ must_exclude : list of str
433
+ Any result mentioning these strings will be excluded.
411
434
  autogenerate_expansions : bool
412
- Do you want to generate expansions automatically using a LLM?
435
+ Do you want to generate expansions automatically using a LLM?.
413
436
  publish_start : str, optional
414
437
  Start date for when the document was published (ISO format).
415
438
  publish_end : str, optional
@@ -509,6 +532,9 @@ class Nosible:
509
532
  n_probes=n_probes,
510
533
  n_contextify=n_contextify,
511
534
  algorithm=algorithm,
535
+ min_similarity=min_similarity,
536
+ must_include=must_include,
537
+ must_exclude=must_exclude,
512
538
  autogenerate_expansions=autogenerate_expansions,
513
539
  publish_start=publish_start,
514
540
  publish_end=publish_end,
@@ -532,7 +558,7 @@ class Nosible:
532
558
  yield future.result()
533
559
  except Exception as e:
534
560
  self.logger.warning(f"Search failed: {e!r}")
535
- yield None
561
+ raise
536
562
 
537
563
  return _run_generator()
538
564
 
@@ -555,6 +581,8 @@ class Nosible:
555
581
  ------
556
582
  ValueError
557
583
  If `n_results` > 100.
584
+ ValueError
585
+ If min_similarity is not [0,1].
558
586
 
559
587
  Examples
560
588
  --------
@@ -577,6 +605,9 @@ class Nosible:
577
605
  n_probes = search_obj.n_probes if search_obj.n_probes is not None else 30
578
606
  n_contextify = search_obj.n_contextify if search_obj.n_contextify is not None else 128
579
607
  algorithm = search_obj.algorithm if search_obj.algorithm is not None else "hybrid-2"
608
+ min_similarity = search_obj.min_similarity if search_obj.min_similarity is not None else 0
609
+ must_include = search_obj.must_include if search_obj.must_include is not None else []
610
+ must_exclude = search_obj.must_exclude if search_obj.must_exclude is not None else []
580
611
  autogenerate_expansions = (
581
612
  search_obj.autogenerate_expansions if search_obj.autogenerate_expansions is not None else False
582
613
  )
@@ -600,6 +631,9 @@ class Nosible:
600
631
  search_obj.exclude_companies if search_obj.exclude_companies is not None else self.exclude_companies
601
632
  )
602
633
 
634
+ if not (0.0 <= min_similarity <= 1.0):
635
+ raise ValueError(f"Invalid min_similarity: {min_similarity}. Must be [0,1].")
636
+
603
637
  # Generate expansions if not provided
604
638
  if expansions is None:
605
639
  expansions = []
@@ -636,6 +670,9 @@ class Nosible:
636
670
  "n_probes": n_probes,
637
671
  "n_contextify": n_contextify,
638
672
  "algorithm": algorithm,
673
+ "min_similarity": min_similarity,
674
+ "must_include": must_include,
675
+ "must_exclude": must_exclude,
639
676
  }
640
677
 
641
678
  resp = self._post(url="https://www.nosible.ai/search/v1/fast-search", payload=payload)
@@ -696,6 +733,9 @@ class Nosible:
696
733
  n_probes: int = 30,
697
734
  n_contextify: int = 128,
698
735
  algorithm: str = "hybrid-2",
736
+ min_similarity: float = None,
737
+ must_include: list[str] = None,
738
+ must_exclude: list[str] = None,
699
739
  autogenerate_expansions: bool = False,
700
740
  publish_start: str = None,
701
741
  publish_end: str = None,
@@ -733,6 +773,12 @@ class Nosible:
733
773
  Context window size per result.
734
774
  algorithm : str
735
775
  Search algorithm identifier.
776
+ min_similarity : float
777
+ Results must have at least this similarity score.
778
+ must_include : list of str
779
+ Only results mentioning these strings will be included.
780
+ must_exclude : list of str
781
+ Any result mentioning these strings will be excluded.
736
782
  autogenerate_expansions : bool
737
783
  Do you want to generate expansions automatically using a LLM?
738
784
  publish_start : str, optional
@@ -779,6 +825,8 @@ class Nosible:
779
825
  If neither question nor search are specified.
780
826
  RuntimeError
781
827
  If the response fails in any way.
828
+ ValueError
829
+ If min_similarity is not [0,1].
782
830
 
783
831
  Notes
784
832
  -----
@@ -824,6 +872,8 @@ class Nosible:
824
872
  ...
825
873
  ValueError: Bulk search cannot have more than 10000 results per query.
826
874
  """
875
+ from cryptography.fernet import Fernet
876
+
827
877
  previous_level = self.logger.level
828
878
  if verbose:
829
879
  self.logger.setLevel(logging.INFO)
@@ -843,6 +893,12 @@ class Nosible:
843
893
  n_probes = search.n_probes if search.n_probes is not None else n_probes
844
894
  n_contextify = search.n_contextify if search.n_contextify is not None else n_contextify
845
895
  algorithm = search.algorithm if search.algorithm is not None else algorithm
896
+ min_similarity = search.min_similarity if search.min_similarity is not None else min_similarity
897
+ min_similarity = min_similarity if min_similarity is not None else 0
898
+ must_include = search.must_include if search.must_include is not None else must_include
899
+ must_include = must_include if must_include is not None else []
900
+ must_exclude = search.must_exclude if search.must_exclude is not None else must_exclude
901
+ must_exclude = must_exclude if must_exclude is not None else []
846
902
  autogenerate_expansions = (
847
903
  search.autogenerate_expansions
848
904
  if search.autogenerate_expansions is not None
@@ -868,6 +924,13 @@ class Nosible:
868
924
  if autogenerate_expansions is True:
869
925
  expansions = self._generate_expansions(question=question)
870
926
 
927
+ must_include = must_include if must_include is not None else []
928
+ must_exclude = must_exclude if must_exclude is not None else []
929
+ min_similarity = min_similarity if min_similarity is not None else 0
930
+
931
+ if not (0.0 <= min_similarity <= 1.0):
932
+ raise ValueError(f"Invalid min_similarity: {min_similarity}. Must be [0,1].")
933
+
871
934
  # Generate sql_filter if unset
872
935
  if sql_filter is None:
873
936
  sql_filter = self._format_sql(
@@ -912,11 +975,14 @@ class Nosible:
912
975
  "n_probes": n_probes,
913
976
  "n_contextify": n_contextify,
914
977
  "algorithm": algorithm,
978
+ "min_similarity": min_similarity,
979
+ "must_include": must_include,
980
+ "must_exclude": must_exclude,
915
981
  }
916
982
  resp = self._post(url="https://www.nosible.ai/search/v1/slow-search", payload=payload)
917
983
  try:
918
984
  resp.raise_for_status()
919
- except requests.HTTPError as e:
985
+ except httpx.HTTPStatusError as e:
920
986
  raise ValueError(f"[{question!r}] HTTP {resp.status_code}: {resp.text}") from e
921
987
 
922
988
  data = resp.json()
@@ -928,7 +994,7 @@ class Nosible:
928
994
  decrypt_using = data.get("decrypt_using")
929
995
  for _ in range(100):
930
996
  dl = self._session.get(download_from, timeout=self.timeout)
931
- if dl.ok:
997
+ if dl.status_code == 200:
932
998
  fernet = Fernet(decrypt_using.encode())
933
999
  decrypted = fernet.decrypt(dl.content)
934
1000
  decompressed = gzip.decompress(decrypted)
@@ -944,6 +1010,112 @@ class Nosible:
944
1010
  if verbose:
945
1011
  self.logger.setLevel(previous_level)
946
1012
 
1013
+ def answer(
1014
+ self,
1015
+ query: str,
1016
+ n_results: int = 100,
1017
+ min_similarity: float = 0.65,
1018
+ model: Union[str, None] = "google/gemini-2.0-flash-001",
1019
+ show_context: bool = True,
1020
+ ) -> str:
1021
+ """
1022
+ RAG-style question answering: retrieve top `n_results` via `.search()`
1023
+ then answer `query` using those documents as context.
1024
+
1025
+ Parameters
1026
+ ----------
1027
+ query : str
1028
+ The user’s natural-language question.
1029
+ n_results : int
1030
+ How many docs to fetch to build the context.
1031
+ min_similarity : float
1032
+ Results must have at least this similarity score.
1033
+ model : str, optional
1034
+ Which LLM to call to answer your question.
1035
+ show_context : bool, optional
1036
+ Do you want the context to be shown?
1037
+
1038
+ Returns
1039
+ -------
1040
+ str
1041
+ The LLM’s generated answer, grounded in the retrieved docs.
1042
+
1043
+ Raises
1044
+ ------
1045
+ ValueError
1046
+ If no API key is configured for the LLM client.
1047
+ RuntimeError
1048
+ If the LLM call fails or returns an invalid response.
1049
+
1050
+ Examples
1051
+ --------
1052
+ >>> from nosible import Nosible
1053
+ >>> with Nosible() as nos:
1054
+ ... ans = nos.answer(
1055
+ ... query="How is research governance and decision-making structured between Google and DeepMind?",
1056
+ ... n_results=100,
1057
+ ... show_context=True,
1058
+ ... ) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
1059
+ <BLANKLINE>
1060
+ Doc 1
1061
+ Title: ...
1062
+ >>> print(ans) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
1063
+ Answer:
1064
+ ...
1065
+ """
1066
+
1067
+ if not self.llm_api_key:
1068
+ raise ValueError("An LLM API key is required for answer().")
1069
+
1070
+ # Retrieve top documents
1071
+ results = self.search(question=query, n_results=n_results, min_similarity=min_similarity)
1072
+
1073
+ # Build RAG context
1074
+ context = ""
1075
+ pieces: list[str] = []
1076
+ for idx, result in enumerate(results):
1077
+ pieces.append(f"""
1078
+ Doc {idx + 1}
1079
+ Title: {result.title}
1080
+ Similarity Score: {result.similarity * 100:.2f}%
1081
+ URL: {result.url}
1082
+ Content: {result.content}
1083
+ """)
1084
+ context = "\n".join(pieces)
1085
+
1086
+ if show_context:
1087
+ print(textwrap.dedent(context))
1088
+
1089
+ # Craft prompt
1090
+ prompt = f"""
1091
+ # TASK DESCRIPTION
1092
+
1093
+ You are a helpful assistant. Use the following context to answer the question.
1094
+ When you use information from a chunk, cite it by referencing its label in square brackets, e.g. [doc3].
1095
+
1096
+ ## Question
1097
+ {query}
1098
+
1099
+ ## Context
1100
+ {context}
1101
+ """
1102
+ from openai import OpenAI
1103
+
1104
+ # Call LLM
1105
+ client = OpenAI(base_url=self.openai_base_url, api_key=self.llm_api_key)
1106
+ try:
1107
+ response = client.chat.completions.create(model=model, messages=[{"role": "user", "content": prompt}])
1108
+ except Exception as e:
1109
+ raise RuntimeError(f"LLM API error: {e}") from e
1110
+
1111
+ # Validate response shape
1112
+ choices = getattr(response, "choices", None)
1113
+ if not choices or not hasattr(choices[0], "message"):
1114
+ raise RuntimeError(f"Invalid LLM response format: {response!r}")
1115
+
1116
+ # Return the generated text
1117
+ return "Answer:\n" + response.choices[0].message.content.strip()
1118
+
947
1119
  @_rate_limited("visit")
948
1120
  def visit(self, html: str = "", recrawl: bool = False, render: bool = False, url: str = None) -> WebPageData:
949
1121
  """
@@ -1029,6 +1201,75 @@ class Nosible:
1029
1201
  url_tree=response_data.get("url_tree"),
1030
1202
  )
1031
1203
 
1204
+ @_rate_limited("fast")
1205
+ def trend(
1206
+ self,
1207
+ query: str,
1208
+ start_date: Optional[str] = None,
1209
+ end_date: Optional[str] = None,
1210
+ sql_filter: Optional[str] = None,
1211
+ ) -> dict:
1212
+ """
1213
+ Extract a trend showing the volume of news surrounding your query.
1214
+
1215
+ Parameters
1216
+ ----------
1217
+ query : str
1218
+ The search term we would like to see a trend for.
1219
+ start_date : str, optional
1220
+ ISO‐format start date (YYYY-MM-DD) of the trend window.
1221
+ end_date : str, optional
1222
+ ISO‐format end date (YYYY-MM-DD) of the trend window.
1223
+ sql_filter : str, optional
1224
+ An optional SQL filter to narrow down the trend query.
1225
+
1226
+ Returns
1227
+ -------
1228
+ dict
1229
+ The JSON-decoded trend data returned by the server.
1230
+
1231
+ Examples
1232
+ --------
1233
+ >>> from nosible import Nosible
1234
+ >>> with Nosible() as nos:
1235
+ ... trends_data = nos.trend("Christmas Shopping", start_date="2005-01-01", end_date="2020-12-31")
1236
+ ... print(trends_data) # doctest: +ELLIPSIS
1237
+ {'2005-01-31': ...'2020-12-31': ...}
1238
+ """
1239
+ # Validate dates
1240
+ if start_date is not None:
1241
+ self._validate_date_format(start_date, "start_date")
1242
+ if end_date is not None:
1243
+ self._validate_date_format(end_date, "end_date")
1244
+
1245
+ payload: dict[str, str] = {"query": query}
1246
+
1247
+ if sql_filter is not None:
1248
+ payload["sql_filter"] = sql_filter
1249
+ else:
1250
+ payload["sql_filter"] = "SELECT loc, published FROM engine"
1251
+
1252
+ # Send the POST to the /trend endpoint
1253
+ response = self._post(url="https://www.nosible.ai/search/v1/trend", payload=payload)
1254
+ # Will raise ValueError on rate-limit or auth errors
1255
+ response.raise_for_status()
1256
+ payload = response.json().get("response", {})
1257
+
1258
+ # if no window requested, return everything
1259
+ if start_date is None and end_date is None:
1260
+ return payload
1261
+
1262
+ # Filter by ISO‐date keys
1263
+ filtered: dict[str, float] = {}
1264
+ for date_str, value in payload.items():
1265
+ if start_date and date_str < start_date:
1266
+ continue
1267
+ if end_date and date_str > end_date:
1268
+ continue
1269
+ filtered[date_str] = value
1270
+
1271
+ return filtered
1272
+
1032
1273
  def version(self) -> str:
1033
1274
  """
1034
1275
  Retrieve the current version information for the Nosible API.
@@ -1107,7 +1348,9 @@ class Nosible:
1107
1348
  return False
1108
1349
  if msg == "The URL could not be retrieved.":
1109
1350
  return False
1110
- except requests.HTTPError:
1351
+ # If we reach here, the response is unexpected
1352
+ return False
1353
+ except httpx.HTTPError:
1111
1354
  return False
1112
1355
  except:
1113
1356
  return False
@@ -1202,7 +1445,7 @@ class Nosible:
1202
1445
  out = [
1203
1446
  "Below are the rate limits for all NOSIBLE plans.",
1204
1447
  "To upgrade your package, visit https://www.nosible.ai/products.\n",
1205
- "Unless otherwise indicated, bulk searches are limited to one-at-a-time per API key.\n"
1448
+ "Unless otherwise indicated, bulk searches are limited to one-at-a-time per API key.\n",
1206
1449
  ]
1207
1450
 
1208
1451
  user_plan = self._get_user_plan()
@@ -1263,7 +1506,7 @@ class Nosible:
1263
1506
  except Exception:
1264
1507
  pass
1265
1508
 
1266
- def _post(self, url: str, payload: dict, headers: dict = None, timeout: int = None) -> requests.Response:
1509
+ def _post(self, url: str, payload: dict, headers: dict = None, timeout: int = None) -> httpx.Response:
1267
1510
  """
1268
1511
  Internal helper to send a POST request with retry logic.
1269
1512
 
@@ -1295,7 +1538,7 @@ class Nosible:
1295
1538
 
1296
1539
  Returns
1297
1540
  -------
1298
- requests.Response
1541
+ httpx.Response
1299
1542
  The HTTP response object.
1300
1543
  """
1301
1544
  response = self._session.post(
@@ -1303,16 +1546,19 @@ class Nosible:
1303
1546
  json=payload,
1304
1547
  headers=headers if headers is not None else self.headers,
1305
1548
  timeout=timeout if timeout is not None else self.timeout,
1549
+ follow_redirects=True,
1306
1550
  )
1307
1551
 
1308
1552
  # If unauthorized, or if the payload is string too short, treat as invalid API key
1309
1553
  if response.status_code == 401:
1310
1554
  raise ValueError("Your API key is not valid.")
1311
1555
  if response.status_code == 422:
1312
- # Only inspect JSON if it’s a JSON response
1313
1556
  content_type = response.headers.get("Content-Type", "")
1314
1557
  if content_type.startswith("application/json"):
1315
1558
  body = response.json()
1559
+ if isinstance(body, list):
1560
+ body = body[0]
1561
+ print(body)
1316
1562
  if body.get("type") == "string_too_short":
1317
1563
  raise ValueError("Your API key is not valid: Too Short.")
1318
1564
  else:
@@ -1450,12 +1696,14 @@ class Nosible:
1450
1696
  - Contextual Example: Swap "diabetes treatment" with "insulin therapy" or "blood sugar management".
1451
1697
 
1452
1698
  """.replace(" ", "")
1699
+ # Lazy load
1700
+ from openai import OpenAI
1453
1701
 
1454
1702
  client = OpenAI(base_url=self.openai_base_url, api_key=self.llm_api_key)
1455
1703
 
1456
1704
  # Call the chat completions endpoint.
1457
1705
  resp = client.chat.completions.create(
1458
- model=self.sentiment_model, messages=[{"role": "user", "content": prompt.strip()}], temperature=0.7
1706
+ model=self.expansions_model, messages=[{"role": "user", "content": prompt.strip()}], temperature=0.7
1459
1707
  )
1460
1708
 
1461
1709
  raw = resp.choices[0].message.content
@@ -1481,6 +1729,51 @@ class Nosible:
1481
1729
  self.logger.debug(f"Successful expansions: {expansions}")
1482
1730
  return expansions
1483
1731
 
1732
+ @staticmethod
1733
+ def _validate_date_format(string: str, name: str):
1734
+ """
1735
+ Check that a date string is valid ISO format (YYYY-MM-DD or full ISO timestamp).
1736
+
1737
+ Parameters
1738
+ ----------
1739
+ string : str
1740
+ The date string to validate.
1741
+ name : str
1742
+ The name of the parameter being validated, used in the error message.
1743
+
1744
+ Raises
1745
+ ------
1746
+ ValueError
1747
+ If `string` is not a valid ISO 8601 date. Error message will include
1748
+ the `name` and the offending string.
1749
+ Examples
1750
+ --------
1751
+ >>> # valid date-only format
1752
+ >>> Nosible._validate_date_format("2023-12-31", "publish_start")
1753
+ >>> # valid full timestamp
1754
+ >>> Nosible._validate_date_format("2023-12-31T15:30:00", "visited_end")
1755
+ >>> # invalid month
1756
+ >>> Nosible._validate_date_format("2023-13-01", "publish_end")
1757
+ Traceback (most recent call last):
1758
+ ...
1759
+ ValueError: Invalid date for 'publish_end': '2023-13-01'. Expected ISO format 'YYYY-MM-DD'.
1760
+ >>> # wrong separator
1761
+ >>> Nosible._validate_date_format("2023/12/31", "visited_start")
1762
+ Traceback (most recent call last):
1763
+ ...
1764
+ ValueError: Invalid date for 'visited_start': '2023/12/31'. Expected ISO format 'YYYY-MM-DD'.
1765
+ """
1766
+ dateregex = r"^\d{4}-\d{2}-\d{2}"
1767
+
1768
+ if not re.match(dateregex, string):
1769
+ raise ValueError(f"Invalid date for '{name}': {string!r}. Expected ISO format 'YYYY-MM-DD'.")
1770
+
1771
+ try:
1772
+ # datetime.fromisoformat accepts both YYYY-MM-DD and full timestamps
1773
+ parsed = datetime.fromisoformat(string)
1774
+ except Exception:
1775
+ raise ValueError(f"Invalid date for '{name}': {string!r}. Expected ISO format 'YYYY-MM-DD'.")
1776
+
1484
1777
  def _format_sql(
1485
1778
  self,
1486
1779
  publish_start: str = None,
@@ -1539,8 +1832,17 @@ class Nosible:
1539
1832
  ValueError
1540
1833
  If more than 50 items in a filter are given.
1541
1834
  """
1835
+ for name, value in [
1836
+ ("publish_start", publish_start),
1837
+ ("publish_end", publish_end),
1838
+ ("visited_start", visited_start),
1839
+ ("visited_end", visited_end),
1840
+ ]:
1841
+ if value is not None:
1842
+ self._validate_date_format(string=value, name=name)
1843
+
1542
1844
  # Validate list lengths
1543
- for name, lst in [
1845
+ for name, value in [
1544
1846
  ("include_netlocs", include_netlocs),
1545
1847
  ("exclude_netlocs", exclude_netlocs),
1546
1848
  ("include_languages", include_languages),
@@ -1550,8 +1852,8 @@ class Nosible:
1550
1852
  ("include_docs", include_docs),
1551
1853
  ("exclude_docs", exclude_docs),
1552
1854
  ]:
1553
- if lst is not None and len(lst) > 50:
1554
- raise ValueError(f"Too many items for '{name}' filter ({len(lst)}); maximum allowed is 50.")
1855
+ if value is not None and len(value) > 50:
1856
+ raise ValueError(f"Too many items for '{name}' filter ({len(value)}); maximum allowed is 50.")
1555
1857
 
1556
1858
  sql = ["SELECT loc FROM engine"]
1557
1859
  clauses: list[str] = []
@@ -1683,9 +1985,11 @@ class Nosible:
1683
1985
  "company_3",
1684
1986
  "doc_hash",
1685
1987
  ]
1988
+ import polars as pl # Lazy import
1989
+
1686
1990
  # Create a dummy DataFrame with correct columns and no rows
1687
1991
  df = pl.DataFrame({col: [] for col in columns})
1688
- ctx = SQLContext()
1992
+ ctx = pl.SQLContext()
1689
1993
  ctx.register("engine", df)
1690
1994
  try:
1691
1995
  ctx.execute(sql)
@@ -1706,21 +2010,21 @@ class Nosible:
1706
2010
 
1707
2011
  def __exit__(
1708
2012
  self,
1709
- _exc_type: typing.Optional[type[BaseException]],
1710
- _exc_val: typing.Optional[BaseException],
1711
- _exc_tb: typing.Optional[types.TracebackType],
1712
- ) -> typing.Optional[bool]:
2013
+ _exc_type: Optional[type[BaseException]],
2014
+ _exc_val: Optional[BaseException],
2015
+ _exc_tb: Optional[types.TracebackType],
2016
+ ) -> Optional[bool]:
1713
2017
  """
1714
2018
  Always clean up (self.close()), but let exceptions propagate.
1715
2019
  Return True only if you really want to suppress an exception.
1716
2020
 
1717
2021
  Parameters
1718
2022
  ----------
1719
- exc_type : Optional[type[BaseException]]
2023
+ _exc_type : Optional[type[BaseException]]
1720
2024
  The type of the exception raised, if any.
1721
- exc_val : Optional[BaseException]
2025
+ _exc_val : Optional[BaseException]
1722
2026
  The exception instance, if any.
1723
- exc_tb : Optional[types.TracebackType]
2027
+ _exc_tb : Optional[types.TracebackType]
1724
2028
  The traceback object, if any.
1725
2029
 
1726
2030
  Returns
@@ -1741,5 +2045,9 @@ class Nosible:
1741
2045
  Destructor to ensure resources are cleaned up if not explicitly closed.
1742
2046
 
1743
2047
  """
1744
- # Ensure it's called
1745
- self.close()
2048
+ # Only close if interpreter is fully alive
2049
+ if not getattr(sys, "is_finalizing", lambda: False)():
2050
+ try:
2051
+ self.close()
2052
+ except Exception:
2053
+ pass