nosible 0.1.9-py3-none-any.whl → 0.2.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nosible/classes/result.py +21 -8
- nosible/classes/result_set.py +46 -26
- nosible/classes/search.py +16 -0
- nosible/nosible_client.py +346 -38
- nosible/utils/json_tools.py +8 -7
- {nosible-0.1.9.dist-info → nosible-0.2.2.dist-info}/METADATA +97 -17
- nosible-0.2.2.dist-info/RECORD +16 -0
- nosible/utils/question_builder.py +0 -131
- nosible-0.1.9.dist-info/RECORD +0 -17
- {nosible-0.1.9.dist-info → nosible-0.2.2.dist-info}/WHEEL +0 -0
- {nosible-0.1.9.dist-info → nosible-0.2.2.dist-info}/licenses/LICENSE +0 -0
- {nosible-0.1.9.dist-info → nosible-0.2.2.dist-info}/top_level.txt +0 -0
nosible/nosible_client.py
CHANGED
@@ -2,18 +2,17 @@ import gzip
 import json
 import logging
 import os
+import re
+import sys
+import textwrap
 import time
 import types
-import typing
 from collections.abc import Iterator
 from concurrent.futures import ThreadPoolExecutor
-from
+from datetime import datetime
+from typing import Optional, Union

-import
-import requests
-from cryptography.fernet import Fernet
-from openai import OpenAI
-from polars import SQLContext
+import httpx
 from tenacity import (
     before_sleep_log,
     retry,
@@ -29,7 +28,6 @@ from nosible.classes.search_set import SearchSet
 from nosible.classes.snippet_set import SnippetSet
 from nosible.classes.web_page import WebPageData
 from nosible.utils.json_tools import json_loads
-from nosible.utils.question_builder import _get_question
 from nosible.utils.rate_limiter import PLAN_RATE_LIMITS, RateLimiter, _rate_limited

 # Set up a module‐level logger.
@@ -53,6 +51,8 @@ class Nosible:
         Base URL for the OpenAI-compatible LLM API. (default is OpenRouter's API endpoint)
     sentiment_model : str, optional
         Model to use for sentiment analysis (default is "openai/gpt-4o").
+    expansions_model : str, optional
+        Model to use for expansions (default is "openai/gpt-4o").
     timeout : int
         Request timeout for HTTP calls.
     retries : int,
@@ -91,7 +91,8 @@ class Nosible:
     - The `nosible_api_key` is required to access the Nosible Search API.
     - The `llm_api_key` is optional and used for LLM-based query expansions.
     - The `openai_base_url` defaults to OpenRouter's API endpoint.
-    - The `sentiment_model` is used for
+    - The `sentiment_model` is used for sentiment analysis.
+    - The `expansions_model` is used for generating query expansions.
     - The `timeout`, `retries`, and `concurrency` parameters control the behavior of HTTP requests.

     Examples
@@ -103,10 +104,11 @@ class Nosible:

     def __init__(
         self,
-        nosible_api_key: str = None,
-        llm_api_key: str = None,
+        nosible_api_key: Optional[str] = None,
+        llm_api_key: Optional[str] = None,
         openai_base_url: str = "https://openrouter.ai/api/v1",
         sentiment_model: str = "openai/gpt-4o",
+        expansions_model: str = "openai/gpt-4o",
         timeout: int = 30,
         retries: int = 5,
         concurrency: int = 10,
@@ -139,6 +141,7 @@ class Nosible:
         self.llm_api_key = llm_api_key or os.getenv("LLM_API_KEY")
         self.openai_base_url = openai_base_url
         self.sentiment_model = sentiment_model
+        self.expansions_model = expansions_model
         # Network parameters
         self.timeout = timeout
         self.retries = retries
@@ -159,7 +162,7 @@ class Nosible:
             reraise=True,
             stop=stop_after_attempt(self.retries) | stop_after_delay(self.timeout),
             wait=wait_exponential(multiplier=1, min=1, max=10),
-            retry=retry_if_exception_type(
+            retry=retry_if_exception_type(httpx.RequestError),
             before_sleep=before_sleep_log(self.logger, logging.WARNING),
         )(self._post)

@@ -168,12 +171,12 @@ class Nosible:
             reraise=True,
             stop=stop_after_attempt(self.retries) | stop_after_delay(self.timeout),
             wait=wait_exponential(multiplier=1, min=1, max=10),
-            retry=retry_if_exception_type(
+            retry=retry_if_exception_type(httpx.RequestError),
             before_sleep=before_sleep_log(self.logger, logging.WARNING),
         )(self._generate_expansions)

         # Thread pool for parallel searches
-        self._session =
+        self._session = httpx.Client(follow_redirects=True)
         self._executor = ThreadPoolExecutor(max_workers=self.concurrency)

         # Headers
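The retry wrappers and the session are the visible part of the switch from `requests` to `httpx`: transient transport failures (`httpx.RequestError`) are retried with exponential backoff, while HTTP status errors are handled separately. A minimal standalone sketch of the same pattern, assuming a placeholder endpoint and payload that are not part of the package:

```python
import logging

import httpx
from tenacity import (
    before_sleep_log,
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    stop_after_delay,
    wait_exponential,
)

logger = logging.getLogger("retry-sketch")


@retry(
    reraise=True,
    stop=stop_after_attempt(5) | stop_after_delay(30),
    wait=wait_exponential(multiplier=1, min=1, max=10),
    # Retry only on transport problems (timeouts, connection errors), not HTTP 4xx/5xx.
    retry=retry_if_exception_type(httpx.RequestError),
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def post_with_retries(client: httpx.Client, url: str, payload: dict) -> httpx.Response:
    return client.post(url, json=payload, timeout=30)


with httpx.Client(follow_redirects=True) as client:
    # Placeholder URL for illustration only.
    resp = post_with_retries(client, "https://httpbin.org/post", {"question": "example"})
    print(resp.status_code)
```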
@@ -198,7 +201,6 @@ class Nosible:

     def search(
         self,
-        *,
         search: Search = None,
         question: str = None,
         expansions: list[str] = None,
@@ -207,6 +209,9 @@ class Nosible:
         n_probes: int = 30,
         n_contextify: int = 128,
         algorithm: str = "hybrid-2",
+        min_similarity: float = None,
+        must_include: list[str] = None,
+        must_exclude: list[str] = None,
         autogenerate_expansions: bool = False,
         publish_start: str = None,
         publish_end: str = None,
@@ -246,6 +251,12 @@ class Nosible:
             Context window size per result.
         algorithm : str
             Search algorithm type.
+        min_similarity : float
+            Results must have at least this similarity score.
+        must_include : list of str
+            Only results mentioning these strings will be included.
+        must_exclude : list of str
+            Any result mentioning these strings will be excluded.
         autogenerate_expansions : bool
             Do you want to generate expansions automatically using a LLM?
         publish_start : str, optional
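The three new filters (`min_similarity`, `must_include`, `must_exclude`) are passed straight through to the search payload. A hedged usage sketch; the question and filter strings are illustrative, and `NOSIBLE_API_KEY` is assumed to be set in the environment:

```python
from nosible import Nosible

with Nosible() as nos:
    results = nos.search(
        question="How are hybrid search engines ranking financial news?",  # illustrative query
        n_results=20,
        min_similarity=0.7,         # keep only results scoring at least 0.7
        must_include=["earnings"],  # every result must mention these strings
        must_exclude=["crypto"],    # results mentioning these strings are dropped
    )
    for result in results:
        print(f"{result.similarity:.2f}  {result.url}")
```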
@@ -335,6 +346,9 @@ class Nosible:
             n_probes=n_probes,
             n_contextify=n_contextify,
             algorithm=algorithm,
+            min_similarity=min_similarity,
+            must_include=must_include,
+            must_exclude=must_exclude,
             autogenerate_expansions=autogenerate_expansions,
             publish_start=publish_start,
             publish_end=publish_end,
@@ -372,6 +386,9 @@ class Nosible:
         n_probes: int = 30,
         n_contextify: int = 128,
         algorithm: str = "hybrid-2",
+        min_similarity: float = None,
+        must_include: list[str] = None,
+        must_exclude: list[str] = None,
         autogenerate_expansions: bool = False,
         publish_start: str = None,
         publish_end: str = None,
@@ -408,8 +425,14 @@ class Nosible:
             Context window size for the search.
         algorithm : str
             Search algorithm to use.
+        min_similarity : float
+            Results must have at least this similarity score.
+        must_include : list of str
+            Only results mentioning these strings will be included.
+        must_exclude : list of str
+            Any result mentioning these strings will be excluded.
         autogenerate_expansions : bool
-            Do you want to generate expansions automatically using a LLM
+            Do you want to generate expansions automatically using a LLM?.
         publish_start : str, optional
             Start date for when the document was published (ISO format).
         publish_end : str, optional
@@ -509,6 +532,9 @@ class Nosible:
             n_probes=n_probes,
             n_contextify=n_contextify,
             algorithm=algorithm,
+            min_similarity=min_similarity,
+            must_include=must_include,
+            must_exclude=must_exclude,
             autogenerate_expansions=autogenerate_expansions,
             publish_start=publish_start,
             publish_end=publish_end,
@@ -532,7 +558,7 @@ class Nosible:
                     yield future.result()
                 except Exception as e:
                     self.logger.warning(f"Search failed: {e!r}")
-
+                    raise

         return _run_generator()

@@ -555,6 +581,8 @@ class Nosible:
         ------
         ValueError
             If `n_results` > 100.
+        ValueError
+            If min_similarity is not [0,1].

         Examples
         --------
@@ -577,6 +605,9 @@ class Nosible:
         n_probes = search_obj.n_probes if search_obj.n_probes is not None else 30
         n_contextify = search_obj.n_contextify if search_obj.n_contextify is not None else 128
         algorithm = search_obj.algorithm if search_obj.algorithm is not None else "hybrid-2"
+        min_similarity = search_obj.min_similarity if search_obj.min_similarity is not None else 0
+        must_include = search_obj.must_include if search_obj.must_include is not None else []
+        must_exclude = search_obj.must_exclude if search_obj.must_exclude is not None else []
         autogenerate_expansions = (
             search_obj.autogenerate_expansions if search_obj.autogenerate_expansions is not None else False
         )
@@ -600,6 +631,9 @@ class Nosible:
             search_obj.exclude_companies if search_obj.exclude_companies is not None else self.exclude_companies
         )

+        if not (0.0 <= min_similarity <= 1.0):
+            raise ValueError(f"Invalid min_simalarity: {min_similarity}. Must be [0,1].")
+
         # Generate expansions if not provided
         if expansions is None:
             expansions = []
@@ -636,6 +670,9 @@ class Nosible:
             "n_probes": n_probes,
             "n_contextify": n_contextify,
             "algorithm": algorithm,
+            "min_similarity": min_similarity,
+            "must_include": must_include,
+            "must_exclude": must_exclude,
         }

         resp = self._post(url="https://www.nosible.ai/search/v1/fast-search", payload=payload)
@@ -696,6 +733,9 @@ class Nosible:
         n_probes: int = 30,
         n_contextify: int = 128,
         algorithm: str = "hybrid-2",
+        min_similarity: float = None,
+        must_include: list[str] = None,
+        must_exclude: list[str] = None,
         autogenerate_expansions: bool = False,
         publish_start: str = None,
         publish_end: str = None,
@@ -733,6 +773,12 @@ class Nosible:
             Context window size per result.
         algorithm : str
             Search algorithm identifier.
+        min_similarity : float
+            Results must have at least this similarity score.
+        must_include : list of str
+            Only results mentioning these strings will be included.
+        must_exclude : list of str
+            Any result mentioning these strings will be excluded.
         autogenerate_expansions : bool
             Do you want to generate expansions automatically using a LLM?
         publish_start : str, optional
@@ -779,6 +825,8 @@ class Nosible:
             If neither question nor search are specified.
         RuntimeError
             If the response fails in any way.
+        ValueError
+            If min_similarity is not [0,1].

         Notes
         -----
@@ -824,6 +872,8 @@ class Nosible:
         ...
         ValueError: Bulk search cannot have more than 10000 results per query.
         """
+        from cryptography.fernet import Fernet
+
         previous_level = self.logger.level
         if verbose:
             self.logger.setLevel(logging.INFO)
@@ -843,6 +893,12 @@ class Nosible:
         n_probes = search.n_probes if search.n_probes is not None else n_probes
         n_contextify = search.n_contextify if search.n_contextify is not None else n_contextify
         algorithm = search.algorithm if search.algorithm is not None else algorithm
+        min_similarity = search.min_similarity if search.min_similarity is not None else min_similarity
+        min_similarity = min_similarity if min_similarity is not None else 0
+        must_include = search.must_include if search.must_include is not None else must_include
+        must_include = must_include if must_include is not None else []
+        must_exclude = search.must_exclude if search.must_exclude is not None else must_exclude
+        must_exclude = must_exclude if must_exclude is not None else []
         autogenerate_expansions = (
             search.autogenerate_expansions
             if search.autogenerate_expansions is not None
@@ -868,6 +924,13 @@ class Nosible:
         if autogenerate_expansions is True:
             expansions = self._generate_expansions(question=question)

+        must_include = must_include if must_include is not None else []
+        must_exclude = must_exclude if must_exclude is not None else []
+        min_similarity = min_similarity if min_similarity is not None else 0
+
+        if not (0.0 <= min_similarity <= 1.0):
+            raise ValueError(f"Invalid min_simalarity: {min_similarity}. Must be [0,1].")
+
         # Generate sql_filter if unset
         if sql_filter is None:
             sql_filter = self._format_sql(
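Note the resolution order for the new filters in the bulk path: a value carried on the `Search` object overrides the keyword argument, and whatever is still `None` falls back to a neutral default (`0`, `[]`, `[]`) before the `[0, 1]` bounds check runs. A small standalone sketch of that precedence, using a hypothetical `resolve` helper:

```python
from typing import Optional


def resolve(search_value: Optional[float], kwarg_value: Optional[float], default: float) -> float:
    # Search-object value beats the keyword argument, which beats the default.
    value = search_value if search_value is not None else kwarg_value
    return value if value is not None else default


# Hypothetical inputs for illustration.
min_similarity = resolve(search_value=None, kwarg_value=0.8, default=0)
if not (0.0 <= min_similarity <= 1.0):
    raise ValueError(f"Invalid min_similarity: {min_similarity}. Must be [0,1].")
print(min_similarity)  # 0.8
```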
@@ -912,11 +975,14 @@ class Nosible:
             "n_probes": n_probes,
             "n_contextify": n_contextify,
             "algorithm": algorithm,
+            "min_similarity": min_similarity,
+            "must_include": must_include,
+            "must_exclude": must_exclude,
         }
         resp = self._post(url="https://www.nosible.ai/search/v1/slow-search", payload=payload)
         try:
             resp.raise_for_status()
-        except
+        except httpx.HTTPStatusError as e:
             raise ValueError(f"[{question!r}] HTTP {resp.status_code}: {resp.text}") from e

         data = resp.json()
@@ -928,7 +994,7 @@ class Nosible:
         decrypt_using = data.get("decrypt_using")
         for _ in range(100):
             dl = self._session.get(download_from, timeout=self.timeout)
-            if dl.
+            if dl.status_code == 200:
                 fernet = Fernet(decrypt_using.encode())
                 decrypted = fernet.decrypt(dl.content)
                 decompressed = gzip.decompress(decrypted)
@@ -944,6 +1010,112 @@ class Nosible:
         if verbose:
             self.logger.setLevel(previous_level)

+    def answer(
+        self,
+        query: str,
+        n_results: int = 100,
+        min_similarity: float = 0.65,
+        model: Union[str, None] = "google/gemini-2.0-flash-001",
+        show_context: bool = True,
+    ) -> str:
+        """
+        RAG-style question answering: retrieve top `n_results` via `.search()`
+        then answer `query` using those documents as context.
+
+        Parameters
+        ----------
+        query : str
+            The user’s natural-language question.
+        n_results : int
+            How many docs to fetch to build the context.
+        min_similarity : float
+            Results must have at least this similarity score.
+        model : str, optional
+            Which LLM to call to answer your question.
+        show_context : bool, optional
+            Do you want the context to be shown?
+
+        Returns
+        -------
+        str
+            The LLM’s generated answer, grounded in the retrieved docs.
+
+        Raises
+        ------
+        ValueError
+            If no API key is configured for the LLM client.
+        RuntimeError
+            If the LLM call fails or returns an invalid response.
+
+        Examples
+        --------
+        >>> from nosible import Nosible
+        >>> with Nosible() as nos:
+        ...     ans = nos.answer(
+        ...         query="How is research governance and decision-making structured between Google and DeepMind?",
+        ...         n_results=100,
+        ...         show_context=True,
+        ...     )  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+        <BLANKLINE>
+        Doc 1
+        Title: ...
+        >>> print(ans)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+        Answer:
+        ...
+        """
+
+        if not self.llm_api_key:
+            raise ValueError("An LLM API key is required for answer().")
+
+        # Retrieve top documents
+        results = self.search(question=query, n_results=n_results, min_similarity=min_similarity)
+
+        # Build RAG context
+        context = ""
+        pieces: list[str] = []
+        for idx, result in enumerate(results):
+            pieces.append(f"""
+                Doc {idx + 1}
+                Title: {result.title}
+                Similarity Score: {result.similarity * 100:.2f}%
+                URL: {result.url}
+                Content: {result.content}
+                """)
+        context = "\n".join(pieces)
+
+        if show_context:
+            print(textwrap.dedent(context))
+
+        # Craft prompt
+        prompt = f"""
+        # TASK DESCRIPTION
+
+        You are a helpful assistant. Use the following context to answer the question.
+        When you use information from a chunk, cite it by referencing its label in square brackets, e.g. [doc3].
+
+        ## Question
+        {query}
+
+        ## Context
+        {context}
+        """
+        from openai import OpenAI
+
+        # Call LLM
+        client = OpenAI(base_url=self.openai_base_url, api_key=self.llm_api_key)
+        try:
+            response = client.chat.completions.create(model=model, messages=[{"role": "user", "content": prompt}])
+        except Exception as e:
+            raise RuntimeError(f"LLM API error: {e}") from e
+
+        # Validate response shape
+        choices = getattr(response, "choices", None)
+        if not choices or not hasattr(choices[0], "message"):
+            raise RuntimeError(f"Invalid LLM response format: {response!r}")
+
+        # Return the generated text
+        return "Answer:\n" + response.choices[0].message.content.strip()
+
     @_rate_limited("visit")
     def visit(self, html: str = "", recrawl: bool = False, render: bool = False, url: str = None) -> WebPageData:
         """
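The new `answer()` helper is straightforward retrieval-augmented generation: it reuses `search()` for retrieval and an OpenAI-compatible client for generation. A hedged usage sketch; the question and model name are illustrative, and `NOSIBLE_API_KEY` plus `LLM_API_KEY` are assumed to be set in the environment:

```python
from nosible import Nosible

with Nosible() as nos:
    answer = nos.answer(
        query="What changed in how DeepMind reports into Google?",  # illustrative question
        n_results=25,            # smaller context than the default 100
        min_similarity=0.7,      # only well-matched documents make it into the prompt
        model="openai/gpt-4o",   # any model reachable through your OpenAI-compatible endpoint
        show_context=False,      # skip printing the retrieved documents
    )
    print(answer)
```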
@@ -1029,6 +1201,75 @@ class Nosible:
             url_tree=response_data.get("url_tree"),
         )

+    @_rate_limited("fast")
+    def trend(
+        self,
+        query: str,
+        start_date: Optional[str] = None,
+        end_date: Optional[str] = None,
+        sql_filter: Optional[str] = None,
+    ) -> dict:
+        """
+        Extract a trend showing the volume of news surrounding your query.
+
+        Parameters
+        ----------
+        query : str
+            The search term we would like to see a trend for.
+        start_date : str, optional
+            ISO‐format start date (YYYY-MM-DD) of the trend window.
+        end_date : str, optional
+            ISO‐format end date (YYYY-MM-DD) of the trend window.
+        sql_filter : str, optional
+            An optional SQL filter to narrow down the trend query
+
+        Returns
+        -------
+        dict
+            The JSON-decoded trend data returned by the server.
+
+        Examples
+        --------
+        >>> from nosible import Nosible
+        >>> with Nosible() as nos:
+        ...     trends_data = nos.trend("Christmas Shopping", start_date="2005-01-01", end_date="2020-12-31")
+        ...     print(trends_data)  # doctest: +ELLIPSIS
+        {'2005-01-31': ...'2020-12-31': ...}
+        """
+        # Validate dates
+        if start_date is not None:
+            self._validate_date_format(start_date, "start_date")
+        if end_date is not None:
+            self._validate_date_format(end_date, "end_date")
+
+        payload: dict[str, str] = {"query": query}
+
+        if sql_filter is not None:
+            payload["sql_filter"] = sql_filter
+        else:
+            payload["sql_filter"] = "SELECT loc, published FROM engine"
+
+        # Send the POST to the /trend endpoint
+        response = self._post(url="https://www.nosible.ai/search/v1/trend", payload=payload)
+        # Will raise ValueError on rate-limit or auth errors
+        response.raise_for_status()
+        payload = response.json().get("response", {})
+
+        # if no window requested, return everything
+        if start_date is None and end_date is None:
+            return payload
+
+        # Filter by ISO‐date keys
+        filtered: dict[str, float] = {}
+        for date_str, value in payload.items():
+            if start_date and date_str < start_date:
+                continue
+            if end_date and date_str > end_date:
+                continue
+            filtered[date_str] = value
+
+        return filtered
+
     def version(self) -> str:
         """
         Retrieve the current version information for the Nosible API.
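Because `trend()` returns a plain dict keyed by ISO dates, downstream processing is ordinary dictionary work. A hedged sketch that finds the busiest period in the window; the query and dates are illustrative:

```python
from nosible import Nosible

with Nosible() as nos:
    trend = nos.trend(
        "Christmas Shopping",      # illustrative query
        start_date="2015-01-01",
        end_date="2020-12-31",
    )

# Keys are ISO date strings, values are news-volume figures.
peak_date = max(trend, key=trend.get)
print(f"Peak news volume at {peak_date}: {trend[peak_date]}")
```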
@@ -1107,7 +1348,9 @@ class Nosible:
                 return False
             if msg == "The URL could not be retrieved.":
                 return False
-
+            # If we reach here, the response is unexpected
+            return False
+        except httpx.HTTPError:
             return False
         except:
             return False
@@ -1202,7 +1445,7 @@ class Nosible:
         out = [
             "Below are the rate limits for all NOSIBLE plans.",
             "To upgrade your package, visit https://www.nosible.ai/products.\n",
-            "Unless otherwise indicated, bulk searches are limited to one-at-a-time per API key.\n"
+            "Unless otherwise indicated, bulk searches are limited to one-at-a-time per API key.\n",
         ]

         user_plan = self._get_user_plan()
@@ -1263,7 +1506,7 @@ class Nosible:
         except Exception:
             pass

-    def _post(self, url: str, payload: dict, headers: dict = None, timeout: int = None) ->
+    def _post(self, url: str, payload: dict, headers: dict = None, timeout: int = None) -> httpx.Response:
         """
         Internal helper to send a POST request with retry logic.

@@ -1295,7 +1538,7 @@ class Nosible:

         Returns
         -------
-
+        httpx.Response
             The HTTP response object.
         """
         response = self._session.post(
@@ -1303,16 +1546,19 @@ class Nosible:
             json=payload,
             headers=headers if headers is not None else self.headers,
             timeout=timeout if timeout is not None else self.timeout,
+            follow_redirects=True,
         )

         # If unauthorized, or if the payload is string too short, treat as invalid API key
         if response.status_code == 401:
             raise ValueError("Your API key is not valid.")
         if response.status_code == 422:
-            # Only inspect JSON if it’s a JSON response
             content_type = response.headers.get("Content-Type", "")
             if content_type.startswith("application/json"):
                 body = response.json()
+                if isinstance(body, list):
+                    body = body[0]
+                print(body)
                 if body.get("type") == "string_too_short":
                     raise ValueError("Your API key is not valid: Too Short.")
                 else:
@@ -1450,12 +1696,14 @@ class Nosible:
         - Contextual Example: Swap "diabetes treatment" with "insulin therapy" or "blood sugar management".

         """.replace(" ", "")
+        # Lazy load
+        from openai import OpenAI

         client = OpenAI(base_url=self.openai_base_url, api_key=self.llm_api_key)

         # Call the chat completions endpoint.
         resp = client.chat.completions.create(
-            model=self.
+            model=self.expansions_model, messages=[{"role": "user", "content": prompt.strip()}], temperature=0.7
         )

         raw = resp.choices[0].message.content
@@ -1481,6 +1729,51 @@ class Nosible:
         self.logger.debug(f"Successful expansions: {expansions}")
         return expansions

+    @staticmethod
+    def _validate_date_format(string: str, name: str):
+        """
+        Check that a date string is valid ISO format (YYYY-MM-DD or full ISO timestamp).
+
+        Parameters
+        ----------
+        string : str
+            The date string to validate.
+        name : str
+            The name of the parameter being validated, used in the error message.
+
+        Raises
+        ------
+        ValueError
+            If `string` is not a valid ISO 8601 date. Error message will include
+            the `name` and the offending string.
+        Examples
+        --------
+        >>> # valid date-only format
+        >>> Nosible._validate_date_format("2023-12-31", "publish_start")
+        >>> # valid full timestamp
+        >>> Nosible._validate_date_format("2023-12-31T15:30:00", "visited_end")
+        >>> # invalid month
+        >>> Nosible._validate_date_format("2023-13-01", "publish_end")
+        Traceback (most recent call last):
+        ...
+        ValueError: Invalid date for 'publish_end': '2023-13-01'. Expected ISO format 'YYYY-MM-DD'.
+        >>> # wrong separator
+        >>> Nosible._validate_date_format("2023/12/31", "visited_start")
+        Traceback (most recent call last):
+        ...
+        ValueError: Invalid date for 'visited_start': '2023/12/31'. Expected ISO format 'YYYY-MM-DD'.
+        """
+        dateregex = r"^\d{4}-\d{2}-\d{2}"
+
+        if not re.match(dateregex, string):
+            raise ValueError(f"Invalid date for '{name}': {string!r}. Expected ISO format 'YYYY-MM-DD'.")
+
+        try:
+            # datetime.fromisoformat accepts both YYYY-MM-DD and full timestamps
+            parsed = datetime.fromisoformat(string)
+        except Exception:
+            raise ValueError(f"Invalid date for '{name}': {string!r}. Expected ISO format 'YYYY-MM-DD'.")
+
     def _format_sql(
         self,
         publish_start: str = None,
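Since `trend()` (and, in the next hunk, `_format_sql`) run every date argument through this validator first, a malformed date now fails fast on the client instead of producing a broken SQL filter server-side. A hedged sketch of the behaviour; the query and date are made up:

```python
from nosible import Nosible

with Nosible() as nos:
    try:
        nos.trend("Christmas Shopping", start_date="31-12-2020")  # DD-MM-YYYY, wrong order
    except ValueError as err:
        print(err)
        # Invalid date for 'start_date': '31-12-2020'. Expected ISO format 'YYYY-MM-DD'.
```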
@@ -1539,8 +1832,17 @@ class Nosible:
         ValueError
             If more than 50 items in a filter are given.
         """
+        for name, value in [
+            ("publish_start", publish_start),
+            ("publish_end", publish_end),
+            ("visited_start", visited_start),
+            ("visited_end", visited_end),
+        ]:
+            if value is not None:
+                self._validate_date_format(string=value, name=name)
+
         # Validate list lengths
-        for name,
+        for name, value in [
             ("include_netlocs", include_netlocs),
             ("exclude_netlocs", exclude_netlocs),
             ("include_languages", include_languages),
@@ -1550,8 +1852,8 @@ class Nosible:
             ("include_docs", include_docs),
             ("exclude_docs", exclude_docs),
         ]:
-            if
-                raise ValueError(f"Too many items for '{name}' filter ({len(
+            if value is not None and len(value) > 50:
+                raise ValueError(f"Too many items for '{name}' filter ({len(value)}); maximum allowed is 50.")

         sql = ["SELECT loc FROM engine"]
         clauses: list[str] = []
@@ -1683,9 +1985,11 @@ class Nosible:
             "company_3",
             "doc_hash",
         ]
+        import polars as pl  # Lazy import
+
         # Create a dummy DataFrame with correct columns and no rows
         df = pl.DataFrame({col: [] for col in columns})
-        ctx = SQLContext()
+        ctx = pl.SQLContext()
         ctx.register("engine", df)
         try:
             ctx.execute(sql)
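Checking a SQL filter by executing it against an empty polars frame registered as `engine` is a cheap local dry run: malformed SQL is rejected before any API call is made. A self-contained sketch of the same idea, using only a handful of illustrative column names rather than the full engine schema:

```python
import polars as pl

# A few illustrative columns; the real engine schema is much larger.
columns = ["loc", "published", "netloc", "language"]
df = pl.DataFrame({col: [] for col in columns})

ctx = pl.SQLContext()
ctx.register("engine", df)

sql = "SELECT loc FROM engine WHERE language IN ('en')"
try:
    ctx.execute(sql)  # parses and plans the query; invalid SQL raises here
    print("SQL filter looks valid")
except Exception as err:
    print(f"Invalid SQL filter: {err}")
```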
@@ -1706,21 +2010,21 @@ class Nosible:

     def __exit__(
         self,
-        _exc_type:
-        _exc_val:
-        _exc_tb:
-    ) ->
+        _exc_type: Optional[type[BaseException]],
+        _exc_val: Optional[BaseException],
+        _exc_tb: Optional[types.TracebackType],
+    ) -> Optional[bool]:
         """
         Always clean up (self.close()), but let exceptions propagate.
         Return True only if you really want to suppress an exception.

         Parameters
         ----------
-
+        _exc_type : Optional[type[BaseException]]
             The type of the exception raised, if any.
-
+        _exc_val : Optional[BaseException]
             The exception instance, if any.
-
+        _exc_tb : Optional[types.TracebackType]
             The traceback object, if any.

         Returns
@@ -1741,5 +2045,9 @@ class Nosible:
         Destructor to ensure resources are cleaned up if not explicitly closed.

         """
-        #
-
+        # Only close if interpreter is fully alive
+        if not getattr(sys, "is_finalizing", lambda: False)():
+            try:
+                self.close()
+            except Exception:
+                pass