nosible 0.1.8__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nosible/classes/result.py +69 -106
- nosible/classes/result_set.py +121 -115
- nosible/classes/search.py +83 -88
- nosible/classes/search_set.py +27 -12
- nosible/classes/snippet.py +57 -74
- nosible/classes/snippet_set.py +62 -63
- nosible/classes/web_page.py +39 -103
- nosible/nosible_client.py +551 -234
- nosible/utils/json_tools.py +58 -8
- nosible/utils/question_builder.py +131 -0
- nosible/utils/rate_limiter.py +30 -24
- {nosible-0.1.8.dist-info → nosible-0.2.1.dist-info}/METADATA +27 -49
- nosible-0.2.1.dist-info/RECORD +17 -0
- nosible-0.1.8.dist-info/RECORD +0 -16
- {nosible-0.1.8.dist-info → nosible-0.2.1.dist-info}/WHEEL +0 -0
- {nosible-0.1.8.dist-info → nosible-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {nosible-0.1.8.dist-info → nosible-0.2.1.dist-info}/top_level.txt +0 -0
nosible/nosible_client.py
CHANGED
@@ -2,11 +2,15 @@ import gzip
 import json
 import logging
 import os
+import sys
+import textwrap
 import time
-import
+import types
+import typing
 from collections.abc import Iterator
-from concurrent.futures import ThreadPoolExecutor
-from
+from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime
+from typing import Union, Optional
 
 import polars as pl
 import requests
@@ -25,8 +29,10 @@ from tenacity import (
 from nosible.classes.result_set import ResultSet
 from nosible.classes.search import Search
 from nosible.classes.search_set import SearchSet
+from nosible.classes.snippet_set import SnippetSet
 from nosible.classes.web_page import WebPageData
 from nosible.utils.json_tools import json_loads
+from nosible.utils.question_builder import _get_question
 from nosible.utils.rate_limiter import PLAN_RATE_LIMITS, RateLimiter, _rate_limited
 
 # Set up a module‐level logger.
@@ -47,37 +53,33 @@ class Nosible:
 llm_api_key : str, optional
 API key for LLM-based query expansions.
 openai_base_url : str
-Base URL for the OpenAI-compatible LLM API.
-sentiment_model : str
-Model to use for sentiment analysis
+Base URL for the OpenAI-compatible LLM API. (default is OpenRouter's API endpoint)
+sentiment_model : str, optional
+Model to use for sentiment analysis (default is "openai/gpt-4o").
 timeout : int
 Request timeout for HTTP calls.
-retries : int,
+retries : int,
 Number of retry attempts for transient HTTP errors.
-concurrency : int,
+concurrency : int,
 Maximum concurrent search requests.
 publish_start : str, optional
-
+Start date for when the document was published (ISO format).
 publish_end : str, optional
-
-include_netlocs : list of str, optional
-Domains to include.
-exclude_netlocs : list of str, optional
-Domains to exclude.
+End date for when the document was published (ISO format).
 visited_start : str, optional
-
+Start date for when the document was visited by NOSIBLE (ISO format).
 visited_end : str, optional
-
+End date for when the document was visited by NOSIBLE (ISO format).
 certain : bool, optional
-
-include_languages : list of str, optional
-Language codes to include (Max: 50).
-exclude_languages : list of str, optional
-Language codes to exclude (Max: 50).
+Only include documents where we are 100% sure of the date.
 include_netlocs : list of str, optional
-
+List of netlocs (domains) to include in the search. (Max: 50)
 exclude_netlocs : list of str, optional
-
+List of netlocs (domains) to exclude in the search. (Max: 50)
+include_languages : list of str, optional
+Languages to include in the search. (Max: 50, ISO 639-1 language codes).
+exclude_languages : list of str, optional
+Language codes to exclude in the search (Max: 50, ISO 639-1 language codes).
 include_companies : list of str, optional
 Google KG IDs of public companies to require (Max: 50).
 exclude_companies : list of str, optional
@@ -86,10 +88,6 @@ class Nosible:
 URL hashes of docs to include (Max: 50).
 exclude_docs : list of str, optional
 URL hashes of docs to exclude (Max: 50).
-openai_base_url : str, optional
-Base URL for the OpenAI API (default is OpenRouter).
-sentiment_model : str, optional
-Model to use for sentiment analysis (default is "openai/gpt-4o").
 
 Notes
 -----
@@ -173,7 +171,7 @@ class Nosible:
 reraise=True,
 stop=stop_after_attempt(self.retries) | stop_after_delay(self.timeout),
 wait=wait_exponential(multiplier=1, min=1, max=10),
-retry=retry_if_exception_type(
+retry=retry_if_exception_type(requests.exceptions.RequestException),
 before_sleep=before_sleep_log(self.logger, logging.WARNING),
 )(self._generate_expansions)
 
@@ -212,6 +210,9 @@ class Nosible:
 n_probes: int = 30,
 n_contextify: int = 128,
 algorithm: str = "hybrid-2",
+min_similarity: float = None,
+must_include: list[str] = None,
+must_exclude: list[str] = None,
 autogenerate_expansions: bool = False,
 publish_start: str = None,
 publish_end: str = None,
@@ -243,38 +244,40 @@ class Nosible:
 List of LLM‐generated expansions.
 sql_filter : list of str, optional
 SQL‐style filter clauses.
-n_results : int
+n_results : int
 Max number of results (max 100).
-n_probes : int
+n_probes : int
 Number of index shards to probe.
-n_contextify : int
+n_contextify : int
 Context window size per result.
-algorithm : str
+algorithm : str
 Search algorithm type.
-
+min_similarity : float
+Results must have at least this similarity score.
+must_include : list of str
+Only results mentioning these strings will be included.
+must_exclude : list of str
+Any result mentioning these strings will be excluded.
+autogenerate_expansions : bool
 Do you want to generate expansions automatically using a LLM?
 publish_start : str, optional
-
+Start date for when the document was published (ISO format).
 publish_end : str, optional
-
-include_netlocs : list of str, optional
-Domains to include.
-exclude_netlocs : list of str, optional
-Domains to exclude.
+End date for when the document was published (ISO format).
 visited_start : str, optional
-
+Start date for when the document was visited by NOSIBLE (ISO format).
 visited_end : str, optional
-
+End date for when the document was visited by NOSIBLE (ISO format).
 certain : bool, optional
-
-include_languages : list of str, optional
-Language codes to include (Max: 50).
-exclude_languages : list of str, optional
-Language codes to exclude (Max: 50).
+Only include documents where we are 100% sure of the date.
 include_netlocs : list of str, optional
-
+List of netlocs (domains) to include in the search. (Max: 50)
 exclude_netlocs : list of str, optional
-
+List of netlocs (domains) to exclude in the search. (Max: 50)
+include_languages : list of str, optional
+Languages to include in the search. (Max: 50, ISO 639-1 language codes).
+exclude_languages : list of str, optional
+Language codes to exclude in the search (Max: 50, ISO 639-1 language codes).
 include_companies : list of str, optional
 Google KG IDs of public companies to require (Max: 50).
 exclude_companies : list of str, optional
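The `min_similarity`, `must_include`, and `must_exclude` parameters documented above are ordinary keyword arguments on `search()` (and fields on `Search`). A minimal usage sketch, assuming a configured NOSIBLE API key; the question text and threshold are illustrative only:

>>> from nosible import Nosible
>>> with Nosible() as nos:
...     results = nos.search(
...         question="Hedge funds seek to expand into private credit",
...         n_results=10,
...         min_similarity=0.7,               # must lie in [0, 1] or a ValueError is raised
...         must_include=["private credit"],  # every result must mention this string
...         must_exclude=["crypto"],          # any result mentioning this string is dropped
...     )
...     print(len(results))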
@@ -297,6 +300,8 @@ class Nosible:
 If neither question nor search are specified
 RuntimeError
 If the response fails in any way.
+ValueError
+If `n_results` is greater than 100.
 
 Notes
 -----
@@ -342,6 +347,9 @@ class Nosible:
 n_probes=n_probes,
 n_contextify=n_contextify,
 algorithm=algorithm,
+min_similarity=min_similarity,
+must_include=must_include,
+must_exclude=must_exclude,
 autogenerate_expansions=autogenerate_expansions,
 publish_start=publish_start,
 publish_end=publish_end,
@@ -379,6 +387,9 @@ class Nosible:
 n_probes: int = 30,
 n_contextify: int = 128,
 algorithm: str = "hybrid-2",
+min_similarity: float = None,
+must_include: list[str] = None,
+must_exclude: list[str] = None,
 autogenerate_expansions: bool = False,
 publish_start: str = None,
 publish_end: str = None,
@@ -407,48 +418,50 @@ class Nosible:
 List of expansion terms to use for each search.
 sql_filter : list of str, optional
 SQL-like filters to apply to the search.
-n_results : int
+n_results : int
 Number of results to return per search.
-n_probes : int
+n_probes : int
 Number of probes to use for the search algorithm.
-n_contextify : int
+n_contextify : int
 Context window size for the search.
-algorithm : str
+algorithm : str
 Search algorithm to use.
-
-
+min_similarity : float
+Results must have at least this similarity score.
+must_include : list of str
+Only results mentioning these strings will be included.
+must_exclude : list of str
+Any result mentioning these strings will be excluded.
+autogenerate_expansions : bool
+Do you want to generate expansions automatically using a LLM?.
 publish_start : str, optional
-
+Start date for when the document was published (ISO format).
 publish_end : str, optional
-
-include_netlocs : list of str, optional
-Only include results from these domains.
-exclude_netlocs : list of str, optional
-Exclude results from these domains.
+End date for when the document was published (ISO format).
 visited_start : str, optional
-
+Start date for when the document was visited by NOSIBLE (ISO format).
 visited_end : str, optional
-
+End date for when the document was visited by NOSIBLE (ISO format).
 certain : bool, optional
-Only include
+Only include documents where we are 100% sure of the date.
+include_netlocs : list of str, optional
+List of netlocs (domains) to include in the search. (Max: 50)
+exclude_netlocs : list of str, optional
+List of netlocs (domains) to exclude in the search. (Max: 50)
 include_languages : list of str, optional
-
+Languages to include in the search. (Max: 50, ISO 639-1 language codes).
 exclude_languages : list of str, optional
-
+Language codes to exclude in the search (Max: 50, ISO 639-1 language codes).
 include_companies : list of str, optional
-
+Google KG IDs of public companies to require (Max: 50).
 exclude_companies : list of str, optional
-
-include_netlocs : list of str, optional
-Only include results from these domains (Max: 50).
-exclude_netlocs : list of str, optional
-Exclude results from these domains (Max: 50).
+Google KG IDs of public companies to forbid (Max: 50).
 include_docs : list of str, optional
-URL hashes of
+URL hashes of docs to include (Max: 50).
 exclude_docs : list of str, optional
-URL hashes of
+URL hashes of docs to exclude (Max: 50).
 
-
+Returns
 ------
 ResultSet or None
 Each completed search’s results, or None on failure.
@@ -461,8 +474,6 @@ class Nosible:
 If both queries and searches are specified.
 TypeError
 If neither queries nor searches are specified.
-RuntimeError
-If the response fails in any way.
 
 Notes
 -----
@@ -473,7 +484,10 @@ class Nosible:
 --------
 >>> from nosible import Nosible
 >>> queries = SearchSet(
-... [
+... [
+... Search(question="Hedge funds seek to expand into private credit", n_results=5),
+... Search(question="How have the Trump tariffs impacted the US economy?", n_results=5),
+... ]
 ... )
 >>> with Nosible() as nos:
 ... results_list = list(nos.searches(searches=queries))
@@ -484,10 +498,14 @@ class Nosible:
 True True
 True True
 >>> with Nosible() as nos:
-... results_list_str = list(
-...
-...
-...
+... results_list_str = list(
+... nos.searches(
+... questions=[
+... "What are the terms of the partnership between Microsoft and OpenAI?",
+... "What are the terms of the partnership between Volkswagen and Uber?",
+... ]
+... )
+... )
 >>> nos = Nosible(nosible_api_key="test|xyz") # doctest: +ELLIPSIS
 >>> nos.searches() # doctest: +ELLIPSIS
 Traceback (most recent call last):
@@ -515,6 +533,9 @@ class Nosible:
 n_probes=n_probes,
 n_contextify=n_contextify,
 algorithm=algorithm,
+min_similarity=min_similarity,
+must_include=must_include,
+must_exclude=must_exclude,
 autogenerate_expansions=autogenerate_expansions,
 publish_start=publish_start,
 publish_end=publish_end,
@@ -538,7 +559,8 @@ class Nosible:
 yield future.result()
 except Exception as e:
 self.logger.warning(f"Search failed: {e!r}")
-
+raise
+
 return _run_generator()
 
 @_rate_limited("fast")
@@ -560,6 +582,8 @@ class Nosible:
 ------
 ValueError
 If `n_results` > 100.
+ValueError
+If min_similarity is not [0,1].
 
 Examples
 --------
@@ -573,7 +597,7 @@ class Nosible:
 ValueError: Search can not have more than 100 results - Use bulk search instead.
 """
 # --------------------------------------------------------------------------------------------------------------
-# Setting search params. Individual search will
+# Setting search params. Individual search will override Nosible defaults.
 # --------------------------------------------------------------------------------------------------------------
 question = search_obj.question # No default
 expansions = search_obj.expansions if search_obj.expansions is not None else [] # Default to empty list
@@ -582,7 +606,12 @@ class Nosible:
 n_probes = search_obj.n_probes if search_obj.n_probes is not None else 30
 n_contextify = search_obj.n_contextify if search_obj.n_contextify is not None else 128
 algorithm = search_obj.algorithm if search_obj.algorithm is not None else "hybrid-2"
-
+min_similarity = search_obj.min_similarity if search_obj.min_similarity is not None else 0
+must_include = search_obj.must_include if search_obj.must_include is not None else []
+must_exclude = search_obj.must_exclude if search_obj.must_exclude is not None else []
+autogenerate_expansions = (
+search_obj.autogenerate_expansions if search_obj.autogenerate_expansions is not None else False
+)
 publish_start = search_obj.publish_start if search_obj.publish_start is not None else self.publish_start
 publish_end = search_obj.publish_end if search_obj.publish_end is not None else self.publish_end
 include_netlocs = search_obj.include_netlocs if search_obj.include_netlocs is not None else self.include_netlocs
@@ -603,6 +632,9 @@ class Nosible:
 search_obj.exclude_companies if search_obj.exclude_companies is not None else self.exclude_companies
 )
 
+if not (0.0 <= min_similarity <= 1.0):
+raise ValueError(f"Invalid min_simalarity: {min_similarity}. Must be [0,1].")
+
 # Generate expansions if not provided
 if expansions is None:
 expansions = []
@@ -639,6 +671,9 @@ class Nosible:
 "n_probes": n_probes,
 "n_contextify": n_contextify,
 "algorithm": algorithm,
+"min_similarity": min_similarity,
+"must_include": must_include,
+"must_exclude": must_exclude,
 }
 
 resp = self._post(url="https://www.nosible.ai/search/v1/fast-search", payload=payload)
@@ -699,6 +734,9 @@ class Nosible:
 n_probes: int = 30,
 n_contextify: int = 128,
 algorithm: str = "hybrid-2",
+min_similarity: float = None,
+must_include: list[str] = None,
+must_exclude: list[str] = None,
 autogenerate_expansions: bool = False,
 publish_start: str = None,
 publish_end: str = None,
@@ -728,46 +766,48 @@ class Nosible:
 Optional list of expanded query strings.
 sql_filter : list of str, optional
 Optional SQL WHERE clause filters.
-n_results : int
+n_results : int
 Number of results per query (1,000–10,000).
-n_probes : int
+n_probes : int
 Number of shards to probe.
-n_contextify : int
+n_contextify : int
 Context window size per result.
-algorithm : str
+algorithm : str
 Search algorithm identifier.
-
+min_similarity : float
+Results must have at least this similarity score.
+must_include : list of str
+Only results mentioning these strings will be included.
+must_exclude : list of str
+Any result mentioning these strings will be excluded.
+autogenerate_expansions : bool
 Do you want to generate expansions automatically using a LLM?
 publish_start : str, optional
-
+Start date for when the document was published (ISO format).
 publish_end : str, optional
-
-include_netlocs : list of str, optional
-Domains to include.
-exclude_netlocs : list of str, optional
-Domains to exclude.
+End date for when the document was published (ISO format).
 visited_start : str, optional
-
+Start date for when the document was visited by NOSIBLE (ISO format).
 visited_end : str, optional
-
+End date for when the document was visited by NOSIBLE (ISO format).
 certain : bool, optional
-
-include_languages : list of str, optional
-Languages to include (Max: 50).
-exclude_languages : list of str, optional
-Languages to exclude (Max: 50).
+Only include documents where we are 100% sure of the date.
 include_netlocs : list of str, optional
-
+List of netlocs (domains) to include in the search. (Max: 50)
 exclude_netlocs : list of str, optional
-
+List of netlocs (domains) to exclude in the search. (Max: 50)
+include_languages : list of str, optional
+Languages to include in the search. (Max: 50, ISO 639-1 language codes).
+exclude_languages : list of str, optional
+Language codes to exclude in the search (Max: 50, ISO 639-1 language codes).
 include_companies : list of str, optional
-
+Google KG IDs of public companies to require (Max: 50).
 exclude_companies : list of str, optional
-
+Google KG IDs of public companies to forbid (Max: 50).
 include_docs : list of str, optional
-URL hashes of
+URL hashes of docs to include (Max: 50).
 exclude_docs : list of str, optional
-URL hashes of
+URL hashes of docs to exclude (Max: 50).
 verbose : bool, optional
 Show verbose output, Bulk search will print more information.
 
@@ -786,6 +826,8 @@ class Nosible:
 If neither question nor search are specified.
 RuntimeError
 If the response fails in any way.
+ValueError
+If min_similarity is not [0,1].
 
 Notes
 -----
@@ -794,23 +836,21 @@ class Nosible:
 
 Examples
 --------
->>> from nosible.classes.search import Search
->>> from nosible import Nosible
->>> with Nosible(
-... results = nos.bulk_search(question=
+>>> from nosible.classes.search import Search # doctest: +SKIP
+>>> from nosible import Nosible # doctest: +SKIP
+>>> with Nosible(exclude_netlocs=["bbc.com"]) as nos: # doctest: +SKIP
+... results = nos.bulk_search(question=_get_question(), n_results=2000) # doctest: +SKIP
 ... print(isinstance(results, ResultSet)) # doctest: +SKIP
 ... print(len(results)) # doctest: +SKIP
 True
 2000
-
->>> s = Search(question="OpenAI", n_results=1000) # doctest: +SKIP
+>>> s = Search(question=_get_question(), n_results=1000) # doctest: +SKIP
 >>> with Nosible() as nos: # doctest: +SKIP
 ... results = nos.bulk_search(search=s) # doctest: +SKIP
 ... print(isinstance(results, ResultSet)) # doctest: +SKIP
 ... print(len(results)) # doctest: +SKIP
 True
 1000
-
 >>> nos = Nosible(nosible_api_key="test|xyz") # doctest: +SKIP
 >>> nos.bulk_search() # doctest: +SKIP
 Traceback (most recent call last):
@@ -818,20 +858,18 @@ class Nosible:
 TypeError: Either question or search must be specified
 
 >>> nos = Nosible(nosible_api_key="test|xyz") # doctest: +SKIP
->>> nos.bulk_search(question=
+>>> nos.bulk_search(question=_get_question(), search=Search(question=_get_question())) # doctest: +SKIP
 Traceback (most recent call last):
 ...
 TypeError: Question and search cannot be both specified
-
 >>> nos = Nosible(nosible_api_key="test|xyz") # doctest: +SKIP
->>> nos.bulk_search(question=
+>>> nos.bulk_search(question=_get_question(), n_results=100) # doctest: +SKIP
 Traceback (most recent call last):
 ...
-ValueError: Bulk search must have at least
-
+ValueError: Bulk search must have at least 1000 results per query; use search() for smaller result sets.
 >>> nos = Nosible(nosible_api_key="test|xyz") # doctest: +SKIP
->>> nos.bulk_search(question=
-Traceback (most recent call last):
+>>> nos.bulk_search(question=_get_question(), n_results=10001) # doctest: +SKIP
+Traceback (most recent call last): # doctest: +SKIP
 ...
 ValueError: Bulk search cannot have more than 10000 results per query.
 """
@@ -854,8 +892,17 @@ class Nosible:
 n_probes = search.n_probes if search.n_probes is not None else n_probes
 n_contextify = search.n_contextify if search.n_contextify is not None else n_contextify
 algorithm = search.algorithm if search.algorithm is not None else algorithm
-
+min_similarity = search.min_similarity if search.min_similarity is not None else min_similarity
+min_similarity = min_similarity if min_similarity is not None else 0
+must_include = search.must_include if search.must_include is not None else must_include
+must_include = must_include if must_include is not None else []
+must_exclude = search.must_exclude if search.must_exclude is not None else must_exclude
+must_exclude = must_exclude if must_exclude is not None else []
+autogenerate_expansions = (
+search.autogenerate_expansions
+if search.autogenerate_expansions is not None
 else autogenerate_expansions
+)
 publish_start = search.publish_start if search.publish_start is not None else publish_start
 publish_end = search.publish_end if search.publish_end is not None else publish_end
 include_netlocs = search.include_netlocs if search.include_netlocs is not None else include_netlocs
@@ -876,6 +923,13 @@ class Nosible:
 if autogenerate_expansions is True:
 expansions = self._generate_expansions(question=question)
 
+must_include = must_include if must_include is not None else []
+must_exclude = must_exclude if must_exclude is not None else []
+min_similarity = min_similarity if min_similarity is not None else 0
+
+if not (0.0 <= min_similarity <= 1.0):
+raise ValueError(f"Invalid min_simalarity: {min_similarity}. Must be [0,1].")
+
 # Generate sql_filter if unset
 if sql_filter is None:
 sql_filter = self._format_sql(
@@ -920,6 +974,9 @@ class Nosible:
 "n_probes": n_probes,
 "n_contextify": n_contextify,
 "algorithm": algorithm,
+"min_similarity": min_similarity,
+"must_include": must_include,
+"must_exclude": must_exclude,
 }
 resp = self._post(url="https://www.nosible.ai/search/v1/slow-search", payload=payload)
 try:
@@ -952,20 +1009,139 @@ class Nosible:
 if verbose:
 self.logger.setLevel(previous_level)
 
+def answer(
+self,
+query: str,
+n_results: int = 100,
+min_similarity: float = 0.65,
+model: Union[str, None] = "google/gemini-2.0-flash-001",
+show_context: bool = True,
+) -> str:
+"""
+RAG-style question answering: retrieve top `n_results` via `.search()`
+then answer `query` using those documents as context.
+
+Parameters
+----------
+query : str
+The user’s natural-language question.
+n_results : int
+How many docs to fetch to build the context.
+min_similarity : float
+Results must have at least this similarity score.
+model : str, optional
+Which LLM to call to answer your question.
+show_context : bool, optional
+Do you want the context to be shown?
+
+Returns
+-------
+str
+The LLM’s generated answer, grounded in the retrieved docs.
+
+Raises
+------
+ValueError
+If no API key is configured for the LLM client.
+RuntimeError
+If the LLM call fails or returns an invalid response.
+
+Examples
+--------
+>>> from nosible import Nosible
+>>> with Nosible() as nos:
+... ans = nos.answer(
+... query="How is research governance and decision-making structured between Google and DeepMind?",
+... n_results=100,
+... show_context=True
+... ) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+<BLANKLINE>
+Doc 1
+Title: ...
+>>> print(ans) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+Answer:
+...
+"""
+
+if not self.llm_api_key:
+raise ValueError("An LLM API key is required for answer().")
+
+# Retrieve top documents
+results = self.search(
+question=query,
+n_results=n_results,
+min_similarity=min_similarity,
+)
+
+# Build RAG context
+context = ""
+pieces: list[str] = []
+for idx, result in enumerate(results):
+pieces.append(f"""
+Doc {idx + 1}
+Title: {result.title}
+Similarity Score: {result.similarity * 100:.2f}%
+URL: {result.url}
+Content: {result.content}
+""")
+context = "\n".join(pieces)
+
+if show_context:
+print(textwrap.dedent(context))
+
+# Craft prompt
+prompt = (f"""
+# TASK DESCRIPTION
+
+You are a helpful assistant. Use the following context to answer the question.
+When you use information from a chunk, cite it by referencing its label in square brackets, e.g. [doc3].
+
+## Question
+{query}
+
+## Context
+{context}
+"""
+)
+
+# Call LLM
+client = OpenAI(base_url=self.openai_base_url, api_key=self.llm_api_key)
+try:
+response = client.chat.completions.create(
+model = model,
+messages = [{"role": "user", "content": prompt}],
+)
+except Exception as e:
+raise RuntimeError(f"LLM API error: {e}") from e
+
+# Validate response shape
+choices = getattr(response, "choices", None)
+if not choices or not hasattr(choices[0], "message"):
+raise RuntimeError(f"Invalid LLM response format: {response!r}")
+
+# Return the generated text
+return "Answer:\n" + response.choices[0].message.content.strip()
+
 @_rate_limited("visit")
-def visit(
+def visit(
+self,
+html: str = "",
+recrawl: bool = False,
+render: bool = False,
+url: str = None
+) -> WebPageData:
 """
 Visit a given URL and return a structured WebPageData object for the page.
 
 Parameters
 ----------
-html : str
+html : str
 Raw HTML to process instead of fetching.
-recrawl : bool
+recrawl : bool
 If True, force a fresh crawl.
-render : bool
+render : bool
 If True, allow JavaScript rendering before extraction.
-url : str
+url : str
 The URL to fetch and parse.
 
 Returns
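The `answer()` helper added above chains `search()` into an OpenAI-compatible chat completion. A small sketch of calling it with a non-default model and handling the errors its docstring declares; the model identifier is only an example of an OpenRouter-style name, and an LLM API key is assumed:

>>> from nosible import Nosible
>>> with Nosible(llm_api_key="sk-...") as nos:  # doctest: +SKIP
...     try:
...         ans = nos.answer(
...             query="How is research governance structured between Google and DeepMind?",
...             n_results=50,
...             min_similarity=0.7,
...             model="openai/gpt-4o",
...             show_context=False,
...         )
...     except ValueError:
...         print("No LLM API key configured.")
...     except RuntimeError as err:
...         print(f"LLM call failed: {err}")
...     else:
...         print(ans)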
@@ -986,26 +1162,24 @@ class Nosible:
 
 Examples
 --------
->>> from nosible import Nosible
->>> with Nosible() as nos:
-... out = nos.visit(url="https://www.dailynewsegypt.com/2023/09/08/g20-and-its-summits/")
-... print(isinstance(out,
-... print(hasattr(out, "languages"))
-... print(hasattr(out, "page"))
+>>> from nosible import Nosible
+>>> with Nosible() as nos:
+... out = nos.visit(url="https://www.dailynewsegypt.com/2023/09/08/g20-and-its-summits/")
+... print(isinstance(out, WebPageData))
+... print(hasattr(out, "languages"))
+... print(hasattr(out, "page"))
 True
 True
 True
->>> with Nosible() as nos:
-... out = nos.visit()
-... print(isinstance(out, type(WebPageData)))
-... print(hasattr(out, "languages"))
-... print(hasattr(out, "page")) # doctest: +
+>>> with Nosible() as nos:
+... out = nos.visit()
+... print(isinstance(out, type(WebPageData)))
+... print(hasattr(out, "languages"))
+... print(hasattr(out, "page")) # doctest: +ELLIPSIS
 Traceback (most recent call last):
 ...
 TypeError: URL must be provided
 """
-
-# self._enforce("visit")
 if url is None:
 raise TypeError("URL must be provided")
 response = self._post(
@@ -1018,7 +1192,7 @@ class Nosible:
 self.logger.error(f"Failed to parse JSON from response: {e}")
 raise ValueError("Invalid JSON response from server") from e
 
-if data == {
+if data == {"message": "Sorry, the URL could not be fetched."}:
 raise ValueError("The URL could not be found.")
 
 if "response" not in data:
@@ -1033,12 +1207,84 @@ class Nosible:
 metadata=response_data.get("metadata"),
 page=response_data.get("page"),
 request=response_data.get("request"),
-snippets=response_data.get("snippets"),
+snippets=SnippetSet.from_dict(response_data.get("snippets", {})),
 statistics=response_data.get("statistics"),
 structured=response_data.get("structured"),
 url_tree=response_data.get("url_tree"),
 )
 
+@_rate_limited("fast")
+def trend(
+self,
+query: str,
+start_date: Optional[str] = None,
+end_date: Optional[str] = None,
+sql_filter: Optional[str] = None,
+) -> dict:
+"""
+Extract a trend showing the volume of news surrounding your query.
+
+Parameters
+----------
+query : str
+The search term we would like to see a trend for.
+start_date : str, optional
+ISO‐format start date (YYYY-MM-DD) of the trend window.
+end_date : str, optional
+ISO‐format end date (YYYY-MM-DD) of the trend window.
+sql_filter : str, optional
+An optional SQL filter to narrow down the trend query
+
+Returns
+-------
+dict
+The JSON-decoded trend data returned by the server.
+
+Examples
+--------
+>>> from nosible import Nosible
+>>> with Nosible() as nos:
+... trends_data = nos.trend("Christmas Shopping", start_date="2005-01-01", end_date="2020-12-31")
+... print(trends_data) # doctest: +ELLIPSIS
+{'2005-01-31': ...'2020-12-31': ...}
+"""
+# Validate dates
+if start_date is not None:
+self._validate_date_format(start_date, "start_date")
+if end_date is not None:
+self._validate_date_format(end_date, "end_date")
+
+payload: dict[str, str] = {"query": query}
+
+if sql_filter is not None:
+payload["sql_filter"] = sql_filter
+else:
+payload["sql_filter"] = "SELECT loc, published FROM engine"
+
+# Send the POST to the /trend endpoint
+response = self._post(
+url="https://www.nosible.ai/search/v1/trend",
+payload=payload,
+)
+# Will raise ValueError on rate-limit or auth errors
+response.raise_for_status()
+payload = response.json().get("response", {})
+
+# if no window requested, return everything
+if start_date is None and end_date is None:
+return payload
+
+# Filter by ISO‐date keys
+filtered: dict[str, float] = {}
+for date_str, value in payload.items():
+if start_date and date_str < start_date:
+continue
+if end_date and date_str > end_date:
+continue
+filtered[date_str] = value
+
+return filtered
+
 def version(self) -> str:
 """
 Retrieve the current version information for the Nosible API.
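Because `trend()` returns a plain dict keyed by ISO dates (as in the doctest above), post-processing needs nothing beyond standard Python. A sketch that finds the busiest period in the requested window, assuming the values are numeric volumes as the method's `dict[str, float]` annotation suggests:

>>> from nosible import Nosible
>>> with Nosible() as nos:  # doctest: +SKIP
...     data = nos.trend("Christmas Shopping", start_date="2015-01-01", end_date="2020-12-31")
...     peak = max(data, key=data.get)
...     print(peak, data[peak])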
@@ -1097,10 +1343,6 @@ class Nosible:
 
 Raises
 ------
-ValueError
-If the API returns an unexpected message.
-requests.HTTPError
-If the HTTP request fails.
 
 Examples
 --------
@@ -1121,10 +1363,13 @@ class Nosible:
 return False
 if msg == "The URL could not be retrieved.":
 return False
+# If we reach here, the response is unexpected
+return False
 except requests.HTTPError:
 return False
 except:
 return False
+
 def preflight(self, url: str = None) -> str:
 """
 Run a preflight check for crawling/preprocessing on a URL.
@@ -1180,40 +1425,47 @@ class Nosible:
 
 Examples
 --------
->>> nos = Nosible(nosible_api_key="test|xyz")
->>> print(nos.get_rate_limits()) # doctest: +
-
-
-
-
-
-
-
-
-
+>>> nos = Nosible(nosible_api_key="test|xyz")
+>>> print(nos.get_rate_limits()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+Below are the rate limits for all NOSIBLE plans.
+To upgrade your package, visit https://www.nosible.ai/products.
+<BLANKLINE>
+Unless otherwise indicated, bulk searches are limited to one-at-a-time per API key.
+<BLANKLINE>
+Free: (Your current plan)
+| Endpoint | Per Month | Per Minute | Effective CPM |
+| ----------- | --------- | ---------- | ------------- |
+| Search | 3000 | 60 | $4.00 |
+| URL Visits | 300 | 60 | $4.00 |
+| Bulk Search | 300 | 60 | $4.00 |
+<BLANKLINE>
+Basic ($49p/m):
+| Endpoint | Per Month | Per Minute | Effective CPM |
 ...
 """
 # Human-friendly plan names
 display = {
 "test": "Free",
-"basic": "Basic",
-"pro": "Pro",
-"pro+": "Pro+",
-"bus": "Business",
-"bus+": "Business+",
-"ent": "Enterprise",
+"basic": "Basic ($49p/m)",
+"pro": "Pro ($199p/m)",
+"pro+": "Pro+ ($799p/m)",
+"bus": "Business ($3999p/m)",
+"bus+": "Business+ ($7499p/m)",
+"ent": "Enterprise ($14999p/m)",
 }
 
 # Human-friendly endpoint names
-endpoint_name = {"fast": "
+endpoint_name = {"fast": "Search", "visit": "URL Visits", "slow": "Bulk Search"}
 
 out = [
 "Below are the rate limits for all NOSIBLE plans.",
 "To upgrade your package, visit https://www.nosible.ai/products.\n",
+"Unless otherwise indicated, bulk searches are limited to one-at-a-time per API key.\n"
 ]
 
 user_plan = self._get_user_plan()
 current_plan = ""
+cpm_counter = 4.0
 
 # Preserve the order you care about:
 for plan in ["test", "basic", "pro", "pro+", "bus", "bus+", "ent"]:
@@ -1222,17 +1474,19 @@ class Nosible:
 current_plan = " (Your current plan)"
 
 out.append(f"{name}:{current_plan}")
-out.append("| Endpoint | Per Month | Per
-out.append("| ----------- | --------- |
+out.append("| Endpoint | Per Month | Per Minute | Effective CPM |")
+out.append("| ----------- | --------- | ---------- | ------------- |")
 
 for ep in ["fast", "visit", "slow"]:
 buckets = PLAN_RATE_LIMITS[plan][ep]
 # Find minute & day
 minute = next(limit for limit, i in buckets if i == 60)
-
-
-
+month = next(limit for limit, i in buckets if i == 24 * 3600 * 30)
+cpm = f"${cpm_counter:.2f}"
+
+out.append(f"| {endpoint_name[ep]:<11} | {month:>9} | {minute:>10} | {cpm:>13} |")
 
+cpm_counter = cpm_counter - 0.5
 out.append("") # Blank line
 current_plan = ""
 
@@ -1243,10 +1497,6 @@ class Nosible:
 Close the Nosible client, shutting down the HTTP session
 and thread pool to release network and threading resources.
 
-Returns
--------
-None
-
 Examples
 --------
 >>> from nosible import Nosible
@@ -1292,6 +1542,8 @@ class Nosible:
 If the user API key is invalid.
 ValueError
 If the user hits their rate limit.
+ValueError
+If the user is making too many concurrent searches.
 ValueError
 If an unexpected error occurs.
 ValueError
@@ -1319,12 +1571,17 @@ class Nosible:
 content_type = response.headers.get("Content-Type", "")
 if content_type.startswith("application/json"):
 body = response.json()
+if isinstance(body, list):
+body = body[0] # NOSIBLE returns a list of errors
+print(body)
 if body.get("type") == "string_too_short":
 raise ValueError("Your API key is not valid: Too Short.")
 else:
 raise ValueError("You made a bad request.")
 if response.status_code == 429:
 raise ValueError("You have hit your rate limit.")
+if response.status_code == 409:
+raise ValueError("Too many concurrent searches.")
 if response.status_code == 500:
 raise ValueError("An unexpected error occurred.")
 if response.status_code == 502:
@@ -1354,16 +1611,16 @@ class Nosible:
 
 Examples
 --------
->>> nos = Nosible(nosible_api_key="test+|xyz") # doctest: +
+>>> nos = Nosible(nosible_api_key="test+|xyz") # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
 Traceback (most recent call last):
 ...
-ValueError: test+ is not a valid plan prefix
+ValueError: Your API key is not valid: test+ is not a valid plan prefix.
 """
 # Split off anything after the first '|'
 prefix = (self.nosible_api_key or "").split("|", 1)[0]
 
-# Map prefixes ->
-plans = {"test", "basic", "pro", "pro+", "bus", "bus+", "ent"}
+# Map prefixes -> plan names
+plans = {"test", "basic", "pro", "pro+", "bus", "bus+", "ent", "chat"}
 
 if prefix not in plans:
 raise ValueError(f"Your API key is not valid: {prefix} is not a valid plan prefix.")
@@ -1393,11 +1650,10 @@ class Nosible:
 
 Examples
 --------
-
->>>
->>> nos =
->>> nos.
->>> nos._generate_expansions("anything") # doctest: +SKIP
+>>> from nosible import Nosible
+>>> nos = Nosible(llm_api_key=None)
+>>> nos.llm_api_key = None
+>>> nos._generate_expansions("anything") # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
 Traceback (most recent call last):
 ...
 ValueError: LLM API key is required for generating expansions.
@@ -1486,6 +1742,49 @@ class Nosible:
 self.logger.debug(f"Successful expansions: {expansions}")
 return expansions
 
+@staticmethod
+def _validate_date_format(string: str, name: str):
+"""
+Check that a date string is valid ISO format (YYYY-MM-DD or full ISO timestamp).
+
+Parameters
+----------
+string : str
+The date string to validate.
+name : str
+The name of the parameter being validated, used in the error message.
+
+Raises
+------
+ValueError
+If `string` is not a valid ISO 8601 date. Error message will include
+the `name` and the offending string.
+Examples
+--------
+>>> # valid date-only format
+>>> Nosible._validate_date_format("2023-12-31", "publish_start")
+>>> # valid full timestamp
+>>> Nosible._validate_date_format("2023-12-31T15:30:00", "visited_end")
+>>> # invalid month
+>>> Nosible._validate_date_format("2023-13-01", "publish_end")
+Traceback (most recent call last):
+...
+ValueError: Invalid date for 'publish_end': '2023-13-01'. Expected ISO format 'YYYY-MM-DD'.
+>>> # wrong separator
+>>> Nosible._validate_date_format("2023/12/31", "visited_start")
+Traceback (most recent call last):
+...
+ValueError: Invalid date for 'visited_start': '2023/12/31'. Expected ISO format 'YYYY-MM-DD'.
+"""
+try:
+# datetime.fromisoformat accepts both YYYY-MM-DD and full timestamps
+parsed = datetime.fromisoformat(string)
+except Exception:
+raise ValueError(
+f"Invalid date for '{name}': {string!r}. "
+"Expected ISO format 'YYYY-MM-DD'."
+)
+
 def _format_sql(
 self,
 publish_start: str = None,
@@ -1508,35 +1807,31 @@ class Nosible:
 Parameters
 ----------
 publish_start : str, optional
-
+Start date for when the document was published (ISO format).
 publish_end : str, optional
-
-include_netlocs : list of str, optional
-Domains to whitelist.
-exclude_netlocs : list of str, optional
-Domains to blacklist.
+End date for when the document was published (ISO format).
 visited_start : str, optional
-
+Start date for when the document was visited by NOSIBLE (ISO format).
 visited_end : str, optional
-
+End date for when the document was visited by NOSIBLE (ISO format).
 certain : bool, optional
-
-include_languages : list of str, optional
-Languages to include (Max: 50).
-exclude_languages : list of str, optional
-Languages to exclude (Max: 50).
+Only include documents where we are 100% sure of the date.
 include_netlocs : list of str, optional
-
+List of netlocs (domains) to include in the search. (Max: 50)
 exclude_netlocs : list of str, optional
-
+List of netlocs (domains) to exclude in the search. (Max: 50)
+include_languages : list of str, optional
+Languages to include in the search. (Max: 50, ISO 639-1 language codes).
+exclude_languages : list of str, optional
+Language codes to exclude in the search (Max: 50, ISO 639-1 language codes).
 include_companies : list of str, optional
-
+Google KG IDs of public companies to require (Max: 50).
 exclude_companies : list of str, optional
-
+Google KG IDs of public companies to forbid (Max: 50).
 include_docs : list of str, optional
-URL hashes of
+URL hashes of docs to include (Max: 50).
 exclude_docs : list of str, optional
-URL hashes of
+URL hashes of docs to exclude (Max: 50).
 
 Returns
 -------
@@ -1545,23 +1840,31 @@ class Nosible:
 
 Raises
 ------
-
 ValueError
 If more than 50 items in a filter are given.
 """
+for name, value in [
+("publish_start", publish_start),
+("publish_end", publish_end),
+("visited_start", visited_start),
+("visited_end", visited_end),
+]:
+if value is not None:
+self._validate_date_format(string=value, name=name)
+
 # Validate list lengths
-for name,
-(
-(
-(
-(
-(
-(
-(
-(
+for name, value in [
+("include_netlocs", include_netlocs),
+("exclude_netlocs", exclude_netlocs),
+("include_languages", include_languages),
+("exclude_languages", exclude_languages),
+("include_companies", include_companies),
+("exclude_companies", exclude_companies),
+("include_docs", include_docs),
+("exclude_docs", exclude_docs),
 ]:
-if
-raise ValueError(f"Too many items for '{name}' filter ({len(
+if value is not None and len(value) > 50:
+raise ValueError(f"Too many items for '{name}' filter ({len(value)}); maximum allowed is 50.")
 
 sql = ["SELECT loc FROM engine"]
 clauses: list[str] = []
@@ -1595,10 +1898,10 @@ class Nosible:
 variants = set()
 for n in include_netlocs:
 variants.add(n)
-if n.startswith(
+if n.startswith("www."):
 variants.add(n[4:])
 else:
-variants.add(
+variants.add("www." + n)
 in_list = ", ".join(f"'{v}'" for v in sorted(variants))
 clauses.append(f"netloc IN ({in_list})")
 
@@ -1607,10 +1910,10 @@ class Nosible:
 variants = set()
 for n in exclude_netlocs:
 variants.add(n)
-if n.startswith(
+if n.startswith("www."):
 variants.add(n[4:])
 else:
-variants.add(
+variants.add("www." + n)
 ex_list = ", ".join(f"'{v}'" for v in sorted(variants))
 clauses.append(f"netloc NOT IN ({ex_list})")
 
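For reference, the netloc handling above expands every supplied domain into both its bare and `www.`-prefixed forms before building the SQL clause. A standalone sketch of that same logic (hypothetical inputs, outside the client) and the clause it would emit:

# Standalone reproduction of the netloc-variant logic shown in the hunks above.
include_netlocs = ["nytimes.com"]  # hypothetical filter value
variants = set()
for n in include_netlocs:
    variants.add(n)
    if n.startswith("www."):
        variants.add(n[4:])       # also match the bare domain
    else:
        variants.add("www." + n)  # also match the www-prefixed domain
in_list = ", ".join(f"'{v}'" for v in sorted(variants))
print(f"netloc IN ({in_list})")
# -> netloc IN ('nytimes.com', 'www.nytimes.com')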
@@ -1703,7 +2006,7 @@ class Nosible:
 except Exception:
 return False
 
-def __enter__(self):
+def __enter__(self) -> "Nosible":
 """
 Enter the context manager, returning this client instance.
 
@@ -1714,32 +2017,46 @@ class Nosible:
 """
 return self
 
-def __exit__(
+def __exit__(
+self,
+_exc_type: typing.Optional[type[BaseException]],
+_exc_val: typing.Optional[BaseException],
+_exc_tb: typing.Optional[types.TracebackType],
+) -> typing.Optional[bool]:
 """
-
+Always clean up (self.close()), but let exceptions propagate.
+Return True only if you really want to suppress an exception.
 
 Parameters
 ----------
-
-
-
-
-
-
+_exc_type : Optional[type[BaseException]]
+The type of the exception raised, if any.
+_exc_val : Optional[BaseException]
+The exception instance, if any.
+_exc_tb : Optional[types.TracebackType]
+The traceback object, if any.
 
 Returns
 -------
-
+Optional[bool]
+False to propagate exceptions, True to suppress them.
 """
-
+try:
+self.close()
+except Exception as cleanup_err:
+# optional: log or re-raise, but don’t hide the original exc
+print(f"Cleanup failed: {cleanup_err!r}")
+# Return False (or None) => exceptions inside the with‐block are re-raised.
+return False
 
 def __del__(self):
 """
 Destructor to ensure resources are cleaned up if not explicitly closed.
 
-Returns
--------
-None
 """
-#
-
+# Only close if interpreter is fully alive
+if not getattr(sys, "is_finalizing", lambda: False)():
+try:
+self.close()
+except Exception:
+pass