nosible 0.1.8__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nosible/nosible_client.py CHANGED
@@ -2,11 +2,15 @@ import gzip
2
2
  import json
3
3
  import logging
4
4
  import os
5
+ import sys
6
+ import textwrap
5
7
  import time
6
- import traceback
8
+ import types
9
+ import typing
7
10
  from collections.abc import Iterator
8
- from concurrent.futures import ThreadPoolExecutor, as_completed
9
- from typing import Union
11
+ from concurrent.futures import ThreadPoolExecutor
12
+ from datetime import datetime
13
+ from typing import Union, Optional
10
14
 
11
15
  import polars as pl
12
16
  import requests
@@ -25,8 +29,10 @@ from tenacity import (
25
29
  from nosible.classes.result_set import ResultSet
26
30
  from nosible.classes.search import Search
27
31
  from nosible.classes.search_set import SearchSet
32
+ from nosible.classes.snippet_set import SnippetSet
28
33
  from nosible.classes.web_page import WebPageData
29
34
  from nosible.utils.json_tools import json_loads
35
+ from nosible.utils.question_builder import _get_question
30
36
  from nosible.utils.rate_limiter import PLAN_RATE_LIMITS, RateLimiter, _rate_limited
31
37
 
32
38
  # Set up a module‐level logger.
@@ -47,37 +53,33 @@ class Nosible:
47
53
  llm_api_key : str, optional
48
54
  API key for LLM-based query expansions.
49
55
  openai_base_url : str
50
- Base URL for the OpenAI-compatible LLM API.
51
- sentiment_model : str
52
- Model to use for sentiment analysis and expansions.
56
+ Base URL for the OpenAI-compatible LLM API. (default is OpenRouter's API endpoint)
57
+ sentiment_model : str, optional
58
+ Model to use for sentiment analysis (default is "openai/gpt-4o").
53
59
  timeout : int
54
60
  Request timeout for HTTP calls.
55
- retries : int, default=5
61
+ retries : int
56
62
  Number of retry attempts for transient HTTP errors.
57
- concurrency : int, default=10
63
+ concurrency : int
58
64
  Maximum concurrent search requests.
59
65
  publish_start : str, optional
60
- Earliest publish date filter (ISO formatted date).
66
+ Start date for when the document was published (ISO format).
61
67
  publish_end : str, optional
62
- Latest publish date filter (ISO formatted date).
63
- include_netlocs : list of str, optional
64
- Domains to include.
65
- exclude_netlocs : list of str, optional
66
- Domains to exclude.
68
+ End date for when the document was published (ISO format).
67
69
  visited_start : str, optional
68
- Earliest visit date filter (ISO formatted date).
70
+ Start date for when the document was visited by NOSIBLE (ISO format).
69
71
  visited_end : str, optional
70
- Latest visit date filter (ISO formatted date).
72
+ End date for when the document was visited by NOSIBLE (ISO format).
71
73
  certain : bool, optional
72
- True if we are 100% sure of the date.
73
- include_languages : list of str, optional
74
- Language codes to include (Max: 50).
75
- exclude_languages : list of str, optional
76
- Language codes to exclude (Max: 50).
74
+ Only include documents where we are 100% sure of the date.
77
75
  include_netlocs : list of str, optional
78
- Only include results from these domains (Max: 50).
76
+ List of netlocs (domains) to include in the search. (Max: 50)
79
77
  exclude_netlocs : list of str, optional
80
- Exclude results from these domains (Max: 50).
78
+ List of netlocs (domains) to exclude in the search. (Max: 50)
79
+ include_languages : list of str, optional
80
+ Languages to include in the search. (Max: 50, ISO 639-1 language codes).
81
+ exclude_languages : list of str, optional
82
+ Language codes to exclude in the search (Max: 50, ISO 639-1 language codes).
81
83
  include_companies : list of str, optional
82
84
  Google KG IDs of public companies to require (Max: 50).
83
85
  exclude_companies : list of str, optional
@@ -86,10 +88,6 @@ class Nosible:
86
88
  URL hashes of docs to include (Max: 50).
87
89
  exclude_docs : list of str, optional
88
90
  URL hashes of docs to exclude (Max: 50).
89
- openai_base_url : str, optional
90
- Base URL for the OpenAI API (default is OpenRouter).
91
- sentiment_model : str, optional
92
- Model to use for sentiment analysis (default is "openai/gpt-4o").
93
91
 
94
92
  Notes
95
93
  -----
@@ -173,7 +171,7 @@ class Nosible:
173
171
  reraise=True,
174
172
  stop=stop_after_attempt(self.retries) | stop_after_delay(self.timeout),
175
173
  wait=wait_exponential(multiplier=1, min=1, max=10),
176
- retry=retry_if_exception_type(Exception),
174
+ retry=retry_if_exception_type(requests.exceptions.RequestException),
177
175
  before_sleep=before_sleep_log(self.logger, logging.WARNING),
178
176
  )(self._generate_expansions)
179
177
 
@@ -212,6 +210,9 @@ class Nosible:
212
210
  n_probes: int = 30,
213
211
  n_contextify: int = 128,
214
212
  algorithm: str = "hybrid-2",
213
+ min_similarity: float = None,
214
+ must_include: list[str] = None,
215
+ must_exclude: list[str] = None,
215
216
  autogenerate_expansions: bool = False,
216
217
  publish_start: str = None,
217
218
  publish_end: str = None,
@@ -243,38 +244,40 @@ class Nosible:
243
244
  List of LLM‐generated expansions.
244
245
  sql_filter : list of str, optional
245
246
  SQL‐style filter clauses.
246
- n_results : int, default=100
247
+ n_results : int
247
248
  Max number of results (max 100).
248
- n_probes : int, default=30
249
+ n_probes : int
249
250
  Number of index shards to probe.
250
- n_contextify : int, default=128
251
+ n_contextify : int
251
252
  Context window size per result.
252
- algorithm : str, default="hybrid-2"
253
+ algorithm : str
253
254
  Search algorithm type.
254
- autogenerate_expansions : bool, default=False
255
+ min_similarity : float
256
+ Results must have at least this similarity score.
257
+ must_include : list of str
258
+ Only results mentioning these strings will be included.
259
+ must_exclude : list of str
260
+ Any result mentioning these strings will be excluded.
261
+ autogenerate_expansions : bool
255
262
  Do you want to generate expansions automatically using a LLM?
256
263
  publish_start : str, optional
257
- Earliest publish date filter (ISO formatted date).
264
+ Start date for when the document was published (ISO format).
258
265
  publish_end : str, optional
259
- Latest publish date filter (ISO formatted date).
260
- include_netlocs : list of str, optional
261
- Domains to include.
262
- exclude_netlocs : list of str, optional
263
- Domains to exclude.
266
+ End date for when the document was published (ISO format).
264
267
  visited_start : str, optional
265
- Earliest visit date filter (ISO formatted date).
268
+ Start date for when the document was visited by NOSIBLE (ISO format).
266
269
  visited_end : str, optional
267
- Latest visit date filter (ISO formatted date).
270
+ End date for when the document was visited by NOSIBLE (ISO format).
268
271
  certain : bool, optional
269
- True if we are 100% sure of the date.
270
- include_languages : list of str, optional
271
- Language codes to include (Max: 50).
272
- exclude_languages : list of str, optional
273
- Language codes to exclude (Max: 50).
272
+ Only include documents where we are 100% sure of the date.
274
273
  include_netlocs : list of str, optional
275
- Only include results from these domains (Max: 50).
274
+ List of netlocs (domains) to include in the search. (Max: 50)
276
275
  exclude_netlocs : list of str, optional
277
- Exclude results from these domains (Max: 50).
276
+ List of netlocs (domains) to exclude in the search. (Max: 50)
277
+ include_languages : list of str, optional
278
+ Languages to include in the search. (Max: 50, ISO 639-1 language codes).
279
+ exclude_languages : list of str, optional
280
+ Language codes to exclude in the search (Max: 50, ISO 639-1 language codes).
278
281
  include_companies : list of str, optional
279
282
  Google KG IDs of public companies to require (Max: 50).
280
283
  exclude_companies : list of str, optional
@@ -297,6 +300,8 @@ class Nosible:
297
300
  If neither question nor search are specified
298
301
  RuntimeError
299
302
  If the response fails in any way.
303
+ ValueError
304
+ If `n_results` is greater than 100.
300
305
 
301
306
  Notes
302
307
  -----
@@ -342,6 +347,9 @@ class Nosible:
342
347
  n_probes=n_probes,
343
348
  n_contextify=n_contextify,
344
349
  algorithm=algorithm,
350
+ min_similarity=min_similarity,
351
+ must_include=must_include,
352
+ must_exclude=must_exclude,
345
353
  autogenerate_expansions=autogenerate_expansions,
346
354
  publish_start=publish_start,
347
355
  publish_end=publish_end,
@@ -379,6 +387,9 @@ class Nosible:
379
387
  n_probes: int = 30,
380
388
  n_contextify: int = 128,
381
389
  algorithm: str = "hybrid-2",
390
+ min_similarity: float = None,
391
+ must_include: list[str] = None,
392
+ must_exclude: list[str] = None,
382
393
  autogenerate_expansions: bool = False,
383
394
  publish_start: str = None,
384
395
  publish_end: str = None,
@@ -407,48 +418,50 @@ class Nosible:
407
418
  List of expansion terms to use for each search.
408
419
  sql_filter : list of str, optional
409
420
  SQL-like filters to apply to the search.
410
- n_results : int, default=100
421
+ n_results : int
411
422
  Number of results to return per search.
412
- n_probes : int, default=30
423
+ n_probes : int
413
424
  Number of probes to use for the search algorithm.
414
- n_contextify : int, default=128
425
+ n_contextify : int
415
426
  Context window size for the search.
416
- algorithm : str, default="hybrid-2"
427
+ algorithm : str
417
428
  Search algorithm to use.
418
- autogenerate_expansions : bool, default=False
419
- Do you want to generate expansions automatically using a LLM?
429
+ min_similarity : float
430
+ Results must have at least this similarity score.
431
+ must_include : list of str
432
+ Only results mentioning these strings will be included.
433
+ must_exclude : list of str
434
+ Any result mentioning these strings will be excluded.
435
+ autogenerate_expansions : bool
436
+ Do you want to generate expansions automatically using a LLM?
420
437
  publish_start : str, optional
421
- Filter results published after this date (ISO formatted date).
438
+ Start date for when the document was published (ISO format).
422
439
  publish_end : str, optional
423
- Filter results published before this date (ISO formatted date).
424
- include_netlocs : list of str, optional
425
- Only include results from these domains.
426
- exclude_netlocs : list of str, optional
427
- Exclude results from these domains.
440
+ End date for when the document was published (ISO format).
428
441
  visited_start : str, optional
429
- Only include results visited after this date (ISO formatted date).
442
+ Start date for when the document was visited by NOSIBLE (ISO format).
430
443
  visited_end : str, optional
431
- Only include results visited before this date (ISO formatted date).
444
+ End date for when the document was visited by NOSIBLE (ISO format).
432
445
  certain : bool, optional
433
- Only include results with high certainty.
446
+ Only include documents where we are 100% sure of the date.
447
+ include_netlocs : list of str, optional
448
+ List of netlocs (domains) to include in the search. (Max: 50)
449
+ exclude_netlocs : list of str, optional
450
+ List of netlocs (domains) to exclude in the search. (Max: 50)
434
451
  include_languages : list of str, optional
435
- Only include results in these languages (Max: 50).
452
+ Languages to include in the search. (Max: 50, ISO 639-1 language codes).
436
453
  exclude_languages : list of str, optional
437
- Exclude results in these languages (Max: 50).
454
+ Language codes to exclude in the search (Max: 50, ISO 639-1 language codes).
438
455
  include_companies : list of str, optional
439
- Only include results from these companies (Max: 50).
456
+ Google KG IDs of public companies to require (Max: 50).
440
457
  exclude_companies : list of str, optional
441
- Exclude results from these companies (Max: 50).
442
- include_netlocs : list of str, optional
443
- Only include results from these domains (Max: 50).
444
- exclude_netlocs : list of str, optional
445
- Exclude results from these domains (Max: 50).
458
+ Google KG IDs of public companies to forbid (Max: 50).
446
459
  include_docs : list of str, optional
447
- URL hashes of documents to include (Max: 50).
460
+ URL hashes of docs to include (Max: 50).
448
461
  exclude_docs : list of str, optional
449
- URL hashes of documents to exclude (Max: 50).
462
+ URL hashes of docs to exclude (Max: 50).
450
463
 
451
- Yields
464
+ Returns
452
465
  ------
453
466
  ResultSet or None
454
467
  Each completed search’s results; if a search fails, the failure is logged and its exception is re-raised.
@@ -461,8 +474,6 @@ class Nosible:
461
474
  If both queries and searches are specified.
462
475
  TypeError
463
476
  If neither queries nor searches are specified.
464
- RuntimeError
465
- If the response fails in any way.
466
477
 
467
478
  Notes
468
479
  -----
@@ -473,7 +484,10 @@ class Nosible:
473
484
  --------
474
485
  >>> from nosible import Nosible
475
486
  >>> queries = SearchSet(
476
- ... [Search(question="Hedge funds seek to expand into private credit", n_results=5), Search(question="How have the Trump tariffs impacted the US economy?", n_results=5)]
487
+ ... [
488
+ ... Search(question="Hedge funds seek to expand into private credit", n_results=5),
489
+ ... Search(question="How have the Trump tariffs impacted the US economy?", n_results=5),
490
+ ... ]
477
491
  ... )
478
492
  >>> with Nosible() as nos:
479
493
  ... results_list = list(nos.searches(searches=queries))
@@ -484,10 +498,14 @@ class Nosible:
484
498
  True True
485
499
  True True
486
500
  >>> with Nosible() as nos:
487
- ... results_list_str = list(nos.searches(questions=[
488
- ... "What are the terms of the partnership between Microsoft and OpenAI?",
489
- ... "What are the terms of the partnership between Volkswagen and Uber?"
490
- ... ]))
501
+ ... results_list_str = list(
502
+ ... nos.searches(
503
+ ... questions=[
504
+ ... "What are the terms of the partnership between Microsoft and OpenAI?",
505
+ ... "What are the terms of the partnership between Volkswagen and Uber?",
506
+ ... ]
507
+ ... )
508
+ ... )
491
509
  >>> nos = Nosible(nosible_api_key="test|xyz") # doctest: +ELLIPSIS
492
510
  >>> nos.searches() # doctest: +ELLIPSIS
493
511
  Traceback (most recent call last):
@@ -515,6 +533,9 @@ class Nosible:
515
533
  n_probes=n_probes,
516
534
  n_contextify=n_contextify,
517
535
  algorithm=algorithm,
536
+ min_similarity=min_similarity,
537
+ must_include=must_include,
538
+ must_exclude=must_exclude,
518
539
  autogenerate_expansions=autogenerate_expansions,
519
540
  publish_start=publish_start,
520
541
  publish_end=publish_end,
@@ -538,7 +559,8 @@ class Nosible:
538
559
  yield future.result()
539
560
  except Exception as e:
540
561
  self.logger.warning(f"Search failed: {e!r}")
541
- yield None
562
+ raise
563
+
542
564
  return _run_generator()
543
565
 
544
566
  @_rate_limited("fast")
@@ -560,6 +582,8 @@ class Nosible:
560
582
  ------
561
583
  ValueError
562
584
  If `n_results` > 100.
585
+ ValueError
586
+ If `min_similarity` is not in the range [0, 1].
563
587
 
564
588
  Examples
565
589
  --------
@@ -573,7 +597,7 @@ class Nosible:
573
597
  ValueError: Search can not have more than 100 results - Use bulk search instead.
574
598
  """
575
599
  # --------------------------------------------------------------------------------------------------------------
576
- # Setting search params. Individual search will overide Nosible defaults.
600
+ # Setting search params. Individual search will override Nosible defaults.
577
601
  # --------------------------------------------------------------------------------------------------------------
578
602
  question = search_obj.question # No default
579
603
  expansions = search_obj.expansions if search_obj.expansions is not None else [] # Default to empty list
@@ -582,7 +606,12 @@ class Nosible:
582
606
  n_probes = search_obj.n_probes if search_obj.n_probes is not None else 30
583
607
  n_contextify = search_obj.n_contextify if search_obj.n_contextify is not None else 128
584
608
  algorithm = search_obj.algorithm if search_obj.algorithm is not None else "hybrid-2"
585
- autogenerate_expansions = search_obj.autogenerate_expansions if search_obj.autogenerate_expansions is not None else False
609
+ min_similarity = search_obj.min_similarity if search_obj.min_similarity is not None else 0
610
+ must_include = search_obj.must_include if search_obj.must_include is not None else []
611
+ must_exclude = search_obj.must_exclude if search_obj.must_exclude is not None else []
612
+ autogenerate_expansions = (
613
+ search_obj.autogenerate_expansions if search_obj.autogenerate_expansions is not None else False
614
+ )
586
615
  publish_start = search_obj.publish_start if search_obj.publish_start is not None else self.publish_start
587
616
  publish_end = search_obj.publish_end if search_obj.publish_end is not None else self.publish_end
588
617
  include_netlocs = search_obj.include_netlocs if search_obj.include_netlocs is not None else self.include_netlocs
@@ -603,6 +632,9 @@ class Nosible:
603
632
  search_obj.exclude_companies if search_obj.exclude_companies is not None else self.exclude_companies
604
633
  )
605
634
 
635
+ if not (0.0 <= min_similarity <= 1.0):
636
+ raise ValueError(f"Invalid min_similarity: {min_similarity}. Must be [0,1].")
637
+
606
638
  # Generate expansions if not provided
607
639
  if expansions is None:
608
640
  expansions = []
@@ -639,6 +671,9 @@ class Nosible:
639
671
  "n_probes": n_probes,
640
672
  "n_contextify": n_contextify,
641
673
  "algorithm": algorithm,
674
+ "min_similarity": min_similarity,
675
+ "must_include": must_include,
676
+ "must_exclude": must_exclude,
642
677
  }
643
678
 
644
679
  resp = self._post(url="https://www.nosible.ai/search/v1/fast-search", payload=payload)
@@ -699,6 +734,9 @@ class Nosible:
699
734
  n_probes: int = 30,
700
735
  n_contextify: int = 128,
701
736
  algorithm: str = "hybrid-2",
737
+ min_similarity: float = None,
738
+ must_include: list[str] = None,
739
+ must_exclude: list[str] = None,
702
740
  autogenerate_expansions: bool = False,
703
741
  publish_start: str = None,
704
742
  publish_end: str = None,
@@ -728,46 +766,48 @@ class Nosible:
728
766
  Optional list of expanded query strings.
729
767
  sql_filter : list of str, optional
730
768
  Optional SQL WHERE clause filters.
731
- n_results : int, default=100
769
+ n_results : int
732
770
  Number of results per query (1,000–10,000).
733
- n_probes : int, default=30
771
+ n_probes : int
734
772
  Number of shards to probe.
735
- n_contextify : int, default=128
773
+ n_contextify : int
736
774
  Context window size per result.
737
- algorithm : str, default="hybrid-2"
775
+ algorithm : str
738
776
  Search algorithm identifier.
739
- autogenerate_expansions : bool, default=False
777
+ min_similarity : float
778
+ Results must have at least this similarity score.
779
+ must_include : list of str
780
+ Only results mentioning these strings will be included.
781
+ must_exclude : list of str
782
+ Any result mentioning these strings will be excluded.
783
+ autogenerate_expansions : bool
740
784
  Do you want to generate expansions automatically using a LLM?
741
785
  publish_start : str, optional
742
- Filter for earliest publish date.
786
+ Start date for when the document was published (ISO format).
743
787
  publish_end : str, optional
744
- Filter for latest publish date.
745
- include_netlocs : list of str, optional
746
- Domains to include.
747
- exclude_netlocs : list of str, optional
748
- Domains to exclude.
788
+ End date for when the document was published (ISO format).
749
789
  visited_start : str, optional
750
- Filter for earliest visit date.
790
+ Start date for when the document was visited by NOSIBLE (ISO format).
751
791
  visited_end : str, optional
752
- Filter for latest visit date.
792
+ End date for when the document was visited by NOSIBLE (ISO format).
753
793
  certain : bool, optional
754
- True if we are 100% sure of the date.
755
- include_languages : list of str, optional
756
- Languages to include (Max: 50).
757
- exclude_languages : list of str, optional
758
- Languages to exclude (Max: 50).
794
+ Only include documents where we are 100% sure of the date.
759
795
  include_netlocs : list of str, optional
760
- Only include results from these domains (Max: 50).
796
+ List of netlocs (domains) to include in the search. (Max: 50)
761
797
  exclude_netlocs : list of str, optional
762
- Exclude results from these domains (Max: 50).
798
+ List of netlocs (domains) to exclude in the search. (Max: 50)
799
+ include_languages : list of str, optional
800
+ Languages to include in the search. (Max: 50, ISO 639-1 language codes).
801
+ exclude_languages : list of str, optional
802
+ Language codes to exclude in the search (Max: 50, ISO 639-1 language codes).
763
803
  include_companies : list of str, optional
764
- Company IDs to require (Max: 50).
804
+ Google KG IDs of public companies to require (Max: 50).
765
805
  exclude_companies : list of str, optional
766
- Company IDs to forbid (Max: 50).
806
+ Google KG IDs of public companies to forbid (Max: 50).
767
807
  include_docs : list of str, optional
768
- URL hashes of documents to include (Max: 50).
808
+ URL hashes of docs to include (Max: 50).
769
809
  exclude_docs : list of str, optional
770
- URL hashes of documents to exclude (Max: 50).
810
+ URL hashes of docs to exclude (Max: 50).
771
811
  verbose : bool, optional
772
812
  Show verbose output, Bulk search will print more information.
773
813
 
@@ -786,6 +826,8 @@ class Nosible:
786
826
  If neither question nor search are specified.
787
827
  RuntimeError
788
828
  If the response fails in any way.
829
+ ValueError
830
+ If `min_similarity` is not in the range [0, 1].
789
831
 
790
832
  Notes
791
833
  -----
@@ -794,23 +836,21 @@ class Nosible:
794
836
 
795
837
  Examples
796
838
  --------
797
- >>> from nosible.classes.search import Search
798
- >>> from nosible import Nosible
799
- >>> with Nosible(include_netlocs=["bbc.com"]) as nos: # doctest: +SKIP
800
- ... results = nos.bulk_search(question="Nvidia insiders dump more than $1 billion in stock", n_results=2000) # doctest: +SKIP
839
+ >>> from nosible.classes.search import Search # doctest: +SKIP
840
+ >>> from nosible import Nosible # doctest: +SKIP
841
+ >>> with Nosible(exclude_netlocs=["bbc.com"]) as nos: # doctest: +SKIP
842
+ ... results = nos.bulk_search(question=_get_question(), n_results=2000) # doctest: +SKIP
801
843
  ... print(isinstance(results, ResultSet)) # doctest: +SKIP
802
844
  ... print(len(results)) # doctest: +SKIP
803
845
  True
804
846
  2000
805
-
806
- >>> s = Search(question="OpenAI", n_results=1000) # doctest: +SKIP
847
+ >>> s = Search(question=_get_question(), n_results=1000) # doctest: +SKIP
807
848
  >>> with Nosible() as nos: # doctest: +SKIP
808
849
  ... results = nos.bulk_search(search=s) # doctest: +SKIP
809
850
  ... print(isinstance(results, ResultSet)) # doctest: +SKIP
810
851
  ... print(len(results)) # doctest: +SKIP
811
852
  True
812
853
  1000
813
-
814
854
  >>> nos = Nosible(nosible_api_key="test|xyz") # doctest: +SKIP
815
855
  >>> nos.bulk_search() # doctest: +SKIP
816
856
  Traceback (most recent call last):
@@ -818,20 +858,18 @@ class Nosible:
818
858
  TypeError: Either question or search must be specified
819
859
 
820
860
  >>> nos = Nosible(nosible_api_key="test|xyz") # doctest: +SKIP
821
- >>> nos.bulk_search(question="foo", search=Search(question="foo")) # doctest: +SKIP
861
+ >>> nos.bulk_search(question=_get_question(), search=Search(question=_get_question())) # doctest: +SKIP
822
862
  Traceback (most recent call last):
823
863
  ...
824
864
  TypeError: Question and search cannot be both specified
825
-
826
865
  >>> nos = Nosible(nosible_api_key="test|xyz") # doctest: +SKIP
827
- >>> nos.bulk_search(question="foo", n_results=100) # doctest: +SKIP
866
+ >>> nos.bulk_search(question=_get_question(), n_results=100) # doctest: +SKIP
828
867
  Traceback (most recent call last):
829
868
  ...
830
- ValueError: Bulk search must have at least 100 results per query; use search() for smaller result sets.
831
-
869
+ ValueError: Bulk search must have at least 1000 results per query; use search() for smaller result sets.
832
870
  >>> nos = Nosible(nosible_api_key="test|xyz") # doctest: +SKIP
833
- >>> nos.bulk_search(question="foo", n_results=10001) # doctest: +SKIP
834
- Traceback (most recent call last):
871
+ >>> nos.bulk_search(question=_get_question(), n_results=10001) # doctest: +SKIP
872
+ Traceback (most recent call last):
835
873
  ...
836
874
  ValueError: Bulk search cannot have more than 10000 results per query.
837
875
  """
@@ -854,8 +892,17 @@ class Nosible:
854
892
  n_probes = search.n_probes if search.n_probes is not None else n_probes
855
893
  n_contextify = search.n_contextify if search.n_contextify is not None else n_contextify
856
894
  algorithm = search.algorithm if search.algorithm is not None else algorithm
857
- autogenerate_expansions = search.autogenerate_expansions if search.autogenerate_expansions is not None \
895
+ min_similarity = search.min_similarity if search.min_similarity is not None else min_similarity
896
+ min_similarity = min_similarity if min_similarity is not None else 0
897
+ must_include = search.must_include if search.must_include is not None else must_include
898
+ must_include = must_include if must_include is not None else []
899
+ must_exclude = search.must_exclude if search.must_exclude is not None else must_exclude
900
+ must_exclude = must_exclude if must_exclude is not None else []
901
+ autogenerate_expansions = (
902
+ search.autogenerate_expansions
903
+ if search.autogenerate_expansions is not None
858
904
  else autogenerate_expansions
905
+ )
859
906
  publish_start = search.publish_start if search.publish_start is not None else publish_start
860
907
  publish_end = search.publish_end if search.publish_end is not None else publish_end
861
908
  include_netlocs = search.include_netlocs if search.include_netlocs is not None else include_netlocs
@@ -876,6 +923,13 @@ class Nosible:
876
923
  if autogenerate_expansions is True:
877
924
  expansions = self._generate_expansions(question=question)
878
925
 
926
+ must_include = must_include if must_include is not None else []
927
+ must_exclude = must_exclude if must_exclude is not None else []
928
+ min_similarity = min_similarity if min_similarity is not None else 0
929
+
930
+ if not (0.0 <= min_similarity <= 1.0):
931
+ raise ValueError(f"Invalid min_similarity: {min_similarity}. Must be [0,1].")
932
+
879
933
  # Generate sql_filter if unset
880
934
  if sql_filter is None:
881
935
  sql_filter = self._format_sql(
@@ -920,6 +974,9 @@ class Nosible:
920
974
  "n_probes": n_probes,
921
975
  "n_contextify": n_contextify,
922
976
  "algorithm": algorithm,
977
+ "min_similarity": min_similarity,
978
+ "must_include": must_include,
979
+ "must_exclude": must_exclude,
923
980
  }
924
981
  resp = self._post(url="https://www.nosible.ai/search/v1/slow-search", payload=payload)
925
982
  try:
@@ -952,20 +1009,139 @@ class Nosible:
952
1009
  if verbose:
953
1010
  self.logger.setLevel(previous_level)
954
1011
 
1012
+ def answer(
1013
+ self,
1014
+ query: str,
1015
+ n_results: int = 100,
1016
+ min_similarity: float = 0.65,
1017
+ model: Union[str, None] = "google/gemini-2.0-flash-001",
1018
+ show_context: bool = True,
1019
+ ) -> str:
1020
+ """
1021
+ RAG-style question answering: retrieve top `n_results` via `.search()`
1022
+ then answer `query` using those documents as context.
1023
+
1024
+ Parameters
1025
+ ----------
1026
+ query : str
1027
+ The user’s natural-language question.
1028
+ n_results : int
1029
+ How many docs to fetch to build the context.
1030
+ min_similarity : float
1031
+ Results must have at least this similarity score.
1032
+ model : str, optional
1033
+ Which LLM to call to answer your question.
1034
+ show_context : bool, optional
1035
+ Do you want the context to be shown?
1036
+
1037
+ Returns
1038
+ -------
1039
+ str
1040
+ The LLM’s generated answer, grounded in the retrieved docs.
1041
+
1042
+ Raises
1043
+ ------
1044
+ ValueError
1045
+ If no API key is configured for the LLM client.
1046
+ RuntimeError
1047
+ If the LLM call fails or returns an invalid response.
1048
+
1049
+ Examples
1050
+ --------
1051
+ >>> from nosible import Nosible
1052
+ >>> with Nosible() as nos:
1053
+ ... ans = nos.answer(
1054
+ ... query="How is research governance and decision-making structured between Google and DeepMind?",
1055
+ ... n_results=100,
1056
+ ... show_context=True
1057
+ ... ) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
1058
+ <BLANKLINE>
1059
+ Doc 1
1060
+ Title: ...
1061
+ >>> print(ans) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
1062
+ Answer:
1063
+ ...
1064
+ """
1065
+
1066
+ if not self.llm_api_key:
1067
+ raise ValueError("An LLM API key is required for answer().")
1068
+
1069
+ # Retrieve top documents
1070
+ results = self.search(
1071
+ question=query,
1072
+ n_results=n_results,
1073
+ min_similarity=min_similarity,
1074
+ )
1075
+
1076
+ # Build RAG context
1077
+ context = ""
1078
+ pieces: list[str] = []
1079
+ for idx, result in enumerate(results):
1080
+ pieces.append(f"""
1081
+ Doc {idx + 1}
1082
+ Title: {result.title}
1083
+ Similarity Score: {result.similarity * 100:.2f}%
1084
+ URL: {result.url}
1085
+ Content: {result.content}
1086
+ """)
1087
+ context = "\n".join(pieces)
1088
+
1089
+ if show_context:
1090
+ print(textwrap.dedent(context))
1091
+
1092
+ # Craft prompt
1093
+ prompt = (f"""
1094
+ # TASK DESCRIPTION
1095
+
1096
+ You are a helpful assistant. Use the following context to answer the question.
1097
+ When you use information from a chunk, cite it by referencing its label in square brackets, e.g. [doc3].
1098
+
1099
+ ## Question
1100
+ {query}
1101
+
1102
+ ## Context
1103
+ {context}
1104
+ """
1105
+ )
1106
+
1107
+ # Call LLM
1108
+ client = OpenAI(base_url=self.openai_base_url, api_key=self.llm_api_key)
1109
+ try:
1110
+ response = client.chat.completions.create(
1111
+ model = model,
1112
+ messages = [{"role": "user", "content": prompt}],
1113
+ )
1114
+ except Exception as e:
1115
+ raise RuntimeError(f"LLM API error: {e}") from e
1116
+
1117
+ # Validate response shape
1118
+ choices = getattr(response, "choices", None)
1119
+ if not choices or not hasattr(choices[0], "message"):
1120
+ raise RuntimeError(f"Invalid LLM response format: {response!r}")
1121
+
1122
+ # Return the generated text
1123
+ return "Answer:\n" + response.choices[0].message.content.strip()
1124
+
955
1125
  @_rate_limited("visit")
956
- def visit(self, html: str = "", recrawl: bool = False, render: bool = False, url: str = None) -> WebPageData:
1126
+ def visit(
1127
+ self,
1128
+ html: str = "",
1129
+ recrawl: bool = False,
1130
+ render: bool = False,
1131
+ url: str = None
1132
+ ) -> WebPageData:
957
1133
  """
958
1134
  Visit a given URL and return a structured WebPageData object for the page.
959
1135
 
960
1136
  Parameters
961
1137
  ----------
962
- html : str, default=""
1138
+ html : str
963
1139
  Raw HTML to process instead of fetching.
964
- recrawl : bool, default=False
1140
+ recrawl : bool
965
1141
  If True, force a fresh crawl.
966
- render : bool, default=False
1142
+ render : bool
967
1143
  If True, allow JavaScript rendering before extraction.
968
- url : str, default=None
1144
+ url : str
969
1145
  The URL to fetch and parse.
970
1146
 
971
1147
  Returns
@@ -986,26 +1162,24 @@ class Nosible:
986
1162
 
987
1163
  Examples
988
1164
  --------
989
- >>> from nosible import Nosible # doctest: +SKIP
990
- >>> with Nosible() as nos: # doctest: +SKIP
991
- ... out = nos.visit(url="https://www.dailynewsegypt.com/2023/09/08/g20-and-its-summits/") # doctest: +SKIP
992
- ... print(isinstance(out, type(WebPageData))) # doctest: +SKIP
993
- ... print(hasattr(out, "languages")) # doctest: +SKIP
994
- ... print(hasattr(out, "page")) # doctest: +SKIP
1165
+ >>> from nosible import Nosible
1166
+ >>> with Nosible() as nos:
1167
+ ... out = nos.visit(url="https://www.dailynewsegypt.com/2023/09/08/g20-and-its-summits/")
1168
+ ... print(isinstance(out, WebPageData))
1169
+ ... print(hasattr(out, "languages"))
1170
+ ... print(hasattr(out, "page"))
995
1171
  True
996
1172
  True
997
1173
  True
998
- >>> with Nosible() as nos: # doctest: +SKIP
999
- ... out = nos.visit() # doctest: +SKIP
1000
- ... print(isinstance(out, type(WebPageData))) # doctest: +SKIP
1001
- ... print(hasattr(out, "languages")) # doctest: +SKIP
1002
- ... print(hasattr(out, "page")) # doctest: +SKIP
1174
+ >>> with Nosible() as nos:
1175
+ ... out = nos.visit()
1176
+ ... print(isinstance(out, type(WebPageData)))
1177
+ ... print(hasattr(out, "languages"))
1178
+ ... print(hasattr(out, "page")) # doctest: +ELLIPSIS
1003
1179
  Traceback (most recent call last):
1004
1180
  ...
1005
1181
  TypeError: URL must be provided
1006
1182
  """
1007
-
1008
- # self._enforce("visit")
1009
1183
  if url is None:
1010
1184
  raise TypeError("URL must be provided")
1011
1185
  response = self._post(
@@ -1018,7 +1192,7 @@ class Nosible:
1018
1192
  self.logger.error(f"Failed to parse JSON from response: {e}")
1019
1193
  raise ValueError("Invalid JSON response from server") from e
1020
1194
 
1021
- if data == {'message': 'Sorry, the URL could not be fetched.'}:
1195
+ if data == {"message": "Sorry, the URL could not be fetched."}:
1022
1196
  raise ValueError("The URL could not be found.")
1023
1197
 
1024
1198
  if "response" not in data:
@@ -1033,12 +1207,84 @@ class Nosible:
1033
1207
  metadata=response_data.get("metadata"),
1034
1208
  page=response_data.get("page"),
1035
1209
  request=response_data.get("request"),
1036
- snippets=response_data.get("snippets"),
1210
+ snippets=SnippetSet.from_dict(response_data.get("snippets", {})),
1037
1211
  statistics=response_data.get("statistics"),
1038
1212
  structured=response_data.get("structured"),
1039
1213
  url_tree=response_data.get("url_tree"),
1040
1214
  )
1041
1215
 
1216
@_rate_limited("fast")
def trend(
    self,
    query: str,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    sql_filter: Optional[str] = None,
) -> dict:
    """
    Extract a trend showing the volume of news surrounding your query.

    Parameters
    ----------
    query : str
        The search term we would like to see a trend for.
    start_date : str, optional
        ISO-format start date (YYYY-MM-DD) of the trend window, inclusive.
    end_date : str, optional
        ISO-format end date (YYYY-MM-DD) of the trend window, inclusive.
    sql_filter : str, optional
        An optional SQL filter to narrow down the trend query. When omitted,
        ``"SELECT loc, published FROM engine"`` is sent.

    Returns
    -------
    dict
        The JSON-decoded trend data found under the ``"response"`` key of the
        server reply, restricted to the requested window when one is given.

    Raises
    ------
    ValueError
        If `start_date` or `end_date` is not a valid ISO date.
    requests.HTTPError
        If the server answers with an error status.

    Examples
    --------
    >>> from nosible import Nosible
    >>> with Nosible() as nos:
    ...     trends_data = nos.trend("Christmas Shopping", start_date="2005-01-01", end_date="2020-12-31")
    ...     print(trends_data)  # doctest: +ELLIPSIS
    {'2005-01-31': ...'2020-12-31': ...}
    """
    # Validate each bound, then reduce it to canonical YYYY-MM-DD. The filter
    # below compares strings, which is only chronological when both sides share
    # that exact layout; the validator also accepts timestamps and (on newer
    # Pythons) compact forms such as "20200101".
    if start_date is not None:
        self._validate_date_format(start_date, "start_date")
        start_date = datetime.fromisoformat(start_date).date().isoformat()
    if end_date is not None:
        self._validate_date_format(end_date, "end_date")
        end_date = datetime.fromisoformat(end_date).date().isoformat()

    request_payload: dict[str, str] = {
        "query": query,
        "sql_filter": sql_filter if sql_filter is not None else "SELECT loc, published FROM engine",
    }

    # Send the POST to the /trend endpoint
    response = self._post(
        url="https://www.nosible.ai/search/v1/trend",
        payload=request_payload,
    )
    # Surfaces any remaining HTTP error status as requests.HTTPError.
    response.raise_for_status()
    trend_data = response.json().get("response", {})

    # No window requested: return everything
    if start_date is None and end_date is None:
        return trend_data

    # Keep only the entries inside the inclusive window.
    # NOTE(review): assumes the server keys are 'YYYY-MM-DD' strings, as shown
    # in the example above - confirm against the API.
    return {
        date_str: value
        for date_str, value in trend_data.items()
        if (start_date is None or date_str >= start_date)
        and (end_date is None or date_str <= end_date)
    }
1287
+
1042
1288
  def version(self) -> str:
1043
1289
  """
1044
1290
  Retrieve the current version information for the Nosible API.
@@ -1097,10 +1343,6 @@ class Nosible:
1097
1343
 
1098
1344
  Raises
1099
1345
  ------
1100
- ValueError
1101
- If the API returns an unexpected message.
1102
- requests.HTTPError
1103
- If the HTTP request fails.
1104
1346
 
1105
1347
  Examples
1106
1348
  --------
@@ -1121,10 +1363,13 @@ class Nosible:
1121
1363
  return False
1122
1364
  if msg == "The URL could not be retrieved.":
1123
1365
  return False
1366
+ # If we reach here, the response is unexpected
1367
+ return False
1124
1368
  except requests.HTTPError:
1125
1369
  return False
1126
1370
  except:
1127
1371
  return False
1372
+
1128
1373
  def preflight(self, url: str = None) -> str:
1129
1374
  """
1130
1375
  Run a preflight check for crawling/preprocessing on a URL.
@@ -1180,40 +1425,47 @@ class Nosible:
1180
1425
 
1181
1426
  Examples
1182
1427
  --------
1183
- >>> nos = Nosible(nosible_api_key="test|xyz") # doctest: +SKIP
1184
- >>> print(nos.get_rate_limits()) # doctest: +SKIP
1185
- Free (Your current plan)
1186
- | Endpoint | Per Month | Per Day | Per Minute |
1187
- | ----------- | --------- | ------- | ---------- |
1188
- | Fast Search | 3 000 | 100 | 10 |
1189
- | URL Visits | 300 | 10 | 1 |
1190
- | Slow Search | 300 | 10 | 1 |
1191
-
1192
- Basic
1193
- | Endpoint | Per Month | Per Day | Per Minute |
1428
+ >>> nos = Nosible(nosible_api_key="test|xyz")
1429
+ >>> print(nos.get_rate_limits()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
1430
+ Below are the rate limits for all NOSIBLE plans.
1431
+ To upgrade your package, visit https://www.nosible.ai/products.
1432
+ <BLANKLINE>
1433
+ Unless otherwise indicated, bulk searches are limited to one-at-a-time per API key.
1434
+ <BLANKLINE>
1435
+ Free: (Your current plan)
1436
+ | Endpoint | Per Month | Per Minute | Effective CPM |
1437
+ | ----------- | --------- | ---------- | ------------- |
1438
+ | Search | 3000 | 60 | $4.00 |
1439
+ | URL Visits | 300 | 60 | $4.00 |
1440
+ | Bulk Search | 300 | 60 | $4.00 |
1441
+ <BLANKLINE>
1442
+ Basic ($49p/m):
1443
+ | Endpoint | Per Month | Per Minute | Effective CPM |
1194
1444
  ...
1195
1445
  """
1196
1446
  # Human-friendly plan names
1197
1447
  display = {
1198
1448
  "test": "Free",
1199
- "basic": "Basic",
1200
- "pro": "Pro",
1201
- "pro+": "Pro+",
1202
- "bus": "Business",
1203
- "bus+": "Business+",
1204
- "ent": "Enterprise",
1449
+ "basic": "Basic ($49p/m)",
1450
+ "pro": "Pro ($199p/m)",
1451
+ "pro+": "Pro+ ($799p/m)",
1452
+ "bus": "Business ($3999p/m)",
1453
+ "bus+": "Business+ ($7499p/m)",
1454
+ "ent": "Enterprise ($14999p/m)",
1205
1455
  }
1206
1456
 
1207
1457
  # Human-friendly endpoint names
1208
- endpoint_name = {"fast": "Fast Search", "visit": "URL Visits", "slow": "Bulk Search"}
1458
+ endpoint_name = {"fast": "Search", "visit": "URL Visits", "slow": "Bulk Search"}
1209
1459
 
1210
1460
  out = [
1211
1461
  "Below are the rate limits for all NOSIBLE plans.",
1212
1462
  "To upgrade your package, visit https://www.nosible.ai/products.\n",
1463
+ "Unless otherwise indicated, bulk searches are limited to one-at-a-time per API key.\n"
1213
1464
  ]
1214
1465
 
1215
1466
  user_plan = self._get_user_plan()
1216
1467
  current_plan = ""
1468
+ cpm_counter = 4.0
1217
1469
 
1218
1470
  # Preserve the order you care about:
1219
1471
  for plan in ["test", "basic", "pro", "pro+", "bus", "bus+", "ent"]:
@@ -1222,17 +1474,19 @@ class Nosible:
1222
1474
  current_plan = " (Your current plan)"
1223
1475
 
1224
1476
  out.append(f"{name}:{current_plan}")
1225
- out.append("| Endpoint | Per Month | Per Day | Per Minute |")
1226
- out.append("| ----------- | --------- | ------- | ---------- |")
1477
+ out.append("| Endpoint | Per Month | Per Minute | Effective CPM |")
1478
+ out.append("| ----------- | --------- | ---------- | ------------- |")
1227
1479
 
1228
1480
  for ep in ["fast", "visit", "slow"]:
1229
1481
  buckets = PLAN_RATE_LIMITS[plan][ep]
1230
1482
  # Find minute & month
1231
1483
  minute = next(limit for limit, i in buckets if i == 60)
1232
- day = next(limit for limit, i in buckets if i == 24 * 3600)
1233
- month = day * 30
1234
- out.append(f"| {endpoint_name[ep]:<11} | {month:>9} | {day:>7} | {minute:>10} |")
1484
+ month = next(limit for limit, i in buckets if i == 24 * 3600 * 30)
1485
+ cpm = f"${cpm_counter:.2f}"
1486
+
1487
+ out.append(f"| {endpoint_name[ep]:<11} | {month:>9} | {minute:>10} | {cpm:>13} |")
1235
1488
 
1489
+ cpm_counter = cpm_counter - 0.5
1236
1490
  out.append("") # Blank line
1237
1491
  current_plan = ""
1238
1492
 
@@ -1243,10 +1497,6 @@ class Nosible:
1243
1497
  Close the Nosible client, shutting down the HTTP session
1244
1498
  and thread pool to release network and threading resources.
1245
1499
 
1246
- Returns
1247
- -------
1248
- None
1249
-
1250
1500
  Examples
1251
1501
  --------
1252
1502
  >>> from nosible import Nosible
@@ -1292,6 +1542,8 @@ class Nosible:
1292
1542
  If the user API key is invalid.
1293
1543
  ValueError
1294
1544
  If the user hits their rate limit.
1545
+ ValueError
1546
+ If the user is making too many concurrent searches.
1295
1547
  ValueError
1296
1548
  If an unexpected error occurs.
1297
1549
  ValueError
@@ -1319,12 +1571,17 @@ class Nosible:
1319
1571
  content_type = response.headers.get("Content-Type", "")
1320
1572
  if content_type.startswith("application/json"):
1321
1573
  body = response.json()
1574
+ if isinstance(body, list):
1575
+ body = body[0] # NOSIBLE returns a list of errors
1576
+ print(body)
1322
1577
  if body.get("type") == "string_too_short":
1323
1578
  raise ValueError("Your API key is not valid: Too Short.")
1324
1579
  else:
1325
1580
  raise ValueError("You made a bad request.")
1326
1581
  if response.status_code == 429:
1327
1582
  raise ValueError("You have hit your rate limit.")
1583
+ if response.status_code == 409:
1584
+ raise ValueError("Too many concurrent searches.")
1328
1585
  if response.status_code == 500:
1329
1586
  raise ValueError("An unexpected error occurred.")
1330
1587
  if response.status_code == 502:
@@ -1354,16 +1611,16 @@ class Nosible:
1354
1611
 
1355
1612
  Examples
1356
1613
  --------
1357
- >>> nos = Nosible(nosible_api_key="test+|xyz") # doctest: +SKIP
1614
+ >>> nos = Nosible(nosible_api_key="test+|xyz") # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
1358
1615
  Traceback (most recent call last):
1359
1616
  ...
1360
- ValueError: test+ is not a valid plan prefix, your API key is invalid.
1617
+ ValueError: Your API key is not valid: test+ is not a valid plan prefix.
1361
1618
  """
1362
1619
  # Split off anything after the first '|'
1363
1620
  prefix = (self.nosible_api_key or "").split("|", 1)[0]
1364
1621
 
1365
- # Map prefixes -> human-friendly plan names
1366
- plans = {"test", "basic", "pro", "pro+", "bus", "bus+", "ent"}
1622
+ # Map prefixes -> plan names
1623
+ plans = {"test", "basic", "pro", "pro+", "bus", "bus+", "ent", "chat"}
1367
1624
 
1368
1625
  if prefix not in plans:
1369
1626
  raise ValueError(f"Your API key is not valid: {prefix} is not a valid plan prefix.")
@@ -1393,11 +1650,10 @@ class Nosible:
1393
1650
 
1394
1651
  Examples
1395
1652
  --------
1396
-
1397
- >>> from nosible import Nosible # doctest: +SKIP
1398
- >>> nos = Nosible(llm_api_key=None) # doctest: +SKIP
1399
- >>> nos.llm_api_key = None # doctest: +SKIP
1400
- >>> nos._generate_expansions("anything") # doctest: +SKIP
1653
+ >>> from nosible import Nosible
1654
+ >>> nos = Nosible(llm_api_key=None)
1655
+ >>> nos.llm_api_key = None
1656
+ >>> nos._generate_expansions("anything") # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
1401
1657
  Traceback (most recent call last):
1402
1658
  ...
1403
1659
  ValueError: LLM API key is required for generating expansions.
@@ -1486,6 +1742,49 @@ class Nosible:
1486
1742
  self.logger.debug(f"Successful expansions: {expansions}")
1487
1743
  return expansions
1488
1744
 
1745
+ @staticmethod
1746
+ def _validate_date_format(string: str, name: str):
1747
+ """
1748
+ Check that a date string is valid ISO format (YYYY-MM-DD or full ISO timestamp).
1749
+
1750
+ Parameters
1751
+ ----------
1752
+ string : str
1753
+ The date string to validate.
1754
+ name : str
1755
+ The name of the parameter being validated, used in the error message.
1756
+
1757
+ Raises
1758
+ ------
1759
+ ValueError
1760
+ If `string` is not a valid ISO 8601 date. Error message will include
1761
+ the `name` and the offending string.
1762
+ Examples
1763
+ --------
1764
+ >>> # valid date-only format
1765
+ >>> Nosible._validate_date_format("2023-12-31", "publish_start")
1766
+ >>> # valid full timestamp
1767
+ >>> Nosible._validate_date_format("2023-12-31T15:30:00", "visited_end")
1768
+ >>> # invalid month
1769
+ >>> Nosible._validate_date_format("2023-13-01", "publish_end")
1770
+ Traceback (most recent call last):
1771
+ ...
1772
+ ValueError: Invalid date for 'publish_end': '2023-13-01'. Expected ISO format 'YYYY-MM-DD'.
1773
+ >>> # wrong separator
1774
+ >>> Nosible._validate_date_format("2023/12/31", "visited_start")
1775
+ Traceback (most recent call last):
1776
+ ...
1777
+ ValueError: Invalid date for 'visited_start': '2023/12/31'. Expected ISO format 'YYYY-MM-DD'.
1778
+ """
1779
+ try:
1780
+ # datetime.fromisoformat accepts both YYYY-MM-DD and full timestamps
1781
+ parsed = datetime.fromisoformat(string)
1782
+ except Exception:
1783
+ raise ValueError(
1784
+ f"Invalid date for '{name}': {string!r}. "
1785
+ "Expected ISO format 'YYYY-MM-DD'."
1786
+ )
1787
+
1489
1788
  def _format_sql(
1490
1789
  self,
1491
1790
  publish_start: str = None,
@@ -1508,35 +1807,31 @@ class Nosible:
1508
1807
  Parameters
1509
1808
  ----------
1510
1809
  publish_start : str, optional
1511
- Earliest published date filter.
1810
+ Start date for when the document was published (ISO format).
1512
1811
  publish_end : str, optional
1513
- Latest published date filter.
1514
- include_netlocs : list of str, optional
1515
- Domains to whitelist.
1516
- exclude_netlocs : list of str, optional
1517
- Domains to blacklist.
1812
+ End date for when the document was published (ISO format).
1518
1813
  visited_start : str, optional
1519
- Earliest visit date filter.
1814
+ Start date for when the document was visited by NOSIBLE (ISO format).
1520
1815
  visited_end : str, optional
1521
- Latest visit date filter.
1816
+ End date for when the document was visited by NOSIBLE (ISO format).
1522
1817
  certain : bool, optional
1523
- True if we are 100% sure of the date.
1524
- include_languages : list of str, optional
1525
- Languages to include (Max: 50).
1526
- exclude_languages : list of str, optional
1527
- Languages to exclude (Max: 50).
1818
+ Only include documents where we are 100% sure of the date.
1528
1819
  include_netlocs : list of str, optional
1529
- Only include results from these domains (Max: 50).
1820
+ List of netlocs (domains) to include in the search. (Max: 50)
1530
1821
  exclude_netlocs : list of str, optional
1531
- Exclude results from these domains (Max: 50).
1822
+ List of netlocs (domains) to exclude in the search. (Max: 50)
1823
+ include_languages : list of str, optional
1824
+ Languages to include in the search. (Max: 50, ISO 639-1 language codes).
1825
+ exclude_languages : list of str, optional
1826
+ Language codes to exclude in the search (Max: 50, ISO 639-1 language codes).
1532
1827
  include_companies : list of str, optional
1533
- Public Company Google KG IDs to require (Max: 50).
1828
+ Google KG IDs of public companies to require (Max: 50).
1534
1829
  exclude_companies : list of str, optional
1535
- Public Company Google KG IDs to forbid (Max: 50).
1830
+ Google KG IDs of public companies to forbid (Max: 50).
1536
1831
  include_docs : list of str, optional
1537
- URL hashes of documents to include (Max: 50).
1832
+ URL hashes of docs to include (Max: 50).
1538
1833
  exclude_docs : list of str, optional
1539
- URL hashes of documents to exclude (Max: 50).
1834
+ URL hashes of docs to exclude (Max: 50).
1540
1835
 
1541
1836
  Returns
1542
1837
  -------
@@ -1545,23 +1840,31 @@ class Nosible:
1545
1840
 
1546
1841
  Raises
1547
1842
  ------
1548
-
1549
1843
  ValueError
1550
1844
  If more than 50 items in a filter are given.
1551
1845
  """
1846
+ for name, value in [
1847
+ ("publish_start", publish_start),
1848
+ ("publish_end", publish_end),
1849
+ ("visited_start", visited_start),
1850
+ ("visited_end", visited_end),
1851
+ ]:
1852
+ if value is not None:
1853
+ self._validate_date_format(string=value, name=name)
1854
+
1552
1855
  # Validate list lengths
1553
- for name, lst in [
1554
- ('include_netlocs', include_netlocs),
1555
- ('exclude_netlocs', exclude_netlocs),
1556
- ('include_languages', include_languages),
1557
- ('exclude_languages', exclude_languages),
1558
- ('include_companies', include_companies),
1559
- ('exclude_companies', exclude_companies),
1560
- ('include_docs', include_docs),
1561
- ('exclude_docs', exclude_docs),
1856
+ for name, value in [
1857
+ ("include_netlocs", include_netlocs),
1858
+ ("exclude_netlocs", exclude_netlocs),
1859
+ ("include_languages", include_languages),
1860
+ ("exclude_languages", exclude_languages),
1861
+ ("include_companies", include_companies),
1862
+ ("exclude_companies", exclude_companies),
1863
+ ("include_docs", include_docs),
1864
+ ("exclude_docs", exclude_docs),
1562
1865
  ]:
1563
- if lst is not None and len(lst) > 50:
1564
- raise ValueError(f"Too many items for '{name}' filter ({len(lst)}); maximum allowed is 50.")
1866
+ if value is not None and len(value) > 50:
1867
+ raise ValueError(f"Too many items for '{name}' filter ({len(value)}); maximum allowed is 50.")
1565
1868
 
1566
1869
  sql = ["SELECT loc FROM engine"]
1567
1870
  clauses: list[str] = []
@@ -1595,10 +1898,10 @@ class Nosible:
1595
1898
  variants = set()
1596
1899
  for n in include_netlocs:
1597
1900
  variants.add(n)
1598
- if n.startswith('www.'):
1901
+ if n.startswith("www."):
1599
1902
  variants.add(n[4:])
1600
1903
  else:
1601
- variants.add('www.' + n)
1904
+ variants.add("www." + n)
1602
1905
  in_list = ", ".join(f"'{v}'" for v in sorted(variants))
1603
1906
  clauses.append(f"netloc IN ({in_list})")
1604
1907
 
@@ -1607,10 +1910,10 @@ class Nosible:
1607
1910
  variants = set()
1608
1911
  for n in exclude_netlocs:
1609
1912
  variants.add(n)
1610
- if n.startswith('www.'):
1913
+ if n.startswith("www."):
1611
1914
  variants.add(n[4:])
1612
1915
  else:
1613
- variants.add('www.' + n)
1916
+ variants.add("www." + n)
1614
1917
  ex_list = ", ".join(f"'{v}'" for v in sorted(variants))
1615
1918
  clauses.append(f"netloc NOT IN ({ex_list})")
1616
1919
 
@@ -1703,7 +2006,7 @@ class Nosible:
1703
2006
  except Exception:
1704
2007
  return False
1705
2008
 
1706
- def __enter__(self):
2009
+ def __enter__(self) -> "Nosible":
1707
2010
  """
1708
2011
  Enter the context manager, returning this client instance.
1709
2012
 
@@ -1714,32 +2017,46 @@ class Nosible:
1714
2017
  """
1715
2018
  return self
1716
2019
 
1717
- def __exit__(self, exc_type: type, exc: Exception, tb: traceback):
2020
+ def __exit__(
2021
+ self,
2022
+ _exc_type: typing.Optional[type[BaseException]],
2023
+ _exc_val: typing.Optional[BaseException],
2024
+ _exc_tb: typing.Optional[types.TracebackType],
2025
+ ) -> typing.Optional[bool]:
1718
2026
  """
1719
- Exit the context manager, ensuring cleanup of resources.
2027
+ Always clean up (self.close()), but let exceptions propagate.
2028
+ Return True only if you really want to suppress an exception.
1720
2029
 
1721
2030
  Parameters
1722
2031
  ----------
1723
- exc_type : type or None
1724
- Exception type if raised.
1725
- exc : Exception or None
1726
- Exception instance if raised.
1727
- tb : traceback or None
1728
- Traceback if exception was raised.
2032
+ _exc_type : Optional[type[BaseException]]
2033
+ The type of the exception raised, if any.
2034
+ _exc_val : Optional[BaseException]
2035
+ The exception instance, if any.
2036
+ _exc_tb : Optional[types.TracebackType]
2037
+ The traceback object, if any.
1729
2038
 
1730
2039
  Returns
1731
2040
  -------
1732
- None
2041
+ Optional[bool]
2042
+ False to propagate exceptions, True to suppress them.
1733
2043
  """
1734
- self.close()
2044
+ try:
2045
+ self.close()
2046
+ except Exception as cleanup_err:
2047
+ # optional: log or re-raise, but don’t hide the original exc
2048
+ print(f"Cleanup failed: {cleanup_err!r}")
2049
+ # Return False (or None) => exceptions inside the with‐block are re-raised.
2050
+ return False
1735
2051
 
1736
2052
  def __del__(self):
1737
2053
  """
1738
2054
  Destructor to ensure resources are cleaned up if not explicitly closed.
1739
2055
 
1740
- Returns
1741
- -------
1742
- None
1743
2056
  """
1744
- # Ensure it's called
1745
- self.close()
2057
+ # Only close if interpreter is fully alive
2058
+ if not getattr(sys, "is_finalizing", lambda: False)():
2059
+ try:
2060
+ self.close()
2061
+ except Exception:
2062
+ pass