nosible 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nosible/nosible_client.py CHANGED
@@ -3,9 +3,10 @@ import json
3
3
  import logging
4
4
  import os
5
5
  import time
6
- import traceback
6
+ import types
7
+ import typing
7
8
  from collections.abc import Iterator
8
- from concurrent.futures import ThreadPoolExecutor, as_completed
9
+ from concurrent.futures import ThreadPoolExecutor
9
10
  from typing import Union
10
11
 
11
12
  import polars as pl
@@ -25,8 +26,10 @@ from tenacity import (
25
26
  from nosible.classes.result_set import ResultSet
26
27
  from nosible.classes.search import Search
27
28
  from nosible.classes.search_set import SearchSet
29
+ from nosible.classes.snippet_set import SnippetSet
28
30
  from nosible.classes.web_page import WebPageData
29
31
  from nosible.utils.json_tools import json_loads
32
+ from nosible.utils.question_builder import _get_question
30
33
  from nosible.utils.rate_limiter import PLAN_RATE_LIMITS, RateLimiter, _rate_limited
31
34
 
32
35
  # Set up a module‐level logger.
@@ -47,37 +50,33 @@ class Nosible:
47
50
  llm_api_key : str, optional
48
51
  API key for LLM-based query expansions.
49
52
  openai_base_url : str
50
- Base URL for the OpenAI-compatible LLM API.
51
- sentiment_model : str
52
- Model to use for sentiment analysis and expansions.
53
+ Base URL for the OpenAI-compatible LLM API. (default is OpenRouter's API endpoint)
54
+ sentiment_model : str, optional
55
+ Model to use for sentiment analysis (default is "openai/gpt-4o").
53
56
  timeout : int
54
57
  Request timeout for HTTP calls.
55
- retries : int, default=5
58
+ retries : int
56
59
  Number of retry attempts for transient HTTP errors.
57
- concurrency : int, default=10
60
+ concurrency : int
58
61
  Maximum concurrent search requests.
59
62
  publish_start : str, optional
60
- Earliest publish date filter (ISO formatted date).
63
+ Start date for when the document was published (ISO format).
61
64
  publish_end : str, optional
62
- Latest publish date filter (ISO formatted date).
63
- include_netlocs : list of str, optional
64
- Domains to include.
65
- exclude_netlocs : list of str, optional
66
- Domains to exclude.
65
+ End date for when the document was published (ISO format).
67
66
  visited_start : str, optional
68
- Earliest visit date filter (ISO formatted date).
67
+ Start date for when the document was visited by NOSIBLE (ISO format).
69
68
  visited_end : str, optional
70
- Latest visit date filter (ISO formatted date).
69
+ End date for when the document was visited by NOSIBLE (ISO format).
71
70
  certain : bool, optional
72
- True if we are 100% sure of the date.
73
- include_languages : list of str, optional
74
- Language codes to include (Max: 50).
75
- exclude_languages : list of str, optional
76
- Language codes to exclude (Max: 50).
71
+ Only include documents where we are 100% sure of the date.
77
72
  include_netlocs : list of str, optional
78
- Only include results from these domains (Max: 50).
73
+ List of netlocs (domains) to include in the search. (Max: 50)
79
74
  exclude_netlocs : list of str, optional
80
- Exclude results from these domains (Max: 50).
75
+ List of netlocs (domains) to exclude in the search. (Max: 50)
76
+ include_languages : list of str, optional
77
+ Languages to include in the search. (Max: 50, ISO 639-1 language codes).
78
+ exclude_languages : list of str, optional
79
+ Language codes to exclude in the search (Max: 50, ISO 639-1 language codes).
81
80
  include_companies : list of str, optional
82
81
  Google KG IDs of public companies to require (Max: 50).
83
82
  exclude_companies : list of str, optional
@@ -86,10 +85,6 @@ class Nosible:
86
85
  URL hashes of docs to include (Max: 50).
87
86
  exclude_docs : list of str, optional
88
87
  URL hashes of docs to exclude (Max: 50).
89
- openai_base_url : str, optional
90
- Base URL for the OpenAI API (default is OpenRouter).
91
- sentiment_model : str, optional
92
- Model to use for sentiment analysis (default is "openai/gpt-4o").
93
88
 
94
89
  Notes
95
90
  -----
@@ -243,38 +238,34 @@ class Nosible:
243
238
  List of LLM‐generated expansions.
244
239
  sql_filter : list of str, optional
245
240
  SQL‐style filter clauses.
246
- n_results : int, default=100
241
+ n_results : int
247
242
  Max number of results (max 100).
248
- n_probes : int, default=30
243
+ n_probes : int
249
244
  Number of index shards to probe.
250
- n_contextify : int, default=128
245
+ n_contextify : int
251
246
  Context window size per result.
252
- algorithm : str, default="hybrid-2"
247
+ algorithm : str
253
248
  Search algorithm type.
254
- autogenerate_expansions : bool, default=False
249
+ autogenerate_expansions : bool
255
250
  Whether to generate expansions automatically using an LLM.
256
251
  publish_start : str, optional
257
- Earliest publish date filter (ISO formatted date).
252
+ Start date for when the document was published (ISO format).
258
253
  publish_end : str, optional
259
- Latest publish date filter (ISO formatted date).
260
- include_netlocs : list of str, optional
261
- Domains to include.
262
- exclude_netlocs : list of str, optional
263
- Domains to exclude.
254
+ End date for when the document was published (ISO format).
264
255
  visited_start : str, optional
265
- Earliest visit date filter (ISO formatted date).
256
+ Start date for when the document was visited by NOSIBLE (ISO format).
266
257
  visited_end : str, optional
267
- Latest visit date filter (ISO formatted date).
258
+ End date for when the document was visited by NOSIBLE (ISO format).
268
259
  certain : bool, optional
269
- True if we are 100% sure of the date.
270
- include_languages : list of str, optional
271
- Language codes to include (Max: 50).
272
- exclude_languages : list of str, optional
273
- Language codes to exclude (Max: 50).
260
+ Only include documents where we are 100% sure of the date.
274
261
  include_netlocs : list of str, optional
275
- Only include results from these domains (Max: 50).
262
+ List of netlocs (domains) to include in the search. (Max: 50)
276
263
  exclude_netlocs : list of str, optional
277
- Exclude results from these domains (Max: 50).
264
+ List of netlocs (domains) to exclude in the search. (Max: 50)
265
+ include_languages : list of str, optional
266
+ Languages to include in the search. (Max: 50, ISO 639-1 language codes).
267
+ exclude_languages : list of str, optional
268
+ Language codes to exclude in the search (Max: 50, ISO 639-1 language codes).
278
269
  include_companies : list of str, optional
279
270
  Google KG IDs of public companies to require (Max: 50).
280
271
  exclude_companies : list of str, optional
@@ -297,6 +288,8 @@ class Nosible:
297
288
  If neither question nor search are specified
298
289
  RuntimeError
299
290
  If the response fails in any way.
291
+ ValueError
292
+ If `n_results` is greater than 100.
300
293
 
301
294
  Notes
302
295
  -----
@@ -407,48 +400,44 @@ class Nosible:
407
400
  List of expansion terms to use for each search.
408
401
  sql_filter : list of str, optional
409
402
  SQL-like filters to apply to the search.
410
- n_results : int, default=100
403
+ n_results : int
411
404
  Number of results to return per search.
412
- n_probes : int, default=30
405
+ n_probes : int
413
406
  Number of probes to use for the search algorithm.
414
- n_contextify : int, default=128
407
+ n_contextify : int
415
408
  Context window size for the search.
416
- algorithm : str, default="hybrid-2"
409
+ algorithm : str
417
410
  Search algorithm to use.
418
- autogenerate_expansions : bool, default=False
411
+ autogenerate_expansions : bool
419
412
  Whether to generate expansions automatically using an LLM.
420
413
  publish_start : str, optional
421
- Filter results published after this date (ISO formatted date).
414
+ Start date for when the document was published (ISO format).
422
415
  publish_end : str, optional
423
- Filter results published before this date (ISO formatted date).
424
- include_netlocs : list of str, optional
425
- Only include results from these domains.
426
- exclude_netlocs : list of str, optional
427
- Exclude results from these domains.
416
+ End date for when the document was published (ISO format).
428
417
  visited_start : str, optional
429
- Only include results visited after this date (ISO formatted date).
418
+ Start date for when the document was visited by NOSIBLE (ISO format).
430
419
  visited_end : str, optional
431
- Only include results visited before this date (ISO formatted date).
420
+ End date for when the document was visited by NOSIBLE (ISO format).
432
421
  certain : bool, optional
433
- Only include results with high certainty.
422
+ Only include documents where we are 100% sure of the date.
423
+ include_netlocs : list of str, optional
424
+ List of netlocs (domains) to include in the search. (Max: 50)
425
+ exclude_netlocs : list of str, optional
426
+ List of netlocs (domains) to exclude in the search. (Max: 50)
434
427
  include_languages : list of str, optional
435
- Only include results in these languages (Max: 50).
428
+ Languages to include in the search. (Max: 50, ISO 639-1 language codes).
436
429
  exclude_languages : list of str, optional
437
- Exclude results in these languages (Max: 50).
430
+ Language codes to exclude in the search (Max: 50, ISO 639-1 language codes).
438
431
  include_companies : list of str, optional
439
- Only include results from these companies (Max: 50).
432
+ Google KG IDs of public companies to require (Max: 50).
440
433
  exclude_companies : list of str, optional
441
- Exclude results from these companies (Max: 50).
442
- include_netlocs : list of str, optional
443
- Only include results from these domains (Max: 50).
444
- exclude_netlocs : list of str, optional
445
- Exclude results from these domains (Max: 50).
434
+ Google KG IDs of public companies to forbid (Max: 50).
446
435
  include_docs : list of str, optional
447
- URL hashes of documents to include (Max: 50).
436
+ URL hashes of docs to include (Max: 50).
448
437
  exclude_docs : list of str, optional
449
- URL hashes of documents to exclude (Max: 50).
438
+ URL hashes of docs to exclude (Max: 50).
450
439
 
451
- Yields
440
+ Returns
452
441
  -------
453
442
  ResultSet or None
454
443
  Each completed search’s results, or None on failure.
@@ -461,8 +450,6 @@ class Nosible:
461
450
  If both queries and searches are specified.
462
451
  TypeError
463
452
  If neither queries nor searches are specified.
464
- RuntimeError
465
- If the response fails in any way.
466
453
 
467
454
  Notes
468
455
  -----
@@ -473,7 +460,10 @@ class Nosible:
473
460
  --------
474
461
  >>> from nosible import Nosible
475
462
  >>> queries = SearchSet(
476
- ... [Search(question="Hedge funds seek to expand into private credit", n_results=5), Search(question="How have the Trump tariffs impacted the US economy?", n_results=5)]
463
+ ... [
464
+ ... Search(question="Hedge funds seek to expand into private credit", n_results=5),
465
+ ... Search(question="How have the Trump tariffs impacted the US economy?", n_results=5),
466
+ ... ]
477
467
  ... )
478
468
  >>> with Nosible() as nos:
479
469
  ... results_list = list(nos.searches(searches=queries))
@@ -484,10 +474,14 @@ class Nosible:
484
474
  True True
485
475
  True True
486
476
  >>> with Nosible() as nos:
487
- ... results_list_str = list(nos.searches(questions=[
488
- ... "What are the terms of the partnership between Microsoft and OpenAI?",
489
- ... "What are the terms of the partnership between Volkswagen and Uber?"
490
- ... ]))
477
+ ... results_list_str = list(
478
+ ... nos.searches(
479
+ ... questions=[
480
+ ... "What are the terms of the partnership between Microsoft and OpenAI?",
481
+ ... "What are the terms of the partnership between Volkswagen and Uber?",
482
+ ... ]
483
+ ... )
484
+ ... )
491
485
  >>> nos = Nosible(nosible_api_key="test|xyz") # doctest: +ELLIPSIS
492
486
  >>> nos.searches() # doctest: +ELLIPSIS
493
487
  Traceback (most recent call last):
@@ -539,6 +533,7 @@ class Nosible:
539
533
  except Exception as e:
540
534
  self.logger.warning(f"Search failed: {e!r}")
541
535
  yield None
536
+
542
537
  return _run_generator()
543
538
 
544
539
  @_rate_limited("fast")
@@ -573,7 +568,7 @@ class Nosible:
573
568
  ValueError: Search can not have more than 100 results - Use bulk search instead.
574
569
  """
575
570
  # --------------------------------------------------------------------------------------------------------------
576
- # Setting search params. Individual search will overide Nosible defaults.
571
+ # Setting search params. Individual search will override Nosible defaults.
577
572
  # --------------------------------------------------------------------------------------------------------------
578
573
  question = search_obj.question # No default
579
574
  expansions = search_obj.expansions if search_obj.expansions is not None else [] # Default to empty list
@@ -582,7 +577,9 @@ class Nosible:
582
577
  n_probes = search_obj.n_probes if search_obj.n_probes is not None else 30
583
578
  n_contextify = search_obj.n_contextify if search_obj.n_contextify is not None else 128
584
579
  algorithm = search_obj.algorithm if search_obj.algorithm is not None else "hybrid-2"
585
- autogenerate_expansions = search_obj.autogenerate_expansions if search_obj.autogenerate_expansions is not None else False
580
+ autogenerate_expansions = (
581
+ search_obj.autogenerate_expansions if search_obj.autogenerate_expansions is not None else False
582
+ )
586
583
  publish_start = search_obj.publish_start if search_obj.publish_start is not None else self.publish_start
587
584
  publish_end = search_obj.publish_end if search_obj.publish_end is not None else self.publish_end
588
585
  include_netlocs = search_obj.include_netlocs if search_obj.include_netlocs is not None else self.include_netlocs
@@ -728,46 +725,42 @@ class Nosible:
728
725
  Optional list of expanded query strings.
729
726
  sql_filter : list of str, optional
730
727
  Optional SQL WHERE clause filters.
731
- n_results : int, default=100
728
+ n_results : int
732
729
  Number of results per query (1,000–10,000).
733
- n_probes : int, default=30
730
+ n_probes : int
734
731
  Number of shards to probe.
735
- n_contextify : int, default=128
732
+ n_contextify : int
736
733
  Context window size per result.
737
- algorithm : str, default="hybrid-2"
734
+ algorithm : str
738
735
  Search algorithm identifier.
739
- autogenerate_expansions : bool, default=False
736
+ autogenerate_expansions : bool
740
737
  Whether to generate expansions automatically using an LLM.
741
738
  publish_start : str, optional
742
- Filter for earliest publish date.
739
+ Start date for when the document was published (ISO format).
743
740
  publish_end : str, optional
744
- Filter for latest publish date.
745
- include_netlocs : list of str, optional
746
- Domains to include.
747
- exclude_netlocs : list of str, optional
748
- Domains to exclude.
741
+ End date for when the document was published (ISO format).
749
742
  visited_start : str, optional
750
- Filter for earliest visit date.
743
+ Start date for when the document was visited by NOSIBLE (ISO format).
751
744
  visited_end : str, optional
752
- Filter for latest visit date.
745
+ End date for when the document was visited by NOSIBLE (ISO format).
753
746
  certain : bool, optional
754
- True if we are 100% sure of the date.
755
- include_languages : list of str, optional
756
- Languages to include (Max: 50).
757
- exclude_languages : list of str, optional
758
- Languages to exclude (Max: 50).
747
+ Only include documents where we are 100% sure of the date.
759
748
  include_netlocs : list of str, optional
760
- Only include results from these domains (Max: 50).
749
+ List of netlocs (domains) to include in the search. (Max: 50)
761
750
  exclude_netlocs : list of str, optional
762
- Exclude results from these domains (Max: 50).
751
+ List of netlocs (domains) to exclude in the search. (Max: 50)
752
+ include_languages : list of str, optional
753
+ Languages to include in the search. (Max: 50, ISO 639-1 language codes).
754
+ exclude_languages : list of str, optional
755
+ Language codes to exclude in the search (Max: 50, ISO 639-1 language codes).
763
756
  include_companies : list of str, optional
764
- Company IDs to require (Max: 50).
757
+ Google KG IDs of public companies to require (Max: 50).
765
758
  exclude_companies : list of str, optional
766
- Company IDs to forbid (Max: 50).
759
+ Google KG IDs of public companies to forbid (Max: 50).
767
760
  include_docs : list of str, optional
768
- URL hashes of documents to include (Max: 50).
761
+ URL hashes of docs to include (Max: 50).
769
762
  exclude_docs : list of str, optional
770
- URL hashes of documents to exclude (Max: 50).
763
+ URL hashes of docs to exclude (Max: 50).
771
764
  verbose : bool, optional
772
765
  Show verbose output; bulk search will print more information.
773
766
 
@@ -794,23 +787,21 @@ class Nosible:
794
787
 
795
788
  Examples
796
789
  --------
797
- >>> from nosible.classes.search import Search
798
- >>> from nosible import Nosible
799
- >>> with Nosible(include_netlocs=["bbc.com"]) as nos: # doctest: +SKIP
800
- ... results = nos.bulk_search(question="Nvidia insiders dump more than $1 billion in stock", n_results=2000) # doctest: +SKIP
790
+ >>> from nosible.classes.search import Search # doctest: +SKIP
791
+ >>> from nosible import Nosible # doctest: +SKIP
792
+ >>> with Nosible(exclude_netlocs=["bbc.com"]) as nos: # doctest: +SKIP
793
+ ... results = nos.bulk_search(question=_get_question(), n_results=2000) # doctest: +SKIP
801
794
  ... print(isinstance(results, ResultSet)) # doctest: +SKIP
802
795
  ... print(len(results)) # doctest: +SKIP
803
796
  True
804
797
  2000
805
-
806
- >>> s = Search(question="OpenAI", n_results=1000) # doctest: +SKIP
798
+ >>> s = Search(question=_get_question(), n_results=1000) # doctest: +SKIP
807
799
  >>> with Nosible() as nos: # doctest: +SKIP
808
800
  ... results = nos.bulk_search(search=s) # doctest: +SKIP
809
801
  ... print(isinstance(results, ResultSet)) # doctest: +SKIP
810
802
  ... print(len(results)) # doctest: +SKIP
811
803
  True
812
804
  1000
813
-
814
805
  >>> nos = Nosible(nosible_api_key="test|xyz") # doctest: +SKIP
815
806
  >>> nos.bulk_search() # doctest: +SKIP
816
807
  Traceback (most recent call last):
@@ -818,20 +809,18 @@ class Nosible:
818
809
  TypeError: Either question or search must be specified
819
810
 
820
811
  >>> nos = Nosible(nosible_api_key="test|xyz") # doctest: +SKIP
821
- >>> nos.bulk_search(question="foo", search=Search(question="foo")) # doctest: +SKIP
812
+ >>> nos.bulk_search(question=_get_question(), search=Search(question=_get_question())) # doctest: +SKIP
822
813
  Traceback (most recent call last):
823
814
  ...
824
815
  TypeError: Question and search cannot be both specified
825
-
826
816
  >>> nos = Nosible(nosible_api_key="test|xyz") # doctest: +SKIP
827
- >>> nos.bulk_search(question="foo", n_results=100) # doctest: +SKIP
817
+ >>> nos.bulk_search(question=_get_question(), n_results=100) # doctest: +SKIP
828
818
  Traceback (most recent call last):
829
819
  ...
830
- ValueError: Bulk search must have at least 100 results per query; use search() for smaller result sets.
831
-
820
+ ValueError: Bulk search must have at least 1000 results per query; use search() for smaller result sets.
832
821
  >>> nos = Nosible(nosible_api_key="test|xyz") # doctest: +SKIP
833
- >>> nos.bulk_search(question="foo", n_results=10001) # doctest: +SKIP
834
- Traceback (most recent call last):
822
+ >>> nos.bulk_search(question=_get_question(), n_results=10001) # doctest: +SKIP
823
+ Traceback (most recent call last):
835
824
  ...
836
825
  ValueError: Bulk search cannot have more than 10000 results per query.
837
826
  """
@@ -854,8 +843,11 @@ class Nosible:
854
843
  n_probes = search.n_probes if search.n_probes is not None else n_probes
855
844
  n_contextify = search.n_contextify if search.n_contextify is not None else n_contextify
856
845
  algorithm = search.algorithm if search.algorithm is not None else algorithm
857
- autogenerate_expansions = search.autogenerate_expansions if search.autogenerate_expansions is not None \
846
+ autogenerate_expansions = (
847
+ search.autogenerate_expansions
848
+ if search.autogenerate_expansions is not None
858
849
  else autogenerate_expansions
850
+ )
859
851
  publish_start = search.publish_start if search.publish_start is not None else publish_start
860
852
  publish_end = search.publish_end if search.publish_end is not None else publish_end
861
853
  include_netlocs = search.include_netlocs if search.include_netlocs is not None else include_netlocs
@@ -959,13 +951,13 @@ class Nosible:
959
951
 
960
952
  Parameters
961
953
  ----------
962
- html : str, default=""
954
+ html : str
963
955
  Raw HTML to process instead of fetching.
964
- recrawl : bool, default=False
956
+ recrawl : bool
965
957
  If True, force a fresh crawl.
966
- render : bool, default=False
958
+ render : bool
967
959
  If True, allow JavaScript rendering before extraction.
968
- url : str, default=None
960
+ url : str
969
961
  The URL to fetch and parse.
970
962
 
971
963
  Returns
@@ -986,26 +978,24 @@ class Nosible:
986
978
 
987
979
  Examples
988
980
  --------
989
- >>> from nosible import Nosible # doctest: +SKIP
990
- >>> with Nosible() as nos: # doctest: +SKIP
991
- ... out = nos.visit(url="https://www.dailynewsegypt.com/2023/09/08/g20-and-its-summits/") # doctest: +SKIP
992
- ... print(isinstance(out, type(WebPageData))) # doctest: +SKIP
993
- ... print(hasattr(out, "languages")) # doctest: +SKIP
994
- ... print(hasattr(out, "page")) # doctest: +SKIP
981
+ >>> from nosible import Nosible
982
+ >>> with Nosible() as nos:
983
+ ... out = nos.visit(url="https://www.dailynewsegypt.com/2023/09/08/g20-and-its-summits/")
984
+ ... print(isinstance(out, WebPageData))
985
+ ... print(hasattr(out, "languages"))
986
+ ... print(hasattr(out, "page"))
995
987
  True
996
988
  True
997
989
  True
998
- >>> with Nosible() as nos: # doctest: +SKIP
999
- ... out = nos.visit() # doctest: +SKIP
1000
- ... print(isinstance(out, type(WebPageData))) # doctest: +SKIP
1001
- ... print(hasattr(out, "languages")) # doctest: +SKIP
1002
- ... print(hasattr(out, "page")) # doctest: +SKIP
990
+ >>> with Nosible() as nos:
991
+ ... out = nos.visit()
992
+ ... print(isinstance(out, type(WebPageData)))
993
+ ... print(hasattr(out, "languages"))
994
+ ... print(hasattr(out, "page")) # doctest: +ELLIPSIS
1003
995
  Traceback (most recent call last):
1004
996
  ...
1005
997
  TypeError: URL must be provided
1006
998
  """
1007
-
1008
- # self._enforce("visit")
1009
999
  if url is None:
1010
1000
  raise TypeError("URL must be provided")
1011
1001
  response = self._post(
@@ -1018,7 +1008,7 @@ class Nosible:
1018
1008
  self.logger.error(f"Failed to parse JSON from response: {e}")
1019
1009
  raise ValueError("Invalid JSON response from server") from e
1020
1010
 
1021
- if data == {'message': 'Sorry, the URL could not be fetched.'}:
1011
+ if data == {"message": "Sorry, the URL could not be fetched."}:
1022
1012
  raise ValueError("The URL could not be found.")
1023
1013
 
1024
1014
  if "response" not in data:
@@ -1033,7 +1023,7 @@ class Nosible:
1033
1023
  metadata=response_data.get("metadata"),
1034
1024
  page=response_data.get("page"),
1035
1025
  request=response_data.get("request"),
1036
- snippets=response_data.get("snippets"),
1026
+ snippets=SnippetSet.from_dict(response_data.get("snippets", {})),
1037
1027
  statistics=response_data.get("statistics"),
1038
1028
  structured=response_data.get("structured"),
1039
1029
  url_tree=response_data.get("url_tree"),
@@ -1097,10 +1087,6 @@ class Nosible:
1097
1087
 
1098
1088
  Raises
1099
1089
  ------
1100
- ValueError
1101
- If the API returns an unexpected message.
1102
- requests.HTTPError
1103
- If the HTTP request fails.
1104
1090
 
1105
1091
  Examples
1106
1092
  --------
@@ -1125,6 +1111,7 @@ class Nosible:
1125
1111
  return False
1126
1112
  except:
1127
1113
  return False
1114
+
1128
1115
  def preflight(self, url: str = None) -> str:
1129
1116
  """
1130
1117
  Run a preflight check for crawling/preprocessing on a URL.
@@ -1180,40 +1167,47 @@ class Nosible:
1180
1167
 
1181
1168
  Examples
1182
1169
  --------
1183
- >>> nos = Nosible(nosible_api_key="test|xyz") # doctest: +SKIP
1184
- >>> print(nos.get_rate_limits()) # doctest: +SKIP
1185
- Free (Your current plan)
1186
- | Endpoint | Per Month | Per Day | Per Minute |
1187
- | ----------- | --------- | ------- | ---------- |
1188
- | Fast Search | 3 000 | 100 | 10 |
1189
- | URL Visits | 300 | 10 | 1 |
1190
- | Slow Search | 300 | 10 | 1 |
1191
-
1192
- Basic
1193
- | Endpoint | Per Month | Per Day | Per Minute |
1170
+ >>> nos = Nosible(nosible_api_key="test|xyz")
1171
+ >>> print(nos.get_rate_limits()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
1172
+ Below are the rate limits for all NOSIBLE plans.
1173
+ To upgrade your package, visit https://www.nosible.ai/products.
1174
+ <BLANKLINE>
1175
+ Unless otherwise indicated, bulk searches are limited to one-at-a-time per API key.
1176
+ <BLANKLINE>
1177
+ Free: (Your current plan)
1178
+ | Endpoint | Per Month | Per Minute | Effective CPM |
1179
+ | ----------- | --------- | ---------- | ------------- |
1180
+ | Search | 3000 | 60 | $4.00 |
1181
+ | URL Visits | 300 | 60 | $4.00 |
1182
+ | Bulk Search | 300 | 60 | $4.00 |
1183
+ <BLANKLINE>
1184
+ Basic ($49p/m):
1185
+ | Endpoint | Per Month | Per Minute | Effective CPM |
1194
1186
  ...
1195
1187
  """
1196
1188
  # Human-friendly plan names
1197
1189
  display = {
1198
1190
  "test": "Free",
1199
- "basic": "Basic",
1200
- "pro": "Pro",
1201
- "pro+": "Pro+",
1202
- "bus": "Business",
1203
- "bus+": "Business+",
1204
- "ent": "Enterprise",
1191
+ "basic": "Basic ($49p/m)",
1192
+ "pro": "Pro ($199p/m)",
1193
+ "pro+": "Pro+ ($799p/m)",
1194
+ "bus": "Business ($3999p/m)",
1195
+ "bus+": "Business+ ($7499p/m)",
1196
+ "ent": "Enterprise ($14999p/m)",
1205
1197
  }
1206
1198
 
1207
1199
  # Human-friendly endpoint names
1208
- endpoint_name = {"fast": "Fast Search", "visit": "URL Visits", "slow": "Bulk Search"}
1200
+ endpoint_name = {"fast": "Search", "visit": "URL Visits", "slow": "Bulk Search"}
1209
1201
 
1210
1202
  out = [
1211
1203
  "Below are the rate limits for all NOSIBLE plans.",
1212
1204
  "To upgrade your package, visit https://www.nosible.ai/products.\n",
1205
+ "Unless otherwise indicated, bulk searches are limited to one-at-a-time per API key.\n"
1213
1206
  ]
1214
1207
 
1215
1208
  user_plan = self._get_user_plan()
1216
1209
  current_plan = ""
1210
+ cpm_counter = 4.0
1217
1211
 
1218
1212
  # Preserve the order you care about:
1219
1213
  for plan in ["test", "basic", "pro", "pro+", "bus", "bus+", "ent"]:
@@ -1222,17 +1216,19 @@ class Nosible:
1222
1216
  current_plan = " (Your current plan)"
1223
1217
 
1224
1218
  out.append(f"{name}:{current_plan}")
1225
- out.append("| Endpoint | Per Month | Per Day | Per Minute |")
1226
- out.append("| ----------- | --------- | ------- | ---------- |")
1219
+ out.append("| Endpoint | Per Month | Per Minute | Effective CPM |")
1220
+ out.append("| ----------- | --------- | ---------- | ------------- |")
1227
1221
 
1228
1222
  for ep in ["fast", "visit", "slow"]:
1229
1223
  buckets = PLAN_RATE_LIMITS[plan][ep]
1230
1224
  # Find minute & day
1231
1225
  minute = next(limit for limit, i in buckets if i == 60)
1232
- day = next(limit for limit, i in buckets if i == 24 * 3600)
1233
- month = day * 30
1234
- out.append(f"| {endpoint_name[ep]:<11} | {month:>9} | {day:>7} | {minute:>10} |")
1226
+ month = next(limit for limit, i in buckets if i == 24 * 3600 * 30)
1227
+ cpm = f"${cpm_counter:.2f}"
1228
+
1229
+ out.append(f"| {endpoint_name[ep]:<11} | {month:>9} | {minute:>10} | {cpm:>13} |")
1235
1230
 
1231
+ cpm_counter = cpm_counter - 0.5
1236
1232
  out.append("") # Blank line
1237
1233
  current_plan = ""
1238
1234
 
@@ -1243,10 +1239,6 @@ class Nosible:
1243
1239
  Close the Nosible client, shutting down the HTTP session
1244
1240
  and thread pool to release network and threading resources.
1245
1241
 
1246
- Returns
1247
- -------
1248
- None
1249
-
1250
1242
  Examples
1251
1243
  --------
1252
1244
  >>> from nosible import Nosible
@@ -1292,6 +1284,8 @@ class Nosible:
1292
1284
  If the user API key is invalid.
1293
1285
  ValueError
1294
1286
  If the user hits their rate limit.
1287
+ ValueError
1288
+ If the user is making too many concurrent searches.
1295
1289
  ValueError
1296
1290
  If an unexpected error occurs.
1297
1291
  ValueError
@@ -1325,6 +1319,8 @@ class Nosible:
1325
1319
  raise ValueError("You made a bad request.")
1326
1320
  if response.status_code == 429:
1327
1321
  raise ValueError("You have hit your rate limit.")
1322
+ if response.status_code == 409:
1323
+ raise ValueError("Too many concurrent searches.")
1328
1324
  if response.status_code == 500:
1329
1325
  raise ValueError("An unexpected error occurred.")
1330
1326
  if response.status_code == 502:
@@ -1354,16 +1350,16 @@ class Nosible:
1354
1350
 
1355
1351
  Examples
1356
1352
  --------
1357
- >>> nos = Nosible(nosible_api_key="test+|xyz") # doctest: +SKIP
1353
+ >>> nos = Nosible(nosible_api_key="test+|xyz") # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
1358
1354
  Traceback (most recent call last):
1359
1355
  ...
1360
- ValueError: test+ is not a valid plan prefix, your API key is invalid.
1356
+ ValueError: Your API key is not valid: test+ is not a valid plan prefix.
1361
1357
  """
1362
1358
  # Split off anything after the first '|'
1363
1359
  prefix = (self.nosible_api_key or "").split("|", 1)[0]
1364
1360
 
1365
- # Map prefixes -> human-friendly plan names
1366
- plans = {"test", "basic", "pro", "pro+", "bus", "bus+", "ent"}
1361
+ # Map prefixes -> plan names
1362
+ plans = {"test", "basic", "pro", "pro+", "bus", "bus+", "ent", "chat"}
1367
1363
 
1368
1364
  if prefix not in plans:
1369
1365
  raise ValueError(f"Your API key is not valid: {prefix} is not a valid plan prefix.")
@@ -1393,11 +1389,10 @@ class Nosible:
1393
1389
 
1394
1390
  Examples
1395
1391
  --------
1396
-
1397
- >>> from nosible import Nosible # doctest: +SKIP
1398
- >>> nos = Nosible(llm_api_key=None) # doctest: +SKIP
1399
- >>> nos.llm_api_key = None # doctest: +SKIP
1400
- >>> nos._generate_expansions("anything") # doctest: +SKIP
1392
+ >>> from nosible import Nosible
1393
+ >>> nos = Nosible(llm_api_key=None)
1394
+ >>> nos.llm_api_key = None
1395
+ >>> nos._generate_expansions("anything") # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
1401
1396
  Traceback (most recent call last):
1402
1397
  ...
1403
1398
  ValueError: LLM API key is required for generating expansions.
@@ -1508,35 +1503,31 @@ class Nosible:
1508
1503
  Parameters
1509
1504
  ----------
1510
1505
  publish_start : str, optional
1511
- Earliest published date filter.
1506
+ Start date for when the document was published (ISO format).
1512
1507
  publish_end : str, optional
1513
- Latest published date filter.
1514
- include_netlocs : list of str, optional
1515
- Domains to whitelist.
1516
- exclude_netlocs : list of str, optional
1517
- Domains to blacklist.
1508
+ End date for when the document was published (ISO format).
1518
1509
  visited_start : str, optional
1519
- Earliest visit date filter.
1510
+ Start date for when the document was visited by NOSIBLE (ISO format).
1520
1511
  visited_end : str, optional
1521
- Latest visit date filter.
1512
+ End date for when the document was visited by NOSIBLE (ISO format).
1522
1513
  certain : bool, optional
1523
- True if we are 100% sure of the date.
1524
- include_languages : list of str, optional
1525
- Languages to include (Max: 50).
1526
- exclude_languages : list of str, optional
1527
- Languages to exclude (Max: 50).
1514
+ Only include documents where we are 100% sure of the date.
1528
1515
  include_netlocs : list of str, optional
1529
- Only include results from these domains (Max: 50).
1516
+ List of netlocs (domains) to include in the search. (Max: 50)
1530
1517
  exclude_netlocs : list of str, optional
1531
- Exclude results from these domains (Max: 50).
1518
+ List of netlocs (domains) to exclude from the search. (Max: 50)
1519
+ include_languages : list of str, optional
1520
+ Language codes to include in the search (Max: 50, ISO 639-1 language codes).
1521
+ exclude_languages : list of str, optional
1522
+ Language codes to exclude from the search (Max: 50, ISO 639-1 language codes).
1532
1523
  include_companies : list of str, optional
1533
- Public Company Google KG IDs to require (Max: 50).
1524
+ Google KG IDs of public companies to require (Max: 50).
1534
1525
  exclude_companies : list of str, optional
1535
- Public Company Google KG IDs to forbid (Max: 50).
1526
+ Google KG IDs of public companies to forbid (Max: 50).
1536
1527
  include_docs : list of str, optional
1537
- URL hashes of documents to include (Max: 50).
1528
+ URL hashes of docs to include (Max: 50).
1538
1529
  exclude_docs : list of str, optional
1539
- URL hashes of documents to exclude (Max: 50).
1530
+ URL hashes of docs to exclude (Max: 50).
1540
1531
 
1541
1532
  Returns
1542
1533
  -------
@@ -1545,20 +1536,19 @@ class Nosible:
1545
1536
 
1546
1537
  Raises
1547
1538
  ------
1548
-
1549
1539
  ValueError
1550
1540
  If more than 50 items in a filter are given.
1551
1541
  """
1552
1542
  # Validate list lengths
1553
1543
  for name, lst in [
1554
- ('include_netlocs', include_netlocs),
1555
- ('exclude_netlocs', exclude_netlocs),
1556
- ('include_languages', include_languages),
1557
- ('exclude_languages', exclude_languages),
1558
- ('include_companies', include_companies),
1559
- ('exclude_companies', exclude_companies),
1560
- ('include_docs', include_docs),
1561
- ('exclude_docs', exclude_docs),
1544
+ ("include_netlocs", include_netlocs),
1545
+ ("exclude_netlocs", exclude_netlocs),
1546
+ ("include_languages", include_languages),
1547
+ ("exclude_languages", exclude_languages),
1548
+ ("include_companies", include_companies),
1549
+ ("exclude_companies", exclude_companies),
1550
+ ("include_docs", include_docs),
1551
+ ("exclude_docs", exclude_docs),
1562
1552
  ]:
1563
1553
  if lst is not None and len(lst) > 50:
1564
1554
  raise ValueError(f"Too many items for '{name}' filter ({len(lst)}); maximum allowed is 50.")
@@ -1595,10 +1585,10 @@ class Nosible:
1595
1585
  variants = set()
1596
1586
  for n in include_netlocs:
1597
1587
  variants.add(n)
1598
- if n.startswith('www.'):
1588
+ if n.startswith("www."):
1599
1589
  variants.add(n[4:])
1600
1590
  else:
1601
- variants.add('www.' + n)
1591
+ variants.add("www." + n)
1602
1592
  in_list = ", ".join(f"'{v}'" for v in sorted(variants))
1603
1593
  clauses.append(f"netloc IN ({in_list})")
1604
1594
 
@@ -1607,10 +1597,10 @@ class Nosible:
1607
1597
  variants = set()
1608
1598
  for n in exclude_netlocs:
1609
1599
  variants.add(n)
1610
- if n.startswith('www.'):
1600
+ if n.startswith("www."):
1611
1601
  variants.add(n[4:])
1612
1602
  else:
1613
- variants.add('www.' + n)
1603
+ variants.add("www." + n)
1614
1604
  ex_list = ", ".join(f"'{v}'" for v in sorted(variants))
1615
1605
  clauses.append(f"netloc NOT IN ({ex_list})")
1616
1606
 
@@ -1703,7 +1693,7 @@ class Nosible:
1703
1693
  except Exception:
1704
1694
  return False
1705
1695
 
1706
- def __enter__(self):
1696
+ def __enter__(self) -> "Nosible":
1707
1697
  """
1708
1698
  Enter the context manager, returning this client instance.
1709
1699
 
@@ -1714,32 +1704,42 @@ class Nosible:
1714
1704
  """
1715
1705
  return self
1716
1706
 
1717
- def __exit__(self, exc_type: type, exc: Exception, tb: traceback):
1707
+ def __exit__(
1708
+ self,
1709
+ _exc_type: typing.Optional[type[BaseException]],
1710
+ _exc_val: typing.Optional[BaseException],
1711
+ _exc_tb: typing.Optional[types.TracebackType],
1712
+ ) -> typing.Optional[bool]:
1718
1713
  """
1719
- Exit the context manager, ensuring cleanup of resources.
1714
+ Exit the context manager: always call self.close(), then let any exception
1715
+ raised inside the with-block propagate (this method returns False, never True).
1720
1716
 
1721
1717
  Parameters
1722
1718
  ----------
1723
- exc_type : type or None
1724
- Exception type if raised.
1725
- exc : Exception or None
1726
- Exception instance if raised.
1727
- tb : traceback or None
1728
- Traceback if exception was raised.
1719
+ _exc_type : Optional[type[BaseException]]
1720
+ The type of the exception raised, if any.
1721
+ _exc_val : Optional[BaseException]
1722
+ The exception instance, if any.
1723
+ _exc_tb : Optional[types.TracebackType]
1724
+ The traceback object, if any.
1729
1725
 
1730
1726
  Returns
1731
1727
  -------
1732
- None
1728
+ Optional[bool]
1729
+ False to propagate exceptions, True to suppress them.
1733
1730
  """
1734
- self.close()
1731
+ try:
1732
+ self.close()
1733
+ except Exception as cleanup_err:
1734
+ # Report the cleanup failure, but don't hide the original exception from the with-block.
1735
+ print(f"Cleanup failed: {cleanup_err!r}")
1736
+ # Return False (or None) => exceptions inside the with‐block are re-raised.
1737
+ return False
1735
1738
 
1736
1739
  def __del__(self):
1737
1740
  """
1738
1741
  Destructor to ensure resources are cleaned up if not explicitly closed.
1739
1742
 
1740
- Returns
1741
- -------
1742
- None
1743
1743
  """
1744
1744
  # Ensure it's called
1745
1745
  self.close()