nosible 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nosible/classes/result.py +65 -106
- nosible/classes/result_set.py +119 -113
- nosible/classes/search.py +68 -89
- nosible/classes/search_set.py +27 -12
- nosible/classes/snippet.py +57 -74
- nosible/classes/snippet_set.py +62 -63
- nosible/classes/web_page.py +39 -103
- nosible/nosible_client.py +224 -224
- nosible/utils/json_tools.py +51 -2
- nosible/utils/question_builder.py +131 -0
- nosible/utils/rate_limiter.py +30 -24
- {nosible-0.1.8.dist-info → nosible-0.1.9.dist-info}/METADATA +9 -45
- nosible-0.1.9.dist-info/RECORD +17 -0
- nosible-0.1.8.dist-info/RECORD +0 -16
- {nosible-0.1.8.dist-info → nosible-0.1.9.dist-info}/WHEEL +0 -0
- {nosible-0.1.8.dist-info → nosible-0.1.9.dist-info}/licenses/LICENSE +0 -0
- {nosible-0.1.8.dist-info → nosible-0.1.9.dist-info}/top_level.txt +0 -0
nosible/nosible_client.py
CHANGED
|
@@ -3,9 +3,10 @@ import json
|
|
|
3
3
|
import logging
|
|
4
4
|
import os
|
|
5
5
|
import time
|
|
6
|
-
import
|
|
6
|
+
import types
|
|
7
|
+
import typing
|
|
7
8
|
from collections.abc import Iterator
|
|
8
|
-
from concurrent.futures import ThreadPoolExecutor
|
|
9
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
9
10
|
from typing import Union
|
|
10
11
|
|
|
11
12
|
import polars as pl
|
|
@@ -25,8 +26,10 @@ from tenacity import (
|
|
|
25
26
|
from nosible.classes.result_set import ResultSet
|
|
26
27
|
from nosible.classes.search import Search
|
|
27
28
|
from nosible.classes.search_set import SearchSet
|
|
29
|
+
from nosible.classes.snippet_set import SnippetSet
|
|
28
30
|
from nosible.classes.web_page import WebPageData
|
|
29
31
|
from nosible.utils.json_tools import json_loads
|
|
32
|
+
from nosible.utils.question_builder import _get_question
|
|
30
33
|
from nosible.utils.rate_limiter import PLAN_RATE_LIMITS, RateLimiter, _rate_limited
|
|
31
34
|
|
|
32
35
|
# Set up a module‐level logger.
|
|
@@ -47,37 +50,33 @@ class Nosible:
|
|
|
47
50
|
llm_api_key : str, optional
|
|
48
51
|
API key for LLM-based query expansions.
|
|
49
52
|
openai_base_url : str
|
|
50
|
-
Base URL for the OpenAI-compatible LLM API.
|
|
51
|
-
sentiment_model : str
|
|
52
|
-
Model to use for sentiment analysis
|
|
53
|
+
Base URL for the OpenAI-compatible LLM API. (default is OpenRouter's API endpoint)
|
|
54
|
+
sentiment_model : str, optional
|
|
55
|
+
Model to use for sentiment analysis (default is "openai/gpt-4o").
|
|
53
56
|
timeout : int
|
|
54
57
|
Request timeout for HTTP calls.
|
|
55
|
-
retries : int,
|
|
58
|
+
retries : int,
|
|
56
59
|
Number of retry attempts for transient HTTP errors.
|
|
57
|
-
concurrency : int,
|
|
60
|
+
concurrency : int,
|
|
58
61
|
Maximum concurrent search requests.
|
|
59
62
|
publish_start : str, optional
|
|
60
|
-
|
|
63
|
+
Start date for when the document was published (ISO format).
|
|
61
64
|
publish_end : str, optional
|
|
62
|
-
|
|
63
|
-
include_netlocs : list of str, optional
|
|
64
|
-
Domains to include.
|
|
65
|
-
exclude_netlocs : list of str, optional
|
|
66
|
-
Domains to exclude.
|
|
65
|
+
End date for when the document was published (ISO format).
|
|
67
66
|
visited_start : str, optional
|
|
68
|
-
|
|
67
|
+
Start date for when the document was visited by NOSIBLE (ISO format).
|
|
69
68
|
visited_end : str, optional
|
|
70
|
-
|
|
69
|
+
End date for when the document was visited by NOSIBLE (ISO format).
|
|
71
70
|
certain : bool, optional
|
|
72
|
-
|
|
73
|
-
include_languages : list of str, optional
|
|
74
|
-
Language codes to include (Max: 50).
|
|
75
|
-
exclude_languages : list of str, optional
|
|
76
|
-
Language codes to exclude (Max: 50).
|
|
71
|
+
Only include documents where we are 100% sure of the date.
|
|
77
72
|
include_netlocs : list of str, optional
|
|
78
|
-
|
|
73
|
+
List of netlocs (domains) to include in the search. (Max: 50)
|
|
79
74
|
exclude_netlocs : list of str, optional
|
|
80
|
-
|
|
75
|
+
List of netlocs (domains) to exclude in the search. (Max: 50)
|
|
76
|
+
include_languages : list of str, optional
|
|
77
|
+
Languages to include in the search. (Max: 50, ISO 639-1 language codes).
|
|
78
|
+
exclude_languages : list of str, optional
|
|
79
|
+
Language codes to exclude in the search (Max: 50, ISO 639-1 language codes).
|
|
81
80
|
include_companies : list of str, optional
|
|
82
81
|
Google KG IDs of public companies to require (Max: 50).
|
|
83
82
|
exclude_companies : list of str, optional
|
|
@@ -86,10 +85,6 @@ class Nosible:
|
|
|
86
85
|
URL hashes of docs to include (Max: 50).
|
|
87
86
|
exclude_docs : list of str, optional
|
|
88
87
|
URL hashes of docs to exclude (Max: 50).
|
|
89
|
-
openai_base_url : str, optional
|
|
90
|
-
Base URL for the OpenAI API (default is OpenRouter).
|
|
91
|
-
sentiment_model : str, optional
|
|
92
|
-
Model to use for sentiment analysis (default is "openai/gpt-4o").
|
|
93
88
|
|
|
94
89
|
Notes
|
|
95
90
|
-----
|
|
@@ -243,38 +238,34 @@ class Nosible:
|
|
|
243
238
|
List of LLM‐generated expansions.
|
|
244
239
|
sql_filter : list of str, optional
|
|
245
240
|
SQL‐style filter clauses.
|
|
246
|
-
n_results : int
|
|
241
|
+
n_results : int
|
|
247
242
|
Max number of results (max 100).
|
|
248
|
-
n_probes : int
|
|
243
|
+
n_probes : int
|
|
249
244
|
Number of index shards to probe.
|
|
250
|
-
n_contextify : int
|
|
245
|
+
n_contextify : int
|
|
251
246
|
Context window size per result.
|
|
252
|
-
algorithm : str
|
|
247
|
+
algorithm : str
|
|
253
248
|
Search algorithm type.
|
|
254
|
-
autogenerate_expansions : bool
|
|
249
|
+
autogenerate_expansions : bool
|
|
255
250
|
Do you want to generate expansions automatically using a LLM?
|
|
256
251
|
publish_start : str, optional
|
|
257
|
-
|
|
252
|
+
Start date for when the document was published (ISO format).
|
|
258
253
|
publish_end : str, optional
|
|
259
|
-
|
|
260
|
-
include_netlocs : list of str, optional
|
|
261
|
-
Domains to include.
|
|
262
|
-
exclude_netlocs : list of str, optional
|
|
263
|
-
Domains to exclude.
|
|
254
|
+
End date for when the document was published (ISO format).
|
|
264
255
|
visited_start : str, optional
|
|
265
|
-
|
|
256
|
+
Start date for when the document was visited by NOSIBLE (ISO format).
|
|
266
257
|
visited_end : str, optional
|
|
267
|
-
|
|
258
|
+
End date for when the document was visited by NOSIBLE (ISO format).
|
|
268
259
|
certain : bool, optional
|
|
269
|
-
|
|
270
|
-
include_languages : list of str, optional
|
|
271
|
-
Language codes to include (Max: 50).
|
|
272
|
-
exclude_languages : list of str, optional
|
|
273
|
-
Language codes to exclude (Max: 50).
|
|
260
|
+
Only include documents where we are 100% sure of the date.
|
|
274
261
|
include_netlocs : list of str, optional
|
|
275
|
-
|
|
262
|
+
List of netlocs (domains) to include in the search. (Max: 50)
|
|
276
263
|
exclude_netlocs : list of str, optional
|
|
277
|
-
|
|
264
|
+
List of netlocs (domains) to exclude in the search. (Max: 50)
|
|
265
|
+
include_languages : list of str, optional
|
|
266
|
+
Languages to include in the search. (Max: 50, ISO 639-1 language codes).
|
|
267
|
+
exclude_languages : list of str, optional
|
|
268
|
+
Language codes to exclude in the search (Max: 50, ISO 639-1 language codes).
|
|
278
269
|
include_companies : list of str, optional
|
|
279
270
|
Google KG IDs of public companies to require (Max: 50).
|
|
280
271
|
exclude_companies : list of str, optional
|
|
@@ -297,6 +288,8 @@ class Nosible:
|
|
|
297
288
|
If neither question nor search are specified
|
|
298
289
|
RuntimeError
|
|
299
290
|
If the response fails in any way.
|
|
291
|
+
ValueError
|
|
292
|
+
If `n_results` is greater than 100.
|
|
300
293
|
|
|
301
294
|
Notes
|
|
302
295
|
-----
|
|
@@ -407,48 +400,44 @@ class Nosible:
|
|
|
407
400
|
List of expansion terms to use for each search.
|
|
408
401
|
sql_filter : list of str, optional
|
|
409
402
|
SQL-like filters to apply to the search.
|
|
410
|
-
n_results : int
|
|
403
|
+
n_results : int
|
|
411
404
|
Number of results to return per search.
|
|
412
|
-
n_probes : int
|
|
405
|
+
n_probes : int
|
|
413
406
|
Number of probes to use for the search algorithm.
|
|
414
|
-
n_contextify : int
|
|
407
|
+
n_contextify : int
|
|
415
408
|
Context window size for the search.
|
|
416
|
-
algorithm : str
|
|
409
|
+
algorithm : str
|
|
417
410
|
Search algorithm to use.
|
|
418
|
-
autogenerate_expansions : bool
|
|
411
|
+
autogenerate_expansions : bool
|
|
419
412
|
Do you want to generate expansions automatically using a LLM?
|
|
420
413
|
publish_start : str, optional
|
|
421
|
-
|
|
414
|
+
Start date for when the document was published (ISO format).
|
|
422
415
|
publish_end : str, optional
|
|
423
|
-
|
|
424
|
-
include_netlocs : list of str, optional
|
|
425
|
-
Only include results from these domains.
|
|
426
|
-
exclude_netlocs : list of str, optional
|
|
427
|
-
Exclude results from these domains.
|
|
416
|
+
End date for when the document was published (ISO format).
|
|
428
417
|
visited_start : str, optional
|
|
429
|
-
|
|
418
|
+
Start date for when the document was visited by NOSIBLE (ISO format).
|
|
430
419
|
visited_end : str, optional
|
|
431
|
-
|
|
420
|
+
End date for when the document was visited by NOSIBLE (ISO format).
|
|
432
421
|
certain : bool, optional
|
|
433
|
-
Only include
|
|
422
|
+
Only include documents where we are 100% sure of the date.
|
|
423
|
+
include_netlocs : list of str, optional
|
|
424
|
+
List of netlocs (domains) to include in the search. (Max: 50)
|
|
425
|
+
exclude_netlocs : list of str, optional
|
|
426
|
+
List of netlocs (domains) to exclude in the search. (Max: 50)
|
|
434
427
|
include_languages : list of str, optional
|
|
435
|
-
|
|
428
|
+
Languages to include in the search. (Max: 50, ISO 639-1 language codes).
|
|
436
429
|
exclude_languages : list of str, optional
|
|
437
|
-
|
|
430
|
+
Language codes to exclude in the search (Max: 50, ISO 639-1 language codes).
|
|
438
431
|
include_companies : list of str, optional
|
|
439
|
-
|
|
432
|
+
Google KG IDs of public companies to require (Max: 50).
|
|
440
433
|
exclude_companies : list of str, optional
|
|
441
|
-
|
|
442
|
-
include_netlocs : list of str, optional
|
|
443
|
-
Only include results from these domains (Max: 50).
|
|
444
|
-
exclude_netlocs : list of str, optional
|
|
445
|
-
Exclude results from these domains (Max: 50).
|
|
434
|
+
Google KG IDs of public companies to forbid (Max: 50).
|
|
446
435
|
include_docs : list of str, optional
|
|
447
|
-
URL hashes of
|
|
436
|
+
URL hashes of docs to include (Max: 50).
|
|
448
437
|
exclude_docs : list of str, optional
|
|
449
|
-
URL hashes of
|
|
438
|
+
URL hashes of docs to exclude (Max: 50).
|
|
450
439
|
|
|
451
|
-
|
|
440
|
+
Returns
|
|
452
441
|
-------
|
|
453
442
|
ResultSet or None
|
|
454
443
|
Each completed search’s results, or None on failure.
|
|
@@ -461,8 +450,6 @@ class Nosible:
|
|
|
461
450
|
If both queries and searches are specified.
|
|
462
451
|
TypeError
|
|
463
452
|
If neither queries nor searches are specified.
|
|
464
|
-
RuntimeError
|
|
465
|
-
If the response fails in any way.
|
|
466
453
|
|
|
467
454
|
Notes
|
|
468
455
|
-----
|
|
@@ -473,7 +460,10 @@ class Nosible:
|
|
|
473
460
|
--------
|
|
474
461
|
>>> from nosible import Nosible
|
|
475
462
|
>>> queries = SearchSet(
|
|
476
|
-
... [
|
|
463
|
+
... [
|
|
464
|
+
... Search(question="Hedge funds seek to expand into private credit", n_results=5),
|
|
465
|
+
... Search(question="How have the Trump tariffs impacted the US economy?", n_results=5),
|
|
466
|
+
... ]
|
|
477
467
|
... )
|
|
478
468
|
>>> with Nosible() as nos:
|
|
479
469
|
... results_list = list(nos.searches(searches=queries))
|
|
@@ -484,10 +474,14 @@ class Nosible:
|
|
|
484
474
|
True True
|
|
485
475
|
True True
|
|
486
476
|
>>> with Nosible() as nos:
|
|
487
|
-
... results_list_str = list(
|
|
488
|
-
...
|
|
489
|
-
...
|
|
490
|
-
...
|
|
477
|
+
... results_list_str = list(
|
|
478
|
+
... nos.searches(
|
|
479
|
+
... questions=[
|
|
480
|
+
... "What are the terms of the partnership between Microsoft and OpenAI?",
|
|
481
|
+
... "What are the terms of the partnership between Volkswagen and Uber?",
|
|
482
|
+
... ]
|
|
483
|
+
... )
|
|
484
|
+
... )
|
|
491
485
|
>>> nos = Nosible(nosible_api_key="test|xyz") # doctest: +ELLIPSIS
|
|
492
486
|
>>> nos.searches() # doctest: +ELLIPSIS
|
|
493
487
|
Traceback (most recent call last):
|
|
@@ -539,6 +533,7 @@ class Nosible:
|
|
|
539
533
|
except Exception as e:
|
|
540
534
|
self.logger.warning(f"Search failed: {e!r}")
|
|
541
535
|
yield None
|
|
536
|
+
|
|
542
537
|
return _run_generator()
|
|
543
538
|
|
|
544
539
|
@_rate_limited("fast")
|
|
@@ -573,7 +568,7 @@ class Nosible:
|
|
|
573
568
|
ValueError: Search can not have more than 100 results - Use bulk search instead.
|
|
574
569
|
"""
|
|
575
570
|
# --------------------------------------------------------------------------------------------------------------
|
|
576
|
-
# Setting search params. Individual search will
|
|
571
|
+
# Setting search params. Individual search will override Nosible defaults.
|
|
577
572
|
# --------------------------------------------------------------------------------------------------------------
|
|
578
573
|
question = search_obj.question # No default
|
|
579
574
|
expansions = search_obj.expansions if search_obj.expansions is not None else [] # Default to empty list
|
|
@@ -582,7 +577,9 @@ class Nosible:
|
|
|
582
577
|
n_probes = search_obj.n_probes if search_obj.n_probes is not None else 30
|
|
583
578
|
n_contextify = search_obj.n_contextify if search_obj.n_contextify is not None else 128
|
|
584
579
|
algorithm = search_obj.algorithm if search_obj.algorithm is not None else "hybrid-2"
|
|
585
|
-
autogenerate_expansions =
|
|
580
|
+
autogenerate_expansions = (
|
|
581
|
+
search_obj.autogenerate_expansions if search_obj.autogenerate_expansions is not None else False
|
|
582
|
+
)
|
|
586
583
|
publish_start = search_obj.publish_start if search_obj.publish_start is not None else self.publish_start
|
|
587
584
|
publish_end = search_obj.publish_end if search_obj.publish_end is not None else self.publish_end
|
|
588
585
|
include_netlocs = search_obj.include_netlocs if search_obj.include_netlocs is not None else self.include_netlocs
|
|
@@ -728,46 +725,42 @@ class Nosible:
|
|
|
728
725
|
Optional list of expanded query strings.
|
|
729
726
|
sql_filter : list of str, optional
|
|
730
727
|
Optional SQL WHERE clause filters.
|
|
731
|
-
n_results : int
|
|
728
|
+
n_results : int
|
|
732
729
|
Number of results per query (1,000–10,000).
|
|
733
|
-
n_probes : int
|
|
730
|
+
n_probes : int
|
|
734
731
|
Number of shards to probe.
|
|
735
|
-
n_contextify : int
|
|
732
|
+
n_contextify : int
|
|
736
733
|
Context window size per result.
|
|
737
|
-
algorithm : str
|
|
734
|
+
algorithm : str
|
|
738
735
|
Search algorithm identifier.
|
|
739
|
-
autogenerate_expansions : bool
|
|
736
|
+
autogenerate_expansions : bool
|
|
740
737
|
Do you want to generate expansions automatically using a LLM?
|
|
741
738
|
publish_start : str, optional
|
|
742
|
-
|
|
739
|
+
Start date for when the document was published (ISO format).
|
|
743
740
|
publish_end : str, optional
|
|
744
|
-
|
|
745
|
-
include_netlocs : list of str, optional
|
|
746
|
-
Domains to include.
|
|
747
|
-
exclude_netlocs : list of str, optional
|
|
748
|
-
Domains to exclude.
|
|
741
|
+
End date for when the document was published (ISO format).
|
|
749
742
|
visited_start : str, optional
|
|
750
|
-
|
|
743
|
+
Start date for when the document was visited by NOSIBLE (ISO format).
|
|
751
744
|
visited_end : str, optional
|
|
752
|
-
|
|
745
|
+
End date for when the document was visited by NOSIBLE (ISO format).
|
|
753
746
|
certain : bool, optional
|
|
754
|
-
|
|
755
|
-
include_languages : list of str, optional
|
|
756
|
-
Languages to include (Max: 50).
|
|
757
|
-
exclude_languages : list of str, optional
|
|
758
|
-
Languages to exclude (Max: 50).
|
|
747
|
+
Only include documents where we are 100% sure of the date.
|
|
759
748
|
include_netlocs : list of str, optional
|
|
760
|
-
|
|
749
|
+
List of netlocs (domains) to include in the search. (Max: 50)
|
|
761
750
|
exclude_netlocs : list of str, optional
|
|
762
|
-
|
|
751
|
+
List of netlocs (domains) to exclude in the search. (Max: 50)
|
|
752
|
+
include_languages : list of str, optional
|
|
753
|
+
Languages to include in the search. (Max: 50, ISO 639-1 language codes).
|
|
754
|
+
exclude_languages : list of str, optional
|
|
755
|
+
Language codes to exclude in the search (Max: 50, ISO 639-1 language codes).
|
|
763
756
|
include_companies : list of str, optional
|
|
764
|
-
|
|
757
|
+
Google KG IDs of public companies to require (Max: 50).
|
|
765
758
|
exclude_companies : list of str, optional
|
|
766
|
-
|
|
759
|
+
Google KG IDs of public companies to forbid (Max: 50).
|
|
767
760
|
include_docs : list of str, optional
|
|
768
|
-
URL hashes of
|
|
761
|
+
URL hashes of docs to include (Max: 50).
|
|
769
762
|
exclude_docs : list of str, optional
|
|
770
|
-
URL hashes of
|
|
763
|
+
URL hashes of docs to exclude (Max: 50).
|
|
771
764
|
verbose : bool, optional
|
|
772
765
|
Show verbose output, Bulk search will print more information.
|
|
773
766
|
|
|
@@ -794,23 +787,21 @@ class Nosible:
|
|
|
794
787
|
|
|
795
788
|
Examples
|
|
796
789
|
--------
|
|
797
|
-
>>> from nosible.classes.search import Search
|
|
798
|
-
>>> from nosible import Nosible
|
|
799
|
-
>>> with Nosible(
|
|
800
|
-
... results = nos.bulk_search(question=
|
|
790
|
+
>>> from nosible.classes.search import Search # doctest: +SKIP
|
|
791
|
+
>>> from nosible import Nosible # doctest: +SKIP
|
|
792
|
+
>>> with Nosible(exclude_netlocs=["bbc.com"]) as nos: # doctest: +SKIP
|
|
793
|
+
... results = nos.bulk_search(question=_get_question(), n_results=2000) # doctest: +SKIP
|
|
801
794
|
... print(isinstance(results, ResultSet)) # doctest: +SKIP
|
|
802
795
|
... print(len(results)) # doctest: +SKIP
|
|
803
796
|
True
|
|
804
797
|
2000
|
|
805
|
-
|
|
806
|
-
>>> s = Search(question="OpenAI", n_results=1000) # doctest: +SKIP
|
|
798
|
+
>>> s = Search(question=_get_question(), n_results=1000) # doctest: +SKIP
|
|
807
799
|
>>> with Nosible() as nos: # doctest: +SKIP
|
|
808
800
|
... results = nos.bulk_search(search=s) # doctest: +SKIP
|
|
809
801
|
... print(isinstance(results, ResultSet)) # doctest: +SKIP
|
|
810
802
|
... print(len(results)) # doctest: +SKIP
|
|
811
803
|
True
|
|
812
804
|
1000
|
|
813
|
-
|
|
814
805
|
>>> nos = Nosible(nosible_api_key="test|xyz") # doctest: +SKIP
|
|
815
806
|
>>> nos.bulk_search() # doctest: +SKIP
|
|
816
807
|
Traceback (most recent call last):
|
|
@@ -818,20 +809,18 @@ class Nosible:
|
|
|
818
809
|
TypeError: Either question or search must be specified
|
|
819
810
|
|
|
820
811
|
>>> nos = Nosible(nosible_api_key="test|xyz") # doctest: +SKIP
|
|
821
|
-
>>> nos.bulk_search(question=
|
|
812
|
+
>>> nos.bulk_search(question=_get_question(), search=Search(question=_get_question())) # doctest: +SKIP
|
|
822
813
|
Traceback (most recent call last):
|
|
823
814
|
...
|
|
824
815
|
TypeError: Question and search cannot be both specified
|
|
825
|
-
|
|
826
816
|
>>> nos = Nosible(nosible_api_key="test|xyz") # doctest: +SKIP
|
|
827
|
-
>>> nos.bulk_search(question=
|
|
817
|
+
>>> nos.bulk_search(question=_get_question(), n_results=100) # doctest: +SKIP
|
|
828
818
|
Traceback (most recent call last):
|
|
829
819
|
...
|
|
830
|
-
ValueError: Bulk search must have at least
|
|
831
|
-
|
|
820
|
+
ValueError: Bulk search must have at least 1000 results per query; use search() for smaller result sets.
|
|
832
821
|
>>> nos = Nosible(nosible_api_key="test|xyz") # doctest: +SKIP
|
|
833
|
-
>>> nos.bulk_search(question=
|
|
834
|
-
Traceback (most recent call last):
|
|
822
|
+
>>> nos.bulk_search(question=_get_question(), n_results=10001) # doctest: +SKIP
|
|
823
|
+
Traceback (most recent call last):
|
|
835
824
|
...
|
|
836
825
|
ValueError: Bulk search cannot have more than 10000 results per query.
|
|
837
826
|
"""
|
|
@@ -854,8 +843,11 @@ class Nosible:
|
|
|
854
843
|
n_probes = search.n_probes if search.n_probes is not None else n_probes
|
|
855
844
|
n_contextify = search.n_contextify if search.n_contextify is not None else n_contextify
|
|
856
845
|
algorithm = search.algorithm if search.algorithm is not None else algorithm
|
|
857
|
-
autogenerate_expansions =
|
|
846
|
+
autogenerate_expansions = (
|
|
847
|
+
search.autogenerate_expansions
|
|
848
|
+
if search.autogenerate_expansions is not None
|
|
858
849
|
else autogenerate_expansions
|
|
850
|
+
)
|
|
859
851
|
publish_start = search.publish_start if search.publish_start is not None else publish_start
|
|
860
852
|
publish_end = search.publish_end if search.publish_end is not None else publish_end
|
|
861
853
|
include_netlocs = search.include_netlocs if search.include_netlocs is not None else include_netlocs
|
|
@@ -959,13 +951,13 @@ class Nosible:
|
|
|
959
951
|
|
|
960
952
|
Parameters
|
|
961
953
|
----------
|
|
962
|
-
html : str
|
|
954
|
+
html : str
|
|
963
955
|
Raw HTML to process instead of fetching.
|
|
964
|
-
recrawl : bool
|
|
956
|
+
recrawl : bool
|
|
965
957
|
If True, force a fresh crawl.
|
|
966
|
-
render : bool
|
|
958
|
+
render : bool
|
|
967
959
|
If True, allow JavaScript rendering before extraction.
|
|
968
|
-
url : str
|
|
960
|
+
url : str
|
|
969
961
|
The URL to fetch and parse.
|
|
970
962
|
|
|
971
963
|
Returns
|
|
@@ -986,26 +978,24 @@ class Nosible:
|
|
|
986
978
|
|
|
987
979
|
Examples
|
|
988
980
|
--------
|
|
989
|
-
>>> from nosible import Nosible
|
|
990
|
-
>>> with Nosible() as nos:
|
|
991
|
-
... out = nos.visit(url="https://www.dailynewsegypt.com/2023/09/08/g20-and-its-summits/")
|
|
992
|
-
... print(isinstance(out,
|
|
993
|
-
... print(hasattr(out, "languages"))
|
|
994
|
-
... print(hasattr(out, "page"))
|
|
981
|
+
>>> from nosible import Nosible
|
|
982
|
+
>>> with Nosible() as nos:
|
|
983
|
+
... out = nos.visit(url="https://www.dailynewsegypt.com/2023/09/08/g20-and-its-summits/")
|
|
984
|
+
... print(isinstance(out, WebPageData))
|
|
985
|
+
... print(hasattr(out, "languages"))
|
|
986
|
+
... print(hasattr(out, "page"))
|
|
995
987
|
True
|
|
996
988
|
True
|
|
997
989
|
True
|
|
998
|
-
>>> with Nosible() as nos:
|
|
999
|
-
... out = nos.visit()
|
|
1000
|
-
... print(isinstance(out, type(WebPageData)))
|
|
1001
|
-
... print(hasattr(out, "languages"))
|
|
1002
|
-
... print(hasattr(out, "page")) # doctest: +
|
|
990
|
+
>>> with Nosible() as nos:
|
|
991
|
+
... out = nos.visit()
|
|
992
|
+
... print(isinstance(out, type(WebPageData)))
|
|
993
|
+
... print(hasattr(out, "languages"))
|
|
994
|
+
... print(hasattr(out, "page")) # doctest: +ELLIPSIS
|
|
1003
995
|
Traceback (most recent call last):
|
|
1004
996
|
...
|
|
1005
997
|
TypeError: URL must be provided
|
|
1006
998
|
"""
|
|
1007
|
-
|
|
1008
|
-
# self._enforce("visit")
|
|
1009
999
|
if url is None:
|
|
1010
1000
|
raise TypeError("URL must be provided")
|
|
1011
1001
|
response = self._post(
|
|
@@ -1018,7 +1008,7 @@ class Nosible:
|
|
|
1018
1008
|
self.logger.error(f"Failed to parse JSON from response: {e}")
|
|
1019
1009
|
raise ValueError("Invalid JSON response from server") from e
|
|
1020
1010
|
|
|
1021
|
-
if data == {
|
|
1011
|
+
if data == {"message": "Sorry, the URL could not be fetched."}:
|
|
1022
1012
|
raise ValueError("The URL could not be found.")
|
|
1023
1013
|
|
|
1024
1014
|
if "response" not in data:
|
|
@@ -1033,7 +1023,7 @@ class Nosible:
|
|
|
1033
1023
|
metadata=response_data.get("metadata"),
|
|
1034
1024
|
page=response_data.get("page"),
|
|
1035
1025
|
request=response_data.get("request"),
|
|
1036
|
-
snippets=response_data.get("snippets"),
|
|
1026
|
+
snippets=SnippetSet.from_dict(response_data.get("snippets", {})),
|
|
1037
1027
|
statistics=response_data.get("statistics"),
|
|
1038
1028
|
structured=response_data.get("structured"),
|
|
1039
1029
|
url_tree=response_data.get("url_tree"),
|
|
@@ -1097,10 +1087,6 @@ class Nosible:
|
|
|
1097
1087
|
|
|
1098
1088
|
Raises
|
|
1099
1089
|
------
|
|
1100
|
-
ValueError
|
|
1101
|
-
If the API returns an unexpected message.
|
|
1102
|
-
requests.HTTPError
|
|
1103
|
-
If the HTTP request fails.
|
|
1104
1090
|
|
|
1105
1091
|
Examples
|
|
1106
1092
|
--------
|
|
@@ -1125,6 +1111,7 @@ class Nosible:
|
|
|
1125
1111
|
return False
|
|
1126
1112
|
except:
|
|
1127
1113
|
return False
|
|
1114
|
+
|
|
1128
1115
|
def preflight(self, url: str = None) -> str:
|
|
1129
1116
|
"""
|
|
1130
1117
|
Run a preflight check for crawling/preprocessing on a URL.
|
|
@@ -1180,40 +1167,47 @@ class Nosible:
|
|
|
1180
1167
|
|
|
1181
1168
|
Examples
|
|
1182
1169
|
--------
|
|
1183
|
-
>>> nos = Nosible(nosible_api_key="test|xyz")
|
|
1184
|
-
>>> print(nos.get_rate_limits()) # doctest: +
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
|
1170
|
+
>>> nos = Nosible(nosible_api_key="test|xyz")
|
|
1171
|
+
>>> print(nos.get_rate_limits()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
|
|
1172
|
+
Below are the rate limits for all NOSIBLE plans.
|
|
1173
|
+
To upgrade your package, visit https://www.nosible.ai/products.
|
|
1174
|
+
<BLANKLINE>
|
|
1175
|
+
Unless otherwise indicated, bulk searches are limited to one-at-a-time per API key.
|
|
1176
|
+
<BLANKLINE>
|
|
1177
|
+
Free: (Your current plan)
|
|
1178
|
+
| Endpoint | Per Month | Per Minute | Effective CPM |
|
|
1179
|
+
| ----------- | --------- | ---------- | ------------- |
|
|
1180
|
+
| Search | 3000 | 60 | $4.00 |
|
|
1181
|
+
| URL Visits | 300 | 60 | $4.00 |
|
|
1182
|
+
| Bulk Search | 300 | 60 | $4.00 |
|
|
1183
|
+
<BLANKLINE>
|
|
1184
|
+
Basic ($49p/m):
|
|
1185
|
+
| Endpoint | Per Month | Per Minute | Effective CPM |
|
|
1194
1186
|
...
|
|
1195
1187
|
"""
|
|
1196
1188
|
# Human-friendly plan names
|
|
1197
1189
|
display = {
|
|
1198
1190
|
"test": "Free",
|
|
1199
|
-
"basic": "Basic",
|
|
1200
|
-
"pro": "Pro",
|
|
1201
|
-
"pro+": "Pro+",
|
|
1202
|
-
"bus": "Business",
|
|
1203
|
-
"bus+": "Business+",
|
|
1204
|
-
"ent": "Enterprise",
|
|
1191
|
+
"basic": "Basic ($49p/m)",
|
|
1192
|
+
"pro": "Pro ($199p/m)",
|
|
1193
|
+
"pro+": "Pro+ ($799p/m)",
|
|
1194
|
+
"bus": "Business ($3999p/m)",
|
|
1195
|
+
"bus+": "Business+ ($7499p/m)",
|
|
1196
|
+
"ent": "Enterprise ($14999p/m)",
|
|
1205
1197
|
}
|
|
1206
1198
|
|
|
1207
1199
|
# Human-friendly endpoint names
|
|
1208
|
-
endpoint_name = {"fast": "
|
|
1200
|
+
endpoint_name = {"fast": "Search", "visit": "URL Visits", "slow": "Bulk Search"}
|
|
1209
1201
|
|
|
1210
1202
|
out = [
|
|
1211
1203
|
"Below are the rate limits for all NOSIBLE plans.",
|
|
1212
1204
|
"To upgrade your package, visit https://www.nosible.ai/products.\n",
|
|
1205
|
+
"Unless otherwise indicated, bulk searches are limited to one-at-a-time per API key.\n"
|
|
1213
1206
|
]
|
|
1214
1207
|
|
|
1215
1208
|
user_plan = self._get_user_plan()
|
|
1216
1209
|
current_plan = ""
|
|
1210
|
+
cpm_counter = 4.0
|
|
1217
1211
|
|
|
1218
1212
|
# Preserve the order you care about:
|
|
1219
1213
|
for plan in ["test", "basic", "pro", "pro+", "bus", "bus+", "ent"]:
|
|
@@ -1222,17 +1216,19 @@ class Nosible:
|
|
|
1222
1216
|
current_plan = " (Your current plan)"
|
|
1223
1217
|
|
|
1224
1218
|
out.append(f"{name}:{current_plan}")
|
|
1225
|
-
out.append("| Endpoint | Per Month | Per
|
|
1226
|
-
out.append("| ----------- | --------- |
|
|
1219
|
+
out.append("| Endpoint | Per Month | Per Minute | Effective CPM |")
|
|
1220
|
+
out.append("| ----------- | --------- | ---------- | ------------- |")
|
|
1227
1221
|
|
|
1228
1222
|
for ep in ["fast", "visit", "slow"]:
|
|
1229
1223
|
buckets = PLAN_RATE_LIMITS[plan][ep]
|
|
1230
1224
|
# Find the per-minute and per-month (30-day) limits
|
|
1231
1225
|
minute = next(limit for limit, i in buckets if i == 60)
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1226
|
+
month = next(limit for limit, i in buckets if i == 24 * 3600 * 30)
|
|
1227
|
+
cpm = f"${cpm_counter:.2f}"
|
|
1228
|
+
|
|
1229
|
+
out.append(f"| {endpoint_name[ep]:<11} | {month:>9} | {minute:>10} | {cpm:>13} |")
|
|
1235
1230
|
|
|
1231
|
+
cpm_counter = cpm_counter - 0.5
|
|
1236
1232
|
out.append("") # Blank line
|
|
1237
1233
|
current_plan = ""
|
|
1238
1234
|
|
|
@@ -1243,10 +1239,6 @@ class Nosible:
|
|
|
1243
1239
|
Close the Nosible client, shutting down the HTTP session
|
|
1244
1240
|
and thread pool to release network and threading resources.
|
|
1245
1241
|
|
|
1246
|
-
Returns
|
|
1247
|
-
-------
|
|
1248
|
-
None
|
|
1249
|
-
|
|
1250
1242
|
Examples
|
|
1251
1243
|
--------
|
|
1252
1244
|
>>> from nosible import Nosible
|
|
@@ -1292,6 +1284,8 @@ class Nosible:
|
|
|
1292
1284
|
If the user API key is invalid.
|
|
1293
1285
|
ValueError
|
|
1294
1286
|
If the user hits their rate limit.
|
|
1287
|
+
ValueError
|
|
1288
|
+
If the user is making too many concurrent searches.
|
|
1295
1289
|
ValueError
|
|
1296
1290
|
If an unexpected error occurs.
|
|
1297
1291
|
ValueError
|
|
@@ -1325,6 +1319,8 @@ class Nosible:
|
|
|
1325
1319
|
raise ValueError("You made a bad request.")
|
|
1326
1320
|
if response.status_code == 429:
|
|
1327
1321
|
raise ValueError("You have hit your rate limit.")
|
|
1322
|
+
if response.status_code == 409:
|
|
1323
|
+
raise ValueError("Too many concurrent searches.")
|
|
1328
1324
|
if response.status_code == 500:
|
|
1329
1325
|
raise ValueError("An unexpected error occurred.")
|
|
1330
1326
|
if response.status_code == 502:
|
|
@@ -1354,16 +1350,16 @@ class Nosible:
|
|
|
1354
1350
|
|
|
1355
1351
|
Examples
|
|
1356
1352
|
--------
|
|
1357
|
-
>>> nos = Nosible(nosible_api_key="test+|xyz") # doctest: +
|
|
1353
|
+
>>> nos = Nosible(nosible_api_key="test+|xyz") # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
|
|
1358
1354
|
Traceback (most recent call last):
|
|
1359
1355
|
...
|
|
1360
|
-
ValueError: test+ is not a valid plan prefix
|
|
1356
|
+
ValueError: Your API key is not valid: test+ is not a valid plan prefix.
|
|
1361
1357
|
"""
|
|
1362
1358
|
# Split off anything after the first '|'
|
|
1363
1359
|
prefix = (self.nosible_api_key or "").split("|", 1)[0]
|
|
1364
1360
|
|
|
1365
|
-
# Map prefixes ->
|
|
1366
|
-
plans = {"test", "basic", "pro", "pro+", "bus", "bus+", "ent"}
|
|
1361
|
+
# Map prefixes -> plan names
|
|
1362
|
+
plans = {"test", "basic", "pro", "pro+", "bus", "bus+", "ent", "chat"}
|
|
1367
1363
|
|
|
1368
1364
|
if prefix not in plans:
|
|
1369
1365
|
raise ValueError(f"Your API key is not valid: {prefix} is not a valid plan prefix.")
|
|
@@ -1393,11 +1389,10 @@ class Nosible:
|
|
|
1393
1389
|
|
|
1394
1390
|
Examples
|
|
1395
1391
|
--------
|
|
1396
|
-
|
|
1397
|
-
>>>
|
|
1398
|
-
>>> nos =
|
|
1399
|
-
>>> nos.
|
|
1400
|
-
>>> nos._generate_expansions("anything") # doctest: +SKIP
|
|
1392
|
+
>>> from nosible import Nosible
|
|
1393
|
+
>>> nos = Nosible(llm_api_key=None)
|
|
1394
|
+
>>> nos.llm_api_key = None
|
|
1395
|
+
>>> nos._generate_expansions("anything") # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
|
|
1401
1396
|
Traceback (most recent call last):
|
|
1402
1397
|
...
|
|
1403
1398
|
ValueError: LLM API key is required for generating expansions.
|
|
@@ -1508,35 +1503,31 @@ class Nosible:
|
|
|
1508
1503
|
Parameters
|
|
1509
1504
|
----------
|
|
1510
1505
|
publish_start : str, optional
|
|
1511
|
-
|
|
1506
|
+
Start date for when the document was published (ISO format).
|
|
1512
1507
|
publish_end : str, optional
|
|
1513
|
-
|
|
1514
|
-
include_netlocs : list of str, optional
|
|
1515
|
-
Domains to whitelist.
|
|
1516
|
-
exclude_netlocs : list of str, optional
|
|
1517
|
-
Domains to blacklist.
|
|
1508
|
+
End date for when the document was published (ISO format).
|
|
1518
1509
|
visited_start : str, optional
|
|
1519
|
-
|
|
1510
|
+
Start date for when the document was visited by NOSIBLE (ISO format).
|
|
1520
1511
|
visited_end : str, optional
|
|
1521
|
-
|
|
1512
|
+
End date for when the document was visited by NOSIBLE (ISO format).
|
|
1522
1513
|
certain : bool, optional
|
|
1523
|
-
|
|
1524
|
-
include_languages : list of str, optional
|
|
1525
|
-
Languages to include (Max: 50).
|
|
1526
|
-
exclude_languages : list of str, optional
|
|
1527
|
-
Languages to exclude (Max: 50).
|
|
1514
|
+
Only include documents where we are 100% sure of the date.
|
|
1528
1515
|
include_netlocs : list of str, optional
|
|
1529
|
-
|
|
1516
|
+
List of netlocs (domains) to include in the search. (Max: 50)
|
|
1530
1517
|
exclude_netlocs : list of str, optional
|
|
1531
|
-
|
|
1518
|
+
List of netlocs (domains) to exclude from the search. (Max: 50)
|
|
1519
|
+
include_languages : list of str, optional
|
|
1520
|
+
Languages to include in the search. (Max: 50, ISO 639-1 language codes).
|
|
1521
|
+
exclude_languages : list of str, optional
|
|
1522
|
+
Languages to exclude from the search. (Max: 50, ISO 639-1 language codes).
|
|
1532
1523
|
include_companies : list of str, optional
|
|
1533
|
-
|
|
1524
|
+
Google KG IDs of public companies to require (Max: 50).
|
|
1534
1525
|
exclude_companies : list of str, optional
|
|
1535
|
-
|
|
1526
|
+
Google KG IDs of public companies to forbid (Max: 50).
|
|
1536
1527
|
include_docs : list of str, optional
|
|
1537
|
-
URL hashes of
|
|
1528
|
+
URL hashes of docs to include (Max: 50).
|
|
1538
1529
|
exclude_docs : list of str, optional
|
|
1539
|
-
URL hashes of
|
|
1530
|
+
URL hashes of docs to exclude (Max: 50).
|
|
1540
1531
|
|
|
1541
1532
|
Returns
|
|
1542
1533
|
-------
|
|
@@ -1545,20 +1536,19 @@ class Nosible:
|
|
|
1545
1536
|
|
|
1546
1537
|
Raises
|
|
1547
1538
|
------
|
|
1548
|
-
|
|
1549
1539
|
ValueError
|
|
1550
1540
|
If more than 50 items in a filter are given.
|
|
1551
1541
|
"""
|
|
1552
1542
|
# Validate list lengths
|
|
1553
1543
|
for name, lst in [
|
|
1554
|
-
(
|
|
1555
|
-
(
|
|
1556
|
-
(
|
|
1557
|
-
(
|
|
1558
|
-
(
|
|
1559
|
-
(
|
|
1560
|
-
(
|
|
1561
|
-
(
|
|
1544
|
+
("include_netlocs", include_netlocs),
|
|
1545
|
+
("exclude_netlocs", exclude_netlocs),
|
|
1546
|
+
("include_languages", include_languages),
|
|
1547
|
+
("exclude_languages", exclude_languages),
|
|
1548
|
+
("include_companies", include_companies),
|
|
1549
|
+
("exclude_companies", exclude_companies),
|
|
1550
|
+
("include_docs", include_docs),
|
|
1551
|
+
("exclude_docs", exclude_docs),
|
|
1562
1552
|
]:
|
|
1563
1553
|
if lst is not None and len(lst) > 50:
|
|
1564
1554
|
raise ValueError(f"Too many items for '{name}' filter ({len(lst)}); maximum allowed is 50.")
|
|
@@ -1595,10 +1585,10 @@ class Nosible:
|
|
|
1595
1585
|
variants = set()
|
|
1596
1586
|
for n in include_netlocs:
|
|
1597
1587
|
variants.add(n)
|
|
1598
|
-
if n.startswith(
|
|
1588
|
+
if n.startswith("www."):
|
|
1599
1589
|
variants.add(n[4:])
|
|
1600
1590
|
else:
|
|
1601
|
-
variants.add(
|
|
1591
|
+
variants.add("www." + n)
|
|
1602
1592
|
in_list = ", ".join(f"'{v}'" for v in sorted(variants))
|
|
1603
1593
|
clauses.append(f"netloc IN ({in_list})")
|
|
1604
1594
|
|
|
@@ -1607,10 +1597,10 @@ class Nosible:
|
|
|
1607
1597
|
variants = set()
|
|
1608
1598
|
for n in exclude_netlocs:
|
|
1609
1599
|
variants.add(n)
|
|
1610
|
-
if n.startswith(
|
|
1600
|
+
if n.startswith("www."):
|
|
1611
1601
|
variants.add(n[4:])
|
|
1612
1602
|
else:
|
|
1613
|
-
variants.add(
|
|
1603
|
+
variants.add("www." + n)
|
|
1614
1604
|
ex_list = ", ".join(f"'{v}'" for v in sorted(variants))
|
|
1615
1605
|
clauses.append(f"netloc NOT IN ({ex_list})")
|
|
1616
1606
|
|
|
@@ -1703,7 +1693,7 @@ class Nosible:
|
|
|
1703
1693
|
except Exception:
|
|
1704
1694
|
return False
|
|
1705
1695
|
|
|
1706
|
-
def __enter__(self):
|
|
1696
|
+
def __enter__(self) -> "Nosible":
|
|
1707
1697
|
"""
|
|
1708
1698
|
Enter the context manager, returning this client instance.
|
|
1709
1699
|
|
|
@@ -1714,32 +1704,42 @@ class Nosible:
|
|
|
1714
1704
|
"""
|
|
1715
1705
|
return self
|
|
1716
1706
|
|
|
1717
|
-
def __exit__(
|
|
1707
|
+
def __exit__(
|
|
1708
|
+
self,
|
|
1709
|
+
_exc_type: typing.Optional[type[BaseException]],
|
|
1710
|
+
_exc_val: typing.Optional[BaseException],
|
|
1711
|
+
_exc_tb: typing.Optional[types.TracebackType],
|
|
1712
|
+
) -> typing.Optional[bool]:
|
|
1718
1713
|
"""
|
|
1719
|
-
|
|
1714
|
+
Always clean up (self.close()), but let exceptions propagate.
|
|
1715
|
+
Return True only if you really want to suppress an exception.
|
|
1720
1716
|
|
|
1721
1717
|
Parameters
|
|
1722
1718
|
----------
|
|
1723
|
-
exc_type : type
|
|
1724
|
-
|
|
1725
|
-
|
|
1726
|
-
|
|
1727
|
-
|
|
1728
|
-
|
|
1719
|
+
_exc_type : Optional[type[BaseException]]
|
|
1720
|
+
The type of the exception raised, if any.
|
|
1721
|
+
_exc_val : Optional[BaseException]
|
|
1722
|
+
The exception instance, if any.
|
|
1723
|
+
_exc_tb : Optional[types.TracebackType]
|
|
1724
|
+
The traceback object, if any.
|
|
1729
1725
|
|
|
1730
1726
|
Returns
|
|
1731
1727
|
-------
|
|
1732
|
-
|
|
1728
|
+
Optional[bool]
|
|
1729
|
+
Always False, so any exception raised inside the with-block propagates.
|
|
1733
1730
|
"""
|
|
1734
|
-
|
|
1731
|
+
try:
|
|
1732
|
+
self.close()
|
|
1733
|
+
except Exception as cleanup_err:
|
|
1734
|
+
# Report the cleanup failure, but don't mask an exception raised inside the with-block
|
|
1735
|
+
print(f"Cleanup failed: {cleanup_err!r}")
|
|
1736
|
+
# Return False (or None) => exceptions inside the with-block are re-raised.
|
|
1737
|
+
return False
|
|
1735
1738
|
|
|
1736
1739
|
def __del__(self):
|
|
1737
1740
|
"""
|
|
1738
1741
|
Destructor to ensure resources are cleaned up if not explicitly closed.
|
|
1739
1742
|
|
|
1740
|
-
Returns
|
|
1741
|
-
-------
|
|
1742
|
-
None
|
|
1743
1743
|
"""
|
|
1744
1744
|
# Ensure close() runs even if the client was never closed explicitly
|
|
1745
1745
|
self.close()
|