nosible 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nosible/classes/result.py +122 -11
- nosible/classes/result_set.py +42 -29
- nosible/classes/search.py +82 -22
- nosible/classes/search_set.py +26 -26
- nosible/classes/snippet.py +2 -2
- nosible/classes/snippet_set.py +2 -2
- nosible/classes/web_page.py +11 -56
- nosible/nosible_client.py +360 -84
- {nosible-0.2.4.dist-info → nosible-0.2.6.dist-info}/METADATA +40 -41
- nosible-0.2.6.dist-info/RECORD +16 -0
- nosible-0.2.4.dist-info/RECORD +0 -16
- {nosible-0.2.4.dist-info → nosible-0.2.6.dist-info}/WHEEL +0 -0
- {nosible-0.2.4.dist-info → nosible-0.2.6.dist-info}/licenses/LICENSE +0 -0
- {nosible-0.2.4.dist-info → nosible-0.2.6.dist-info}/top_level.txt +0 -0
nosible/classes/result.py
CHANGED
|
@@ -10,6 +10,7 @@ if TYPE_CHECKING:
|
|
|
10
10
|
from nosible.classes.result_set import ResultSet
|
|
11
11
|
else:
|
|
12
12
|
ResultSet = None
|
|
13
|
+
import warnings
|
|
13
14
|
|
|
14
15
|
|
|
15
16
|
@dataclass(init=True, repr=True, eq=True, frozen=False)
|
|
@@ -39,6 +40,32 @@ class Result:
|
|
|
39
40
|
The language code of the content (e.g., 'en' for English).
|
|
40
41
|
similarity : float, optional
|
|
41
42
|
Similarity score with respect to a query or reference.
|
|
43
|
+
brand_safety : str, optional
|
|
44
|
+
Whether it is safe, sensitive, or unsafe to advertise on this content.
|
|
45
|
+
language : str, optional
|
|
46
|
+
Language code to use in search (ISO 639-1 language code).
|
|
47
|
+
continent : str, optional
|
|
48
|
+
Continent the results must come from (e.g., "Europe", "Asia").
|
|
49
|
+
region : str, optional
|
|
50
|
+
Region or subcontinent the results must come from (e.g., "Southern Africa", "Caribbean").
|
|
51
|
+
country : str, optional
|
|
52
|
+
Country the results must come from.
|
|
53
|
+
sector : str, optional
|
|
54
|
+
GICS Sector the results must relate to (e.g., "Energy", "Information Technology").
|
|
55
|
+
industry_group : str, optional
|
|
56
|
+
GICS Industry group the results must relate to (e.g., "Automobiles & Components", "Insurance").
|
|
57
|
+
industry : str, optional
|
|
58
|
+
GICS Industry the results must relate to (e.g., "Consumer Finance", "Passenger Airlines").
|
|
59
|
+
sub_industry : str, optional
|
|
60
|
+
GICS Sub-industry classification of the content's subject.
|
|
61
|
+
iab_tier_1 : str, optional
|
|
62
|
+
IAB Tier 1 category for the content.
|
|
63
|
+
iab_tier_2 : str, optional
|
|
64
|
+
IAB Tier 2 category for the content.
|
|
65
|
+
iab_tier_3 : str, optional
|
|
66
|
+
IAB Tier 3 category for the content.
|
|
67
|
+
iab_tier_4 : str, optional
|
|
68
|
+
IAB Tier 4 category for the content.
|
|
42
69
|
|
|
43
70
|
Examples
|
|
44
71
|
--------
|
|
@@ -84,6 +111,30 @@ class Result:
|
|
|
84
111
|
"""Similarity score with respect to a query or reference."""
|
|
85
112
|
url_hash: str | None = None
|
|
86
113
|
"""A hash of the URL for quick comparisons."""
|
|
114
|
+
brand_safety: str | None = None
|
|
115
|
+
"""Whether it is safe, sensitive, or unsafe to advertise on this content."""
|
|
116
|
+
continent: str | None = None
|
|
117
|
+
"""Continent the results must come from (e.g., "Europe", "Asia")."""
|
|
118
|
+
region: str | None = None
|
|
119
|
+
"""Region or subcontinent the results must come from (e.g., "Southern Africa", "Caribbean")."""
|
|
120
|
+
country: str | None = None
|
|
121
|
+
"""Country the results must come from."""
|
|
122
|
+
sector: str | None = None
|
|
123
|
+
"""GICS Sector the results must relate to (e.g., "Energy", "Information Technology")."""
|
|
124
|
+
industry_group: str | None = None
|
|
125
|
+
"""GICS Industry group the results must relate to (e.g., "Automobiles & Components", "Insurance")."""
|
|
126
|
+
industry: str | None = None
|
|
127
|
+
"""GICS Industry the results must relate to (e.g., "Consumer Finance", "Passenger Airlines")."""
|
|
128
|
+
sub_industry: str | None = None
|
|
129
|
+
"""GICS Sub-industry classification of the content's subject."""
|
|
130
|
+
iab_tier_1: str | None = None
|
|
131
|
+
"""IAB Tier 1 category for the content."""
|
|
132
|
+
iab_tier_2: str | None = None
|
|
133
|
+
"""IAB Tier 2 category for the content."""
|
|
134
|
+
iab_tier_3: str | None = None
|
|
135
|
+
"""IAB Tier 3 category for the content."""
|
|
136
|
+
iab_tier_4: str | None = None
|
|
137
|
+
"""IAB Tier 4 category for the content."""
|
|
87
138
|
|
|
88
139
|
def __str__(self) -> str:
|
|
89
140
|
"""
|
|
@@ -335,17 +386,30 @@ class Result:
|
|
|
335
386
|
algorithm: str = "hybrid-3",
|
|
336
387
|
publish_start: str = None,
|
|
337
388
|
publish_end: str = None,
|
|
338
|
-
include_netlocs: list = None,
|
|
339
|
-
exclude_netlocs: list = None,
|
|
340
389
|
visited_start: str = None,
|
|
341
390
|
visited_end: str = None,
|
|
342
391
|
certain: bool = None,
|
|
343
|
-
|
|
344
|
-
|
|
392
|
+
include_netlocs: list = None,
|
|
393
|
+
exclude_netlocs: list = None,
|
|
345
394
|
include_companies: list = None,
|
|
346
395
|
exclude_companies: list = None,
|
|
347
396
|
include_docs: list = None,
|
|
348
397
|
exclude_docs: list = None,
|
|
398
|
+
brand_safety: str = None,
|
|
399
|
+
language: str = None,
|
|
400
|
+
continent: str = None,
|
|
401
|
+
region: str = None,
|
|
402
|
+
country: str = None,
|
|
403
|
+
sector: str = None,
|
|
404
|
+
industry_group: str = None,
|
|
405
|
+
industry: str = None,
|
|
406
|
+
sub_industry: str = None,
|
|
407
|
+
iab_tier_1: str = None,
|
|
408
|
+
iab_tier_2: str = None,
|
|
409
|
+
iab_tier_3: str = None,
|
|
410
|
+
iab_tier_4: str = None,
|
|
411
|
+
instruction: str = None,
|
|
412
|
+
*args, **kwargs
|
|
349
413
|
) -> ResultSet:
|
|
350
414
|
"""
|
|
351
415
|
Find similar search results based on the content or metadata of this Result.
|
|
@@ -381,10 +445,6 @@ class Result:
|
|
|
381
445
|
List of netlocs (domains) to include in the search. (Max: 50)
|
|
382
446
|
exclude_netlocs : list of str, optional
|
|
383
447
|
List of netlocs (domains) to exclude in the search. (Max: 50)
|
|
384
|
-
include_languages : list of str, optional
|
|
385
|
-
Languages to include in the search. (Max: 50, ISO 639-1 language codes).
|
|
386
|
-
exclude_languages : list of str, optional
|
|
387
|
-
Language codes to exclude in the search (Max: 50, ISO 639-1 language codes).
|
|
388
448
|
include_companies : list of str, optional
|
|
389
449
|
Google KG IDs of public companies to require (Max: 50).
|
|
390
450
|
exclude_companies : list of str, optional
|
|
@@ -393,6 +453,34 @@ class Result:
|
|
|
393
453
|
URL hashes of docs to include (Max: 50).
|
|
394
454
|
exclude_docs : list of str, optional
|
|
395
455
|
URL hashes of docs to exclude (Max: 50).
|
|
456
|
+
brand_safety : str, optional
|
|
457
|
+
Whether it is safe, sensitive, or unsafe to advertise on this content.
|
|
458
|
+
language : str, optional
|
|
459
|
+
Language code to use in search (ISO 639-1 language code).
|
|
460
|
+
continent : str, optional
|
|
461
|
+
Continent the results must come from (e.g., "Europe", "Asia").
|
|
462
|
+
region : str, optional
|
|
463
|
+
Region or subcontinent the results must come from (e.g., "Southern Africa", "Caribbean").
|
|
464
|
+
country : str, optional
|
|
465
|
+
Country the results must come from.
|
|
466
|
+
sector : str, optional
|
|
467
|
+
GICS Sector the results must relate to (e.g., "Energy", "Information Technology").
|
|
468
|
+
industry_group : str, optional
|
|
469
|
+
GICS Industry group the results must relate to (e.g., "Automobiles & Components", "Insurance").
|
|
470
|
+
industry : str, optional
|
|
471
|
+
GICS Industry the results must relate to (e.g., "Consumer Finance", "Passenger Airlines").
|
|
472
|
+
sub_industry : str, optional
|
|
473
|
+
GICS Sub-industry classification of the content's subject.
|
|
474
|
+
iab_tier_1 : str, optional
|
|
475
|
+
IAB Tier 1 category for the content.
|
|
476
|
+
iab_tier_2 : str, optional
|
|
477
|
+
IAB Tier 2 category for the content.
|
|
478
|
+
iab_tier_3 : str, optional
|
|
479
|
+
IAB Tier 3 category for the content.
|
|
480
|
+
iab_tier_4 : str, optional
|
|
481
|
+
IAB Tier 4 category for the content.
|
|
482
|
+
instruction : str, optional
|
|
483
|
+
Instruction to use with the search query.
|
|
396
484
|
|
|
397
485
|
Returns
|
|
398
486
|
-------
|
|
@@ -413,6 +501,17 @@ class Result:
|
|
|
413
501
|
... result = Result(url="https://example.com", title="Example Domain") # doctest: +SKIP
|
|
414
502
|
... similar_results = result.similar(client=nos) # doctest: +SKIP
|
|
415
503
|
"""
|
|
504
|
+
if "include_languages" in kwargs:
|
|
505
|
+
warnings.warn(
|
|
506
|
+
"The 'include_languages' parameter is deprecated and will be removed in a future release. "
|
|
507
|
+
"Please use the parameter 'language' instead.",
|
|
508
|
+
)
|
|
509
|
+
if "exclude_languages" in kwargs:
|
|
510
|
+
warnings.warn(
|
|
511
|
+
"The 'exclude_languages' parameter is deprecated and will be removed in a future release. "
|
|
512
|
+
"Please use the parameter 'language' instead.",
|
|
513
|
+
)
|
|
514
|
+
|
|
416
515
|
if client is None:
|
|
417
516
|
raise ValueError("A Nosible client instance must be provided as 'client'.")
|
|
418
517
|
if not self.url:
|
|
@@ -436,14 +535,26 @@ class Result:
|
|
|
436
535
|
visited_start=visited_start,
|
|
437
536
|
visited_end=visited_end,
|
|
438
537
|
certain=certain,
|
|
439
|
-
include_languages=include_languages,
|
|
440
|
-
exclude_languages=exclude_languages,
|
|
441
538
|
include_companies=include_companies,
|
|
442
539
|
exclude_companies=exclude_companies,
|
|
443
540
|
include_docs=include_docs,
|
|
444
541
|
exclude_docs=exclude_docs,
|
|
542
|
+
brand_safety=brand_safety,
|
|
543
|
+
language=language,
|
|
544
|
+
continent=continent,
|
|
545
|
+
region=region,
|
|
546
|
+
country=country,
|
|
547
|
+
sector=sector,
|
|
548
|
+
industry_group=industry_group,
|
|
549
|
+
industry=industry,
|
|
550
|
+
sub_industry=sub_industry,
|
|
551
|
+
iab_tier_1=iab_tier_1,
|
|
552
|
+
iab_tier_2=iab_tier_2,
|
|
553
|
+
iab_tier_3=iab_tier_3,
|
|
554
|
+
iab_tier_4=iab_tier_4,
|
|
555
|
+
instruction=instruction,
|
|
445
556
|
)
|
|
446
|
-
return client.
|
|
557
|
+
return client.fast_search(search=s)
|
|
447
558
|
except Exception as e:
|
|
448
559
|
raise RuntimeError(f"Failed to find similar results for title '{self.title}': {e}") from e
|
|
449
560
|
|
nosible/classes/result_set.py
CHANGED
|
@@ -57,6 +57,19 @@ class ResultSet(Iterator[Result]):
|
|
|
57
57
|
"language",
|
|
58
58
|
"similarity",
|
|
59
59
|
"url_hash",
|
|
60
|
+
"brand_safety",
|
|
61
|
+
"language",
|
|
62
|
+
"continent",
|
|
63
|
+
"region",
|
|
64
|
+
"country",
|
|
65
|
+
"sector",
|
|
66
|
+
"industry_group",
|
|
67
|
+
"industry",
|
|
68
|
+
"sub_industry",
|
|
69
|
+
"iab_tier_1",
|
|
70
|
+
"iab_tier_2",
|
|
71
|
+
"iab_tier_3",
|
|
72
|
+
"iab_tier_4",
|
|
60
73
|
]
|
|
61
74
|
|
|
62
75
|
results: list[Result] = field(default_factory=list)
|
|
@@ -310,7 +323,7 @@ class ResultSet(Iterator[Result]):
|
|
|
310
323
|
>>> from nosible import Nosible
|
|
311
324
|
>>> from nosible import ResultSet
|
|
312
325
|
>>> with Nosible() as nos:
|
|
313
|
-
... results: ResultSet = nos.
|
|
326
|
+
... results: ResultSet = nos.fast_search(question="Aircraft Manufacturing", n_results=10)
|
|
314
327
|
... inner = results.find_in_search_results("embraer", top_k=5)
|
|
315
328
|
>>> print(f"Top {len(inner)} hits for “embraer” within the initial results:")
|
|
316
329
|
Top 5 hits for “embraer” within the initial results:
|
|
@@ -409,7 +422,7 @@ class ResultSet(Iterator[Result]):
|
|
|
409
422
|
>>> from nosible import Nosible
|
|
410
423
|
>>> from nosible import Result, ResultSet
|
|
411
424
|
>>> with Nosible() as nos:
|
|
412
|
-
... results: ResultSet = nos.
|
|
425
|
+
... results: ResultSet = nos.fast_search(question="Aircraft Manufacturing", n_results=100)
|
|
413
426
|
... summary = results.analyze(by="language")
|
|
414
427
|
... print(summary)
|
|
415
428
|
{'en': 100}
|
|
@@ -507,7 +520,7 @@ class ResultSet(Iterator[Result]):
|
|
|
507
520
|
return {str(row[0]): int(row[1]) for row in sorted_vc.rows()}
|
|
508
521
|
|
|
509
522
|
# Conversion methods
|
|
510
|
-
def
|
|
523
|
+
def write_csv(self, file_path: str | None = None, delimiter: str = ",", encoding: str = "utf-8") -> str:
|
|
511
524
|
"""
|
|
512
525
|
Serialize the search results to a CSV file.
|
|
513
526
|
|
|
@@ -542,7 +555,7 @@ class ResultSet(Iterator[Result]):
|
|
|
542
555
|
... Result(url="https://openai.com", title="OpenAI"),
|
|
543
556
|
... ]
|
|
544
557
|
>>> search_results = ResultSet(results)
|
|
545
|
-
>>> path = search_results.
|
|
558
|
+
>>> path = search_results.write_csv("out.csv")
|
|
546
559
|
>>> path.endswith(".csv")
|
|
547
560
|
True
|
|
548
561
|
"""
|
|
@@ -622,7 +635,7 @@ class ResultSet(Iterator[Result]):
|
|
|
622
635
|
except Exception as e:
|
|
623
636
|
raise RuntimeError(f"Failed to convert search results to Pandas DataFrame: {e}") from e
|
|
624
637
|
|
|
625
|
-
def
|
|
638
|
+
def write_json(self, file_path: str | None = None) -> str | bytes:
|
|
626
639
|
"""
|
|
627
640
|
Serialize the search results to a JSON string and optionally write to disk.
|
|
628
641
|
|
|
@@ -648,11 +661,11 @@ class ResultSet(Iterator[Result]):
|
|
|
648
661
|
... Result(url="https://openai.com", title="OpenAI"),
|
|
649
662
|
... ]
|
|
650
663
|
>>> search_results = ResultSet(results)
|
|
651
|
-
>>> json_str = search_results.
|
|
664
|
+
>>> json_str = search_results.write_json()
|
|
652
665
|
>>> isinstance(json_str, str)
|
|
653
666
|
True
|
|
654
667
|
>>> # Optionally write to file
|
|
655
|
-
>>> path = search_results.
|
|
668
|
+
>>> path = search_results.write_json(file_path="results.json")
|
|
656
669
|
>>> path.endswith(".json")
|
|
657
670
|
True
|
|
658
671
|
"""
|
|
@@ -747,7 +760,7 @@ class ResultSet(Iterator[Result]):
|
|
|
747
760
|
except Exception as e:
|
|
748
761
|
raise RuntimeError(f"Failed to convert results to dict: {e}") from e
|
|
749
762
|
|
|
750
|
-
def
|
|
763
|
+
def write_ndjson(self, file_path: str | None = None) -> str:
|
|
751
764
|
"""
|
|
752
765
|
Serialize search results to newline-delimited JSON (NDJSON) format.
|
|
753
766
|
|
|
@@ -777,11 +790,11 @@ class ResultSet(Iterator[Result]):
|
|
|
777
790
|
... Result(url="https://openai.com", title="OpenAI"),
|
|
778
791
|
... ]
|
|
779
792
|
>>> search_results = ResultSet(results)
|
|
780
|
-
>>> ndjson_str = search_results.
|
|
793
|
+
>>> ndjson_str = search_results.write_ndjson()
|
|
781
794
|
>>> print(ndjson_str.splitlines()[0]) # doctest: +ELLIPSIS
|
|
782
795
|
{"url":"https://example.com","title":"Example Domain","description":null,"netloc":null..."url_hash":null}
|
|
783
796
|
>>> # Optionally write to file
|
|
784
|
-
>>> path = search_results.
|
|
797
|
+
>>> path = search_results.write_ndjson(file_path="results.ndjson")
|
|
785
798
|
>>> path.endswith(".ndjson")
|
|
786
799
|
True
|
|
787
800
|
"""
|
|
@@ -802,7 +815,7 @@ class ResultSet(Iterator[Result]):
|
|
|
802
815
|
raise RuntimeError(f"Failed to write NDJSON to '{file_path}': {e}") from e
|
|
803
816
|
return "\n".join(ndjson_lines) + "\n"
|
|
804
817
|
|
|
805
|
-
def
|
|
818
|
+
def write_parquet(self, file_path: str | None = None) -> str:
|
|
806
819
|
"""
|
|
807
820
|
Serialize the search results to Apache Parquet format using Polars.
|
|
808
821
|
|
|
@@ -832,7 +845,7 @@ class ResultSet(Iterator[Result]):
|
|
|
832
845
|
... Result(url="https://openai.com", title="OpenAI"),
|
|
833
846
|
... ]
|
|
834
847
|
>>> search_results = ResultSet(results)
|
|
835
|
-
>>> parquet_path = search_results.
|
|
848
|
+
>>> parquet_path = search_results.write_parquet("my_results.parquet")
|
|
836
849
|
>>> parquet_path.endswith(".parquet")
|
|
837
850
|
True
|
|
838
851
|
"""
|
|
@@ -843,7 +856,7 @@ class ResultSet(Iterator[Result]):
|
|
|
843
856
|
raise RuntimeError(f"Failed to write Parquet to '{out}': {e}") from e
|
|
844
857
|
return out
|
|
845
858
|
|
|
846
|
-
def
|
|
859
|
+
def write_ipc(self, file_path: str | None = None) -> str:
|
|
847
860
|
"""
|
|
848
861
|
Serialize the search results to Apache Arrow IPC (Feather) format using Polars.
|
|
849
862
|
|
|
@@ -873,7 +886,7 @@ class ResultSet(Iterator[Result]):
|
|
|
873
886
|
... Result(url="https://openai.com", title="OpenAI"),
|
|
874
887
|
... ]
|
|
875
888
|
>>> search_results = ResultSet(results)
|
|
876
|
-
>>> arrow_path = search_results.
|
|
889
|
+
>>> arrow_path = search_results.write_ipc("my_results.arrow")
|
|
877
890
|
>>> arrow_path.endswith(".arrow")
|
|
878
891
|
True
|
|
879
892
|
"""
|
|
@@ -884,7 +897,7 @@ class ResultSet(Iterator[Result]):
|
|
|
884
897
|
raise RuntimeError(f"Failed to write Arrow IPC to '{out}': {e}") from e
|
|
885
898
|
return out
|
|
886
899
|
|
|
887
|
-
def
|
|
900
|
+
def write_duckdb(self, file_path: str | None = None, table_name: str = "results") -> str:
|
|
888
901
|
"""
|
|
889
902
|
Serialize the search results to a DuckDB database file and table.
|
|
890
903
|
|
|
@@ -917,7 +930,7 @@ class ResultSet(Iterator[Result]):
|
|
|
917
930
|
... Result(url="https://openai.com", title="OpenAI"),
|
|
918
931
|
... ]
|
|
919
932
|
>>> search_results = ResultSet(results)
|
|
920
|
-
>>> db_path = search_results.
|
|
933
|
+
>>> db_path = search_results.write_duckdb(file_path="my_results.duckdb", table_name="search_table")
|
|
921
934
|
>>> db_path.endswith(".duckdb")
|
|
922
935
|
True
|
|
923
936
|
"""
|
|
@@ -939,7 +952,7 @@ class ResultSet(Iterator[Result]):
|
|
|
939
952
|
|
|
940
953
|
# Loading from disk
|
|
941
954
|
@classmethod
|
|
942
|
-
def
|
|
955
|
+
def read_csv(cls, file_path: str) -> ResultSet:
|
|
943
956
|
"""
|
|
944
957
|
Load search results from a CSV file using Polars.
|
|
945
958
|
|
|
@@ -971,7 +984,7 @@ class ResultSet(Iterator[Result]):
|
|
|
971
984
|
... {"url": "https://openai.com", "title": "OpenAI", "description": "AI research"},
|
|
972
985
|
... ]
|
|
973
986
|
... ).write_csv("data.csv")
|
|
974
|
-
>>> results = ResultSet.
|
|
987
|
+
>>> results = ResultSet.read_csv("data.csv")
|
|
975
988
|
>>> isinstance(results, ResultSet)
|
|
976
989
|
True
|
|
977
990
|
>>> len(results)
|
|
@@ -1007,7 +1020,7 @@ class ResultSet(Iterator[Result]):
|
|
|
1007
1020
|
return cls(results)
|
|
1008
1021
|
|
|
1009
1022
|
@classmethod
|
|
1010
|
-
def
|
|
1023
|
+
def read_json(cls, file_path: str) -> ResultSet:
|
|
1011
1024
|
"""
|
|
1012
1025
|
Load search results from a JSON file.
|
|
1013
1026
|
|
|
@@ -1039,7 +1052,7 @@ class ResultSet(Iterator[Result]):
|
|
|
1039
1052
|
... ],
|
|
1040
1053
|
... f,
|
|
1041
1054
|
... )
|
|
1042
|
-
>>> results = ResultSet.
|
|
1055
|
+
>>> results = ResultSet.read_json("data.json")
|
|
1043
1056
|
>>> isinstance(results, ResultSet)
|
|
1044
1057
|
True
|
|
1045
1058
|
>>> len(results)
|
|
@@ -1147,7 +1160,7 @@ class ResultSet(Iterator[Result]):
|
|
|
1147
1160
|
return cls.from_polars(pl_df)
|
|
1148
1161
|
|
|
1149
1162
|
@classmethod
|
|
1150
|
-
def
|
|
1163
|
+
def read_ndjson(cls, file_path: str) -> ResultSet:
|
|
1151
1164
|
"""
|
|
1152
1165
|
Load search results from a newline-delimited JSON (NDJSON) file.
|
|
1153
1166
|
|
|
@@ -1181,7 +1194,7 @@ class ResultSet(Iterator[Result]):
|
|
|
1181
1194
|
... f.write('{"url": "https://openai.com", "title": "OpenAI"}\\n')
|
|
1182
1195
|
58
|
|
1183
1196
|
49
|
|
1184
|
-
>>> results = ResultSet.
|
|
1197
|
+
>>> results = ResultSet.read_ndjson("data.ndjson")
|
|
1185
1198
|
>>> isinstance(results, ResultSet)
|
|
1186
1199
|
True
|
|
1187
1200
|
>>> len(results)
|
|
@@ -1219,7 +1232,7 @@ class ResultSet(Iterator[Result]):
|
|
|
1219
1232
|
return cls(results)
|
|
1220
1233
|
|
|
1221
1234
|
@classmethod
|
|
1222
|
-
def
|
|
1235
|
+
def read_parquet(cls, file_path: str) -> ResultSet:
|
|
1223
1236
|
"""
|
|
1224
1237
|
Load search results from a Parquet file using Polars.
|
|
1225
1238
|
|
|
@@ -1250,7 +1263,7 @@ class ResultSet(Iterator[Result]):
|
|
|
1250
1263
|
... ]
|
|
1251
1264
|
... )
|
|
1252
1265
|
>>> df.write_parquet("sample.parquet")
|
|
1253
|
-
>>> results = ResultSet.
|
|
1266
|
+
>>> results = ResultSet.read_parquet("sample.parquet")
|
|
1254
1267
|
>>> isinstance(results, ResultSet)
|
|
1255
1268
|
True
|
|
1256
1269
|
>>> len(results)
|
|
@@ -1270,7 +1283,7 @@ class ResultSet(Iterator[Result]):
|
|
|
1270
1283
|
raise RuntimeError(f"Failed to create ResultSet from Parquet data in '{file_path}': {e}") from e
|
|
1271
1284
|
|
|
1272
1285
|
@classmethod
|
|
1273
|
-
def
|
|
1286
|
+
def read_ipc(cls, file_path: str) -> ResultSet:
|
|
1274
1287
|
"""
|
|
1275
1288
|
Load search results from an Apache Arrow IPC (Feather) file using Polars.
|
|
1276
1289
|
|
|
@@ -1301,7 +1314,7 @@ class ResultSet(Iterator[Result]):
|
|
|
1301
1314
|
... ]
|
|
1302
1315
|
... )
|
|
1303
1316
|
>>> df.write_ipc("sample.arrow")
|
|
1304
|
-
>>> results = ResultSet.
|
|
1317
|
+
>>> results = ResultSet.read_ipc("sample.arrow")
|
|
1305
1318
|
>>> isinstance(results, ResultSet)
|
|
1306
1319
|
True
|
|
1307
1320
|
>>> len(results)
|
|
@@ -1321,7 +1334,7 @@ class ResultSet(Iterator[Result]):
|
|
|
1321
1334
|
raise RuntimeError(f"Failed to create ResultSet from Arrow data in '{file_path}': {e}") from e
|
|
1322
1335
|
|
|
1323
1336
|
@classmethod
|
|
1324
|
-
def
|
|
1337
|
+
def read_duckdb(cls, file_path: str) -> ResultSet:
|
|
1325
1338
|
"""
|
|
1326
1339
|
Load search results from a DuckDB database file.
|
|
1327
1340
|
|
|
@@ -1354,8 +1367,8 @@ class ResultSet(Iterator[Result]):
|
|
|
1354
1367
|
... Result(url="https://openai.com", title="OpenAI", similarity=0.99),
|
|
1355
1368
|
... ]
|
|
1356
1369
|
>>> search_results = ResultSet(results)
|
|
1357
|
-
>>> db_path = search_results.
|
|
1358
|
-
>>> loaded = ResultSet.
|
|
1370
|
+
>>> db_path = search_results.write_duckdb(file_path="results.duckdb", table_name="search_results")
|
|
1371
|
+
>>> loaded = ResultSet.read_duckdb("results.duckdb")
|
|
1359
1372
|
>>> isinstance(loaded, ResultSet)
|
|
1360
1373
|
True
|
|
1361
1374
|
>>> len(loaded)
|
nosible/classes/search.py
CHANGED
|
@@ -55,10 +55,6 @@ class Search:
|
|
|
55
55
|
List of netlocs (domains) to include in the search. (Max 50)
|
|
56
56
|
exclude_netlocs : list of str, optional
|
|
57
57
|
List of netlocs (domains) to exclude in the search. (Max 50)
|
|
58
|
-
include_languages : list of str, optional
|
|
59
|
-
Languages to include in the search. (Max 50, ISO 639-1 language codes).
|
|
60
|
-
exclude_languages : list of str, optional
|
|
61
|
-
Language codes to exclude in the search (Max 50, ISO 639-1 language codes).
|
|
62
58
|
include_companies : list of str, optional
|
|
63
59
|
Google KG IDs of public companies to require (Max 50).
|
|
64
60
|
exclude_companies : list of str, optional
|
|
@@ -67,6 +63,34 @@ class Search:
|
|
|
67
63
|
URL hashes of docs to include (Max 50).
|
|
68
64
|
exclude_docs : list of str, optional
|
|
69
65
|
URL hashes of docs to exclude (Max 50).
|
|
66
|
+
brand_safety : str, optional
|
|
67
|
+
Whether it is safe, sensitive, or unsafe to advertise on this content.
|
|
68
|
+
language : str, optional
|
|
69
|
+
Language code to use in search (ISO 639-1 language code).
|
|
70
|
+
continent : str, optional
|
|
71
|
+
Continent the results must come from (e.g., "Europe", "Asia").
|
|
72
|
+
region : str, optional
|
|
73
|
+
Region or subcontinent the results must come from (e.g., "Southern Africa", "Caribbean").
|
|
74
|
+
country : str, optional
|
|
75
|
+
Country the results must come from.
|
|
76
|
+
sector : str, optional
|
|
77
|
+
GICS Sector the results must relate to (e.g., "Energy", "Information Technology").
|
|
78
|
+
industry_group : str, optional
|
|
79
|
+
GICS Industry group the results must relate to (e.g., "Automobiles & Components", "Insurance").
|
|
80
|
+
industry : str, optional
|
|
81
|
+
GICS Industry the results must relate to (e.g., "Consumer Finance", "Passenger Airlines").
|
|
82
|
+
sub_industry : str, optional
|
|
83
|
+
GICS Sub-industry classification of the content's subject.
|
|
84
|
+
iab_tier_1 : str, optional
|
|
85
|
+
IAB Tier 1 category for the content.
|
|
86
|
+
iab_tier_2 : str, optional
|
|
87
|
+
IAB Tier 2 category for the content.
|
|
88
|
+
iab_tier_3 : str, optional
|
|
89
|
+
IAB Tier 3 category for the content.
|
|
90
|
+
iab_tier_4 : str, optional
|
|
91
|
+
IAB Tier 4 category for the content.
|
|
92
|
+
instruction : str, optional
|
|
93
|
+
Instruction to use with the search query.
|
|
70
94
|
|
|
71
95
|
Examples
|
|
72
96
|
--------
|
|
@@ -75,7 +99,7 @@ class Search:
|
|
|
75
99
|
>>> search = Search(
|
|
76
100
|
... question="What is Python?",
|
|
77
101
|
... n_results=5,
|
|
78
|
-
...
|
|
102
|
+
... language="en",
|
|
79
103
|
... publish_start="2023-01-01",
|
|
80
104
|
... publish_end="2023-12-31",
|
|
81
105
|
... certain=True,
|
|
@@ -120,10 +144,6 @@ class Search:
|
|
|
120
144
|
"""List of netlocs (domains) to include in the search (Max 50)."""
|
|
121
145
|
exclude_netlocs: list[str] | None = None
|
|
122
146
|
"""List of netlocs (domains) to exclude in the search (Max 50)."""
|
|
123
|
-
include_languages: list[str] | None = None
|
|
124
|
-
"""Languages to include in the search. (Max 50)"""
|
|
125
|
-
exclude_languages: list[str] | None = None
|
|
126
|
-
"""Language codes to exclude in the search (Max 50)"""
|
|
127
147
|
include_companies: list[str] | None = None
|
|
128
148
|
"""Google KG IDs of public companies to require (Max 50)."""
|
|
129
149
|
exclude_companies: list[str] | None = None
|
|
@@ -132,6 +152,34 @@ class Search:
|
|
|
132
152
|
"""URL hashes of docs to include (Max 50)."""
|
|
133
153
|
exclude_docs: list[str] | None = None
|
|
134
154
|
"""URL hashes of docs to exclude (Max 50)."""
|
|
155
|
+
brand_safety: str | None = None
|
|
156
|
+
"""Whether it is safe, sensitive, or unsafe to advertise on this content."""
|
|
157
|
+
language: str | None = None
|
|
158
|
+
"""Language code to use in search (ISO 639-1 language code)."""
|
|
159
|
+
continent: str | None = None
|
|
160
|
+
"""Continent the results must come from (e.g., "Europe", "Asia")."""
|
|
161
|
+
region: str | None = None
|
|
162
|
+
"""Region or subcontinent the results must come from (e.g., "Southern Africa", "Caribbean")."""
|
|
163
|
+
country: str | None = None
|
|
164
|
+
"""Country the results must come from."""
|
|
165
|
+
sector: str | None = None
|
|
166
|
+
"""GICS Sector the results must relate to (e.g., "Energy", "Information Technology")."""
|
|
167
|
+
industry_group: str | None = None
|
|
168
|
+
"""GICS Industry group the results must relate to (e.g., "Automobiles & Components", "Insurance")."""
|
|
169
|
+
industry: str | None = None
|
|
170
|
+
"""GICS Industry the results must relate to (e.g., "Consumer Finance", "Passenger Airlines")."""
|
|
171
|
+
sub_industry: str | None = None
|
|
172
|
+
"""GICS Sub-industry classification of the content's subject."""
|
|
173
|
+
iab_tier_1: str | None = None
|
|
174
|
+
"""IAB Tier 1 category for the content."""
|
|
175
|
+
iab_tier_2: str | None = None
|
|
176
|
+
"""IAB Tier 2 category for the content."""
|
|
177
|
+
iab_tier_3: str | None = None
|
|
178
|
+
"""IAB Tier 3 category for the content."""
|
|
179
|
+
iab_tier_4: str | None = None
|
|
180
|
+
"""IAB Tier 4 category for the content."""
|
|
181
|
+
instruction: str | None = None
|
|
182
|
+
"""Instruction to use with the search query."""
|
|
135
183
|
|
|
136
184
|
_FIELDS = [
|
|
137
185
|
"question",
|
|
@@ -147,17 +195,29 @@ class Search:
|
|
|
147
195
|
"autogenerate_expansions",
|
|
148
196
|
"publish_start",
|
|
149
197
|
"publish_end",
|
|
150
|
-
"include_netlocs",
|
|
151
|
-
"exclude_netlocs",
|
|
152
198
|
"visited_start",
|
|
153
199
|
"visited_end",
|
|
154
200
|
"certain",
|
|
155
|
-
"
|
|
156
|
-
"
|
|
201
|
+
"include_netlocs",
|
|
202
|
+
"exclude_netlocs",
|
|
157
203
|
"include_companies",
|
|
158
204
|
"exclude_companies",
|
|
159
205
|
"include_docs",
|
|
160
206
|
"exclude_docs",
|
|
207
|
+
"brand_safety",
|
|
208
|
+
"language",
|
|
209
|
+
"continent",
|
|
210
|
+
"region",
|
|
211
|
+
"country",
|
|
212
|
+
"sector",
|
|
213
|
+
"industry_group",
|
|
214
|
+
"industry",
|
|
215
|
+
"sub_industry",
|
|
216
|
+
"iab_tier_1",
|
|
217
|
+
"iab_tier_2",
|
|
218
|
+
"iab_tier_3",
|
|
219
|
+
"iab_tier_4",
|
|
220
|
+
"instruction",
|
|
161
221
|
]
|
|
162
222
|
|
|
163
223
|
def __str__(self) -> str:
|
|
@@ -220,7 +280,7 @@ class Search:
|
|
|
220
280
|
Examples
|
|
221
281
|
--------
|
|
222
282
|
>>> search = Search(
|
|
223
|
-
... question="What is Python?", n_results=5,
|
|
283
|
+
... question="What is Python?", n_results=5, language="en", publish_start="2023-01-01"
|
|
224
284
|
... )
|
|
225
285
|
>>> search.to_dict()["question"]
|
|
226
286
|
'What is Python?'
|
|
@@ -255,7 +315,7 @@ class Search:
|
|
|
255
315
|
"""
|
|
256
316
|
return cls(**{field: data.get(field) for field in cls._FIELDS})
|
|
257
317
|
|
|
258
|
-
def
|
|
318
|
+
def write_json(self, path: str) -> None:
|
|
259
319
|
"""
|
|
260
320
|
Save the current Search instance to a JSON file.
|
|
261
321
|
|
|
@@ -274,16 +334,16 @@ class Search:
|
|
|
274
334
|
Examples
|
|
275
335
|
--------
|
|
276
336
|
>>> search = Search(
|
|
277
|
-
... question="What is Python?", n_results=5,
|
|
337
|
+
... question="What is Python?", n_results=5, language="en", publish_start="2023-01-01"
|
|
278
338
|
... )
|
|
279
|
-
>>> search.
|
|
339
|
+
>>> search.write_json("search.json")
|
|
280
340
|
"""
|
|
281
341
|
data = json_dumps(self.to_dict())
|
|
282
342
|
with open(path, "w") as f:
|
|
283
343
|
f.write(data)
|
|
284
344
|
|
|
285
345
|
@classmethod
|
|
286
|
-
def
|
|
346
|
+
def read_json(cls, path: str) -> Search:
|
|
287
347
|
"""
|
|
288
348
|
Load a Search instance from a JSON file.
|
|
289
349
|
|
|
@@ -299,7 +359,7 @@ class Search:
|
|
|
299
359
|
Returns
|
|
300
360
|
-------
|
|
301
361
|
Search
|
|
302
|
-
An
|
|
362
|
+
An instance of the Search class initialized with the loaded parameters.
|
|
303
363
|
|
|
304
364
|
Raises
|
|
305
365
|
------
|
|
@@ -309,10 +369,10 @@ class Search:
|
|
|
309
369
|
Save and load a Search instance:
|
|
310
370
|
|
|
311
371
|
>>> search = Search(
|
|
312
|
-
... question="What is Python?", n_results=3,
|
|
372
|
+
... question="What is Python?", n_results=3, language="en", publish_start="2023-01-01"
|
|
313
373
|
... )
|
|
314
|
-
>>> search.
|
|
315
|
-
>>> loaded_search = Search.
|
|
374
|
+
>>> search.write_json("search.json")
|
|
375
|
+
>>> loaded_search = Search.read_json("search.json")
|
|
316
376
|
>>> print(loaded_search.question)
|
|
317
377
|
What is Python?
|
|
318
378
|
"""
|