nosible 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nosible/classes/result.py +118 -11
- nosible/classes/result_set.py +42 -29
- nosible/classes/search.py +77 -22
- nosible/classes/search_set.py +26 -26
- nosible/classes/snippet.py +2 -2
- nosible/classes/snippet_set.py +2 -2
- nosible/classes/web_page.py +11 -56
- nosible/nosible_client.py +341 -84
- {nosible-0.2.4.dist-info → nosible-0.2.5.dist-info}/METADATA +35 -36
- nosible-0.2.5.dist-info/RECORD +16 -0
- nosible-0.2.4.dist-info/RECORD +0 -16
- {nosible-0.2.4.dist-info → nosible-0.2.5.dist-info}/WHEEL +0 -0
- {nosible-0.2.4.dist-info → nosible-0.2.5.dist-info}/licenses/LICENSE +0 -0
- {nosible-0.2.4.dist-info → nosible-0.2.5.dist-info}/top_level.txt +0 -0
nosible/classes/result.py
CHANGED
|
@@ -10,6 +10,7 @@ if TYPE_CHECKING:
|
|
|
10
10
|
from nosible.classes.result_set import ResultSet
|
|
11
11
|
else:
|
|
12
12
|
ResultSet = None
|
|
13
|
+
import warnings
|
|
13
14
|
|
|
14
15
|
|
|
15
16
|
@dataclass(init=True, repr=True, eq=True, frozen=False)
|
|
@@ -39,6 +40,32 @@ class Result:
|
|
|
39
40
|
The language code of the content (e.g., 'en' for English).
|
|
40
41
|
similarity : float, optional
|
|
41
42
|
Similarity score with respect to a query or reference.
|
|
43
|
+
brand_safety : str, optional
|
|
44
|
+
Whether it is safe, sensitive, or unsafe to advertise on this content.
|
|
45
|
+
language : str, optional
|
|
46
|
+
Language code to use in search (ISO 639-1 language code).
|
|
47
|
+
continent : str, optional
|
|
48
|
+
Continent the results must come from (e.g., "Europe", "Asia").
|
|
49
|
+
region : str, optional
|
|
50
|
+
Region or subcontinent the results must come from (e.g., "Southern Africa", "Caribbean").
|
|
51
|
+
country : str, optional
|
|
52
|
+
Country the results must come from.
|
|
53
|
+
sector : str, optional
|
|
54
|
+
GICS Sector the results must relate to (e.g., "Energy", "Information Technology").
|
|
55
|
+
industry_group : str, optional
|
|
56
|
+
GICS Industry group the results must relate to (e.g., "Automobiles & Components", "Insurance").
|
|
57
|
+
industry : str, optional
|
|
58
|
+
GICS Industry the results must relate to (e.g., "Consumer Finance", "Passenger Airlines").
|
|
59
|
+
sub_industry : str, optional
|
|
60
|
+
GICS Sub-industry classification of the content's subject.
|
|
61
|
+
iab_tier_1 : str, optional
|
|
62
|
+
IAB Tier 1 category for the content.
|
|
63
|
+
iab_tier_2 : str, optional
|
|
64
|
+
IAB Tier 2 category for the content.
|
|
65
|
+
iab_tier_3 : str, optional
|
|
66
|
+
IAB Tier 3 category for the content.
|
|
67
|
+
iab_tier_4 : str, optional
|
|
68
|
+
IAB Tier 4 category for the content.
|
|
42
69
|
|
|
43
70
|
Examples
|
|
44
71
|
--------
|
|
@@ -84,6 +111,30 @@ class Result:
|
|
|
84
111
|
"""Similarity score with respect to a query or reference."""
|
|
85
112
|
url_hash: str | None = None
|
|
86
113
|
"""A hash of the URL for quick comparisons."""
|
|
114
|
+
brand_safety: str | None = None
|
|
115
|
+
"""Whether it is safe, sensitive, or unsafe to advertise on this content."""
|
|
116
|
+
continent: str | None = None
|
|
117
|
+
"""Continent the results must come from (e.g., "Europe", "Asia")."""
|
|
118
|
+
region: str | None = None
|
|
119
|
+
"""Region or subcontinent the results must come from (e.g., "Southern Africa", "Caribbean")."""
|
|
120
|
+
country: str | None = None
|
|
121
|
+
"""Country the results must come from."""
|
|
122
|
+
sector: str | None = None
|
|
123
|
+
"""GICS Sector the results must relate to (e.g., "Energy", "Information Technology")."""
|
|
124
|
+
industry_group: str | None = None
|
|
125
|
+
"""GICS Industry group the results must relate to (e.g., "Automobiles & Components", "Insurance")."""
|
|
126
|
+
industry: str | None = None
|
|
127
|
+
"""GICS Industry the results must relate to (e.g., "Consumer Finance", "Passenger Airlines")."""
|
|
128
|
+
sub_industry: str | None = None
|
|
129
|
+
"""GICS Sub-industry classification of the content's subject."""
|
|
130
|
+
iab_tier_1: str | None = None
|
|
131
|
+
"""IAB Tier 1 category for the content."""
|
|
132
|
+
iab_tier_2: str | None = None
|
|
133
|
+
"""IAB Tier 2 category for the content."""
|
|
134
|
+
iab_tier_3: str | None = None
|
|
135
|
+
"""IAB Tier 3 category for the content."""
|
|
136
|
+
iab_tier_4: str | None = None
|
|
137
|
+
"""IAB Tier 4 category for the content."""
|
|
87
138
|
|
|
88
139
|
def __str__(self) -> str:
|
|
89
140
|
"""
|
|
@@ -335,17 +386,29 @@ class Result:
|
|
|
335
386
|
algorithm: str = "hybrid-3",
|
|
336
387
|
publish_start: str = None,
|
|
337
388
|
publish_end: str = None,
|
|
338
|
-
include_netlocs: list = None,
|
|
339
|
-
exclude_netlocs: list = None,
|
|
340
389
|
visited_start: str = None,
|
|
341
390
|
visited_end: str = None,
|
|
342
391
|
certain: bool = None,
|
|
343
|
-
|
|
344
|
-
|
|
392
|
+
include_netlocs: list = None,
|
|
393
|
+
exclude_netlocs: list = None,
|
|
345
394
|
include_companies: list = None,
|
|
346
395
|
exclude_companies: list = None,
|
|
347
396
|
include_docs: list = None,
|
|
348
397
|
exclude_docs: list = None,
|
|
398
|
+
brand_safety: str = None,
|
|
399
|
+
language: str = None,
|
|
400
|
+
continent: str = None,
|
|
401
|
+
region: str = None,
|
|
402
|
+
country: str = None,
|
|
403
|
+
sector: str = None,
|
|
404
|
+
industry_group: str = None,
|
|
405
|
+
industry: str = None,
|
|
406
|
+
sub_industry: str = None,
|
|
407
|
+
iab_tier_1: str = None,
|
|
408
|
+
iab_tier_2: str = None,
|
|
409
|
+
iab_tier_3: str = None,
|
|
410
|
+
iab_tier_4: str = None,
|
|
411
|
+
*args, **kwargs
|
|
349
412
|
) -> ResultSet:
|
|
350
413
|
"""
|
|
351
414
|
Find similar search results based on the content or metadata of this Result.
|
|
@@ -381,10 +444,6 @@ class Result:
|
|
|
381
444
|
List of netlocs (domains) to include in the search. (Max: 50)
|
|
382
445
|
exclude_netlocs : list of str, optional
|
|
383
446
|
List of netlocs (domains) to exclude in the search. (Max: 50)
|
|
384
|
-
include_languages : list of str, optional
|
|
385
|
-
Languages to include in the search. (Max: 50, ISO 639-1 language codes).
|
|
386
|
-
exclude_languages : list of str, optional
|
|
387
|
-
Language codes to exclude in the search (Max: 50, ISO 639-1 language codes).
|
|
388
447
|
include_companies : list of str, optional
|
|
389
448
|
Google KG IDs of public companies to require (Max: 50).
|
|
390
449
|
exclude_companies : list of str, optional
|
|
@@ -393,6 +452,32 @@ class Result:
|
|
|
393
452
|
URL hashes of docs to include (Max: 50).
|
|
394
453
|
exclude_docs : list of str, optional
|
|
395
454
|
URL hashes of docs to exclude (Max: 50).
|
|
455
|
+
brand_safety : str, optional
|
|
456
|
+
Whether it is safe, sensitive, or unsafe to advertise on this content.
|
|
457
|
+
language : str, optional
|
|
458
|
+
Language code to use in search (ISO 639-1 language code).
|
|
459
|
+
continent : str, optional
|
|
460
|
+
Continent the results must come from (e.g., "Europe", "Asia").
|
|
461
|
+
region : str, optional
|
|
462
|
+
Region or subcontinent the results must come from (e.g., "Southern Africa", "Caribbean").
|
|
463
|
+
country : str, optional
|
|
464
|
+
Country the results must come from.
|
|
465
|
+
sector : str, optional
|
|
466
|
+
GICS Sector the results must relate to (e.g., "Energy", "Information Technology").
|
|
467
|
+
industry_group : str, optional
|
|
468
|
+
GICS Industry group the results must relate to (e.g., "Automobiles & Components", "Insurance").
|
|
469
|
+
industry : str, optional
|
|
470
|
+
GICS Industry the results must relate to (e.g., "Consumer Finance", "Passenger Airlines").
|
|
471
|
+
sub_industry : str, optional
|
|
472
|
+
GICS Sub-industry classification of the content's subject.
|
|
473
|
+
iab_tier_1 : str, optional
|
|
474
|
+
IAB Tier 1 category for the content.
|
|
475
|
+
iab_tier_2 : str, optional
|
|
476
|
+
IAB Tier 2 category for the content.
|
|
477
|
+
iab_tier_3 : str, optional
|
|
478
|
+
IAB Tier 3 category for the content.
|
|
479
|
+
iab_tier_4 : str, optional
|
|
480
|
+
IAB Tier 4 category for the content.
|
|
396
481
|
|
|
397
482
|
Returns
|
|
398
483
|
-------
|
|
@@ -413,6 +498,17 @@ class Result:
|
|
|
413
498
|
... result = Result(url="https://example.com", title="Example Domain") # doctest: +SKIP
|
|
414
499
|
... similar_results = result.similar(client=nos) # doctest: +SKIP
|
|
415
500
|
"""
|
|
501
|
+
if "include_languages" in kwargs:
|
|
502
|
+
warnings.warn(
|
|
503
|
+
"The 'include_languages' parameter is deprecated and will be removed in a future release. "
|
|
504
|
+
"Please use the parameter 'language' instead.",
|
|
505
|
+
)
|
|
506
|
+
if "exclude_languages" in kwargs:
|
|
507
|
+
warnings.warn(
|
|
508
|
+
"The 'exclude_languages' parameter is deprecated and will be removed in a future release. "
|
|
509
|
+
"Please use the parameter 'language' instead.",
|
|
510
|
+
)
|
|
511
|
+
|
|
416
512
|
if client is None:
|
|
417
513
|
raise ValueError("A Nosible client instance must be provided as 'client'.")
|
|
418
514
|
if not self.url:
|
|
@@ -436,14 +532,25 @@ class Result:
|
|
|
436
532
|
visited_start=visited_start,
|
|
437
533
|
visited_end=visited_end,
|
|
438
534
|
certain=certain,
|
|
439
|
-
include_languages=include_languages,
|
|
440
|
-
exclude_languages=exclude_languages,
|
|
441
535
|
include_companies=include_companies,
|
|
442
536
|
exclude_companies=exclude_companies,
|
|
443
537
|
include_docs=include_docs,
|
|
444
538
|
exclude_docs=exclude_docs,
|
|
539
|
+
brand_safety=brand_safety,
|
|
540
|
+
language=language,
|
|
541
|
+
continent=continent,
|
|
542
|
+
region=region,
|
|
543
|
+
country=country,
|
|
544
|
+
sector=sector,
|
|
545
|
+
industry_group=industry_group,
|
|
546
|
+
industry=industry,
|
|
547
|
+
sub_industry=sub_industry,
|
|
548
|
+
iab_tier_1=iab_tier_1,
|
|
549
|
+
iab_tier_2=iab_tier_2,
|
|
550
|
+
iab_tier_3=iab_tier_3,
|
|
551
|
+
iab_tier_4=iab_tier_4,
|
|
445
552
|
)
|
|
446
|
-
return client.
|
|
553
|
+
return client.fast_search(search=s)
|
|
447
554
|
except Exception as e:
|
|
448
555
|
raise RuntimeError(f"Failed to find similar results for title '{self.title}': {e}") from e
|
|
449
556
|
|
nosible/classes/result_set.py
CHANGED
|
@@ -57,6 +57,19 @@ class ResultSet(Iterator[Result]):
|
|
|
57
57
|
"language",
|
|
58
58
|
"similarity",
|
|
59
59
|
"url_hash",
|
|
60
|
+
"brand_safety",
|
|
61
|
+
"language",
|
|
62
|
+
"continent",
|
|
63
|
+
"region",
|
|
64
|
+
"country",
|
|
65
|
+
"sector",
|
|
66
|
+
"industry_group",
|
|
67
|
+
"industry",
|
|
68
|
+
"sub_industry",
|
|
69
|
+
"iab_tier_1",
|
|
70
|
+
"iab_tier_2",
|
|
71
|
+
"iab_tier_3",
|
|
72
|
+
"iab_tier_4",
|
|
60
73
|
]
|
|
61
74
|
|
|
62
75
|
results: list[Result] = field(default_factory=list)
|
|
@@ -310,7 +323,7 @@ class ResultSet(Iterator[Result]):
|
|
|
310
323
|
>>> from nosible import Nosible
|
|
311
324
|
>>> from nosible import ResultSet
|
|
312
325
|
>>> with Nosible() as nos:
|
|
313
|
-
... results: ResultSet = nos.
|
|
326
|
+
... results: ResultSet = nos.fast_search(question="Aircraft Manufacturing", n_results=10)
|
|
314
327
|
... inner = results.find_in_search_results("embraer", top_k=5)
|
|
315
328
|
>>> print(f"Top {len(inner)} hits for “embraer” within the initial results:")
|
|
316
329
|
Top 5 hits for “embraer” within the initial results:
|
|
@@ -409,7 +422,7 @@ class ResultSet(Iterator[Result]):
|
|
|
409
422
|
>>> from nosible import Nosible
|
|
410
423
|
>>> from nosible import Result, ResultSet
|
|
411
424
|
>>> with Nosible() as nos:
|
|
412
|
-
... results: ResultSet = nos.
|
|
425
|
+
... results: ResultSet = nos.fast_search(question="Aircraft Manufacturing", n_results=100)
|
|
413
426
|
... summary = results.analyze(by="language")
|
|
414
427
|
... print(summary)
|
|
415
428
|
{'en': 100}
|
|
@@ -507,7 +520,7 @@ class ResultSet(Iterator[Result]):
|
|
|
507
520
|
return {str(row[0]): int(row[1]) for row in sorted_vc.rows()}
|
|
508
521
|
|
|
509
522
|
# Conversion methods
|
|
510
|
-
def
|
|
523
|
+
def write_csv(self, file_path: str | None = None, delimiter: str = ",", encoding: str = "utf-8") -> str:
|
|
511
524
|
"""
|
|
512
525
|
Serialize the search results to a CSV file.
|
|
513
526
|
|
|
@@ -542,7 +555,7 @@ class ResultSet(Iterator[Result]):
|
|
|
542
555
|
... Result(url="https://openai.com", title="OpenAI"),
|
|
543
556
|
... ]
|
|
544
557
|
>>> search_results = ResultSet(results)
|
|
545
|
-
>>> path = search_results.
|
|
558
|
+
>>> path = search_results.write_csv("out.csv")
|
|
546
559
|
>>> path.endswith(".csv")
|
|
547
560
|
True
|
|
548
561
|
"""
|
|
@@ -622,7 +635,7 @@ class ResultSet(Iterator[Result]):
|
|
|
622
635
|
except Exception as e:
|
|
623
636
|
raise RuntimeError(f"Failed to convert search results to Pandas DataFrame: {e}") from e
|
|
624
637
|
|
|
625
|
-
def
|
|
638
|
+
def write_json(self, file_path: str | None = None) -> str | bytes:
|
|
626
639
|
"""
|
|
627
640
|
Serialize the search results to a JSON string and optionally write to disk.
|
|
628
641
|
|
|
@@ -648,11 +661,11 @@ class ResultSet(Iterator[Result]):
|
|
|
648
661
|
... Result(url="https://openai.com", title="OpenAI"),
|
|
649
662
|
... ]
|
|
650
663
|
>>> search_results = ResultSet(results)
|
|
651
|
-
>>> json_str = search_results.
|
|
664
|
+
>>> json_str = search_results.write_json()
|
|
652
665
|
>>> isinstance(json_str, str)
|
|
653
666
|
True
|
|
654
667
|
>>> # Optionally write to file
|
|
655
|
-
>>> path = search_results.
|
|
668
|
+
>>> path = search_results.write_json(file_path="results.json")
|
|
656
669
|
>>> path.endswith(".json")
|
|
657
670
|
True
|
|
658
671
|
"""
|
|
@@ -747,7 +760,7 @@ class ResultSet(Iterator[Result]):
|
|
|
747
760
|
except Exception as e:
|
|
748
761
|
raise RuntimeError(f"Failed to convert results to dict: {e}") from e
|
|
749
762
|
|
|
750
|
-
def
|
|
763
|
+
def write_ndjson(self, file_path: str | None = None) -> str:
|
|
751
764
|
"""
|
|
752
765
|
Serialize search results to newline-delimited JSON (NDJSON) format.
|
|
753
766
|
|
|
@@ -777,11 +790,11 @@ class ResultSet(Iterator[Result]):
|
|
|
777
790
|
... Result(url="https://openai.com", title="OpenAI"),
|
|
778
791
|
... ]
|
|
779
792
|
>>> search_results = ResultSet(results)
|
|
780
|
-
>>> ndjson_str = search_results.
|
|
793
|
+
>>> ndjson_str = search_results.write_ndjson()
|
|
781
794
|
>>> print(ndjson_str.splitlines()[0]) # doctest: +ELLIPSIS
|
|
782
795
|
{"url":"https://example.com","title":"Example Domain","description":null,"netloc":null..."url_hash":null}
|
|
783
796
|
>>> # Optionally write to file
|
|
784
|
-
>>> path = search_results.
|
|
797
|
+
>>> path = search_results.write_ndjson(file_path="results.ndjson")
|
|
785
798
|
>>> path.endswith(".ndjson")
|
|
786
799
|
True
|
|
787
800
|
"""
|
|
@@ -802,7 +815,7 @@ class ResultSet(Iterator[Result]):
|
|
|
802
815
|
raise RuntimeError(f"Failed to write NDJSON to '{file_path}': {e}") from e
|
|
803
816
|
return "\n".join(ndjson_lines) + "\n"
|
|
804
817
|
|
|
805
|
-
def
|
|
818
|
+
def write_parquet(self, file_path: str | None = None) -> str:
|
|
806
819
|
"""
|
|
807
820
|
Serialize the search results to Apache Parquet format using Polars.
|
|
808
821
|
|
|
@@ -832,7 +845,7 @@ class ResultSet(Iterator[Result]):
|
|
|
832
845
|
... Result(url="https://openai.com", title="OpenAI"),
|
|
833
846
|
... ]
|
|
834
847
|
>>> search_results = ResultSet(results)
|
|
835
|
-
>>> parquet_path = search_results.
|
|
848
|
+
>>> parquet_path = search_results.write_parquet("my_results.parquet")
|
|
836
849
|
>>> parquet_path.endswith(".parquet")
|
|
837
850
|
True
|
|
838
851
|
"""
|
|
@@ -843,7 +856,7 @@ class ResultSet(Iterator[Result]):
|
|
|
843
856
|
raise RuntimeError(f"Failed to write Parquet to '{out}': {e}") from e
|
|
844
857
|
return out
|
|
845
858
|
|
|
846
|
-
def
|
|
859
|
+
def write_ipc(self, file_path: str | None = None) -> str:
|
|
847
860
|
"""
|
|
848
861
|
Serialize the search results to Apache Arrow IPC (Feather) format using Polars.
|
|
849
862
|
|
|
@@ -873,7 +886,7 @@ class ResultSet(Iterator[Result]):
|
|
|
873
886
|
... Result(url="https://openai.com", title="OpenAI"),
|
|
874
887
|
... ]
|
|
875
888
|
>>> search_results = ResultSet(results)
|
|
876
|
-
>>> arrow_path = search_results.
|
|
889
|
+
>>> arrow_path = search_results.write_ipc("my_results.arrow")
|
|
877
890
|
>>> arrow_path.endswith(".arrow")
|
|
878
891
|
True
|
|
879
892
|
"""
|
|
@@ -884,7 +897,7 @@ class ResultSet(Iterator[Result]):
|
|
|
884
897
|
raise RuntimeError(f"Failed to write Arrow IPC to '{out}': {e}") from e
|
|
885
898
|
return out
|
|
886
899
|
|
|
887
|
-
def
|
|
900
|
+
def write_duckdb(self, file_path: str | None = None, table_name: str = "results") -> str:
|
|
888
901
|
"""
|
|
889
902
|
Serialize the search results to a DuckDB database file and table.
|
|
890
903
|
|
|
@@ -917,7 +930,7 @@ class ResultSet(Iterator[Result]):
|
|
|
917
930
|
... Result(url="https://openai.com", title="OpenAI"),
|
|
918
931
|
... ]
|
|
919
932
|
>>> search_results = ResultSet(results)
|
|
920
|
-
>>> db_path = search_results.
|
|
933
|
+
>>> db_path = search_results.write_duckdb(file_path="my_results.duckdb", table_name="search_table")
|
|
921
934
|
>>> db_path.endswith(".duckdb")
|
|
922
935
|
True
|
|
923
936
|
"""
|
|
@@ -939,7 +952,7 @@ class ResultSet(Iterator[Result]):
|
|
|
939
952
|
|
|
940
953
|
# Loading from disk
|
|
941
954
|
@classmethod
|
|
942
|
-
def
|
|
955
|
+
def read_csv(cls, file_path: str) -> ResultSet:
|
|
943
956
|
"""
|
|
944
957
|
Load search results from a CSV file using Polars.
|
|
945
958
|
|
|
@@ -971,7 +984,7 @@ class ResultSet(Iterator[Result]):
|
|
|
971
984
|
... {"url": "https://openai.com", "title": "OpenAI", "description": "AI research"},
|
|
972
985
|
... ]
|
|
973
986
|
... ).write_csv("data.csv")
|
|
974
|
-
>>> results = ResultSet.
|
|
987
|
+
>>> results = ResultSet.read_csv("data.csv")
|
|
975
988
|
>>> isinstance(results, ResultSet)
|
|
976
989
|
True
|
|
977
990
|
>>> len(results)
|
|
@@ -1007,7 +1020,7 @@ class ResultSet(Iterator[Result]):
|
|
|
1007
1020
|
return cls(results)
|
|
1008
1021
|
|
|
1009
1022
|
@classmethod
|
|
1010
|
-
def
|
|
1023
|
+
def read_json(cls, file_path: str) -> ResultSet:
|
|
1011
1024
|
"""
|
|
1012
1025
|
Load search results from a JSON file.
|
|
1013
1026
|
|
|
@@ -1039,7 +1052,7 @@ class ResultSet(Iterator[Result]):
|
|
|
1039
1052
|
... ],
|
|
1040
1053
|
... f,
|
|
1041
1054
|
... )
|
|
1042
|
-
>>> results = ResultSet.
|
|
1055
|
+
>>> results = ResultSet.read_json("data.json")
|
|
1043
1056
|
>>> isinstance(results, ResultSet)
|
|
1044
1057
|
True
|
|
1045
1058
|
>>> len(results)
|
|
@@ -1147,7 +1160,7 @@ class ResultSet(Iterator[Result]):
|
|
|
1147
1160
|
return cls.from_polars(pl_df)
|
|
1148
1161
|
|
|
1149
1162
|
@classmethod
|
|
1150
|
-
def
|
|
1163
|
+
def read_ndjson(cls, file_path: str) -> ResultSet:
|
|
1151
1164
|
"""
|
|
1152
1165
|
Load search results from a newline-delimited JSON (NDJSON) file.
|
|
1153
1166
|
|
|
@@ -1181,7 +1194,7 @@ class ResultSet(Iterator[Result]):
|
|
|
1181
1194
|
... f.write('{"url": "https://openai.com", "title": "OpenAI"}\\n')
|
|
1182
1195
|
58
|
|
1183
1196
|
49
|
|
1184
|
-
>>> results = ResultSet.
|
|
1197
|
+
>>> results = ResultSet.read_ndjson("data.ndjson")
|
|
1185
1198
|
>>> isinstance(results, ResultSet)
|
|
1186
1199
|
True
|
|
1187
1200
|
>>> len(results)
|
|
@@ -1219,7 +1232,7 @@ class ResultSet(Iterator[Result]):
|
|
|
1219
1232
|
return cls(results)
|
|
1220
1233
|
|
|
1221
1234
|
@classmethod
|
|
1222
|
-
def
|
|
1235
|
+
def read_parquet(cls, file_path: str) -> ResultSet:
|
|
1223
1236
|
"""
|
|
1224
1237
|
Load search results from a Parquet file using Polars.
|
|
1225
1238
|
|
|
@@ -1250,7 +1263,7 @@ class ResultSet(Iterator[Result]):
|
|
|
1250
1263
|
... ]
|
|
1251
1264
|
... )
|
|
1252
1265
|
>>> df.write_parquet("sample.parquet")
|
|
1253
|
-
>>> results = ResultSet.
|
|
1266
|
+
>>> results = ResultSet.read_parquet("sample.parquet")
|
|
1254
1267
|
>>> isinstance(results, ResultSet)
|
|
1255
1268
|
True
|
|
1256
1269
|
>>> len(results)
|
|
@@ -1270,7 +1283,7 @@ class ResultSet(Iterator[Result]):
|
|
|
1270
1283
|
raise RuntimeError(f"Failed to create ResultSet from Parquet data in '{file_path}': {e}") from e
|
|
1271
1284
|
|
|
1272
1285
|
@classmethod
|
|
1273
|
-
def
|
|
1286
|
+
def read_ipc(cls, file_path: str) -> ResultSet:
|
|
1274
1287
|
"""
|
|
1275
1288
|
Load search results from an Apache Arrow IPC (Feather) file using Polars.
|
|
1276
1289
|
|
|
@@ -1301,7 +1314,7 @@ class ResultSet(Iterator[Result]):
|
|
|
1301
1314
|
... ]
|
|
1302
1315
|
... )
|
|
1303
1316
|
>>> df.write_ipc("sample.arrow")
|
|
1304
|
-
>>> results = ResultSet.
|
|
1317
|
+
>>> results = ResultSet.read_ipc("sample.arrow")
|
|
1305
1318
|
>>> isinstance(results, ResultSet)
|
|
1306
1319
|
True
|
|
1307
1320
|
>>> len(results)
|
|
@@ -1321,7 +1334,7 @@ class ResultSet(Iterator[Result]):
|
|
|
1321
1334
|
raise RuntimeError(f"Failed to create ResultSet from Arrow data in '{file_path}': {e}") from e
|
|
1322
1335
|
|
|
1323
1336
|
@classmethod
|
|
1324
|
-
def
|
|
1337
|
+
def read_duckdb(cls, file_path: str) -> ResultSet:
|
|
1325
1338
|
"""
|
|
1326
1339
|
Load search results from a DuckDB database file.
|
|
1327
1340
|
|
|
@@ -1354,8 +1367,8 @@ class ResultSet(Iterator[Result]):
|
|
|
1354
1367
|
... Result(url="https://openai.com", title="OpenAI", similarity=0.99),
|
|
1355
1368
|
... ]
|
|
1356
1369
|
>>> search_results = ResultSet(results)
|
|
1357
|
-
>>> db_path = search_results.
|
|
1358
|
-
>>> loaded = ResultSet.
|
|
1370
|
+
>>> db_path = search_results.write_duckdb(file_path="results.duckdb", table_name="search_results")
|
|
1371
|
+
>>> loaded = ResultSet.read_duckdb("results.duckdb")
|
|
1359
1372
|
>>> isinstance(loaded, ResultSet)
|
|
1360
1373
|
True
|
|
1361
1374
|
>>> len(loaded)
|
nosible/classes/search.py
CHANGED
|
@@ -55,10 +55,6 @@ class Search:
|
|
|
55
55
|
List of netlocs (domains) to include in the search. (Max 50)
|
|
56
56
|
exclude_netlocs : list of str, optional
|
|
57
57
|
List of netlocs (domains) to exclude in the search. (Max 50)
|
|
58
|
-
include_languages : list of str, optional
|
|
59
|
-
Languages to include in the search. (Max 50, ISO 639-1 language codes).
|
|
60
|
-
exclude_languages : list of str, optional
|
|
61
|
-
Language codes to exclude in the search (Max 50, ISO 639-1 language codes).
|
|
62
58
|
include_companies : list of str, optional
|
|
63
59
|
Google KG IDs of public companies to require (Max 50).
|
|
64
60
|
exclude_companies : list of str, optional
|
|
@@ -67,6 +63,32 @@ class Search:
|
|
|
67
63
|
URL hashes of docs to include (Max 50).
|
|
68
64
|
exclude_docs : list of str, optional
|
|
69
65
|
URL hashes of docs to exclude (Max 50).
|
|
66
|
+
brand_safety : str, optional
|
|
67
|
+
Whether it is safe, sensitive, or unsafe to advertise on this content.
|
|
68
|
+
language : str, optional
|
|
69
|
+
Language code to use in search (ISO 639-1 language code).
|
|
70
|
+
continent : str, optional
|
|
71
|
+
Continent the results must come from (e.g., "Europe", "Asia").
|
|
72
|
+
region : str, optional
|
|
73
|
+
Region or subcontinent the results must come from (e.g., "Southern Africa", "Caribbean").
|
|
74
|
+
country : str, optional
|
|
75
|
+
Country the results must come from.
|
|
76
|
+
sector : str, optional
|
|
77
|
+
GICS Sector the results must relate to (e.g., "Energy", "Information Technology").
|
|
78
|
+
industry_group : str, optional
|
|
79
|
+
GICS Industry group the results must relate to (e.g., "Automobiles & Components", "Insurance").
|
|
80
|
+
industry : str, optional
|
|
81
|
+
GICS Industry the results must relate to (e.g., "Consumer Finance", "Passenger Airlines").
|
|
82
|
+
sub_industry : str, optional
|
|
83
|
+
GICS Sub-industry classification of the content's subject.
|
|
84
|
+
iab_tier_1 : str, optional
|
|
85
|
+
IAB Tier 1 category for the content.
|
|
86
|
+
iab_tier_2 : str, optional
|
|
87
|
+
IAB Tier 2 category for the content.
|
|
88
|
+
iab_tier_3 : str, optional
|
|
89
|
+
IAB Tier 3 category for the content.
|
|
90
|
+
iab_tier_4 : str, optional
|
|
91
|
+
IAB Tier 4 category for the content.
|
|
70
92
|
|
|
71
93
|
Examples
|
|
72
94
|
--------
|
|
@@ -75,7 +97,7 @@ class Search:
|
|
|
75
97
|
>>> search = Search(
|
|
76
98
|
... question="What is Python?",
|
|
77
99
|
... n_results=5,
|
|
78
|
-
...
|
|
100
|
+
... language="en",
|
|
79
101
|
... publish_start="2023-01-01",
|
|
80
102
|
... publish_end="2023-12-31",
|
|
81
103
|
... certain=True,
|
|
@@ -120,10 +142,6 @@ class Search:
|
|
|
120
142
|
"""List of netlocs (domains) to include in the search (Max 50)."""
|
|
121
143
|
exclude_netlocs: list[str] | None = None
|
|
122
144
|
"""List of netlocs (domains) to exclude in the search (Max 50)."""
|
|
123
|
-
include_languages: list[str] | None = None
|
|
124
|
-
"""Languages to include in the search. (Max 50)"""
|
|
125
|
-
exclude_languages: list[str] | None = None
|
|
126
|
-
"""Language codes to exclude in the search (Max 50)"""
|
|
127
145
|
include_companies: list[str] | None = None
|
|
128
146
|
"""Google KG IDs of public companies to require (Max 50)."""
|
|
129
147
|
exclude_companies: list[str] | None = None
|
|
@@ -132,6 +150,32 @@ class Search:
|
|
|
132
150
|
"""URL hashes of docs to include (Max 50)."""
|
|
133
151
|
exclude_docs: list[str] | None = None
|
|
134
152
|
"""URL hashes of docs to exclude (Max 50)."""
|
|
153
|
+
brand_safety: str | None = None
|
|
154
|
+
"""Whether it is safe, sensitive, or unsafe to advertise on this content."""
|
|
155
|
+
language: str | None = None
|
|
156
|
+
"""Language code to use in search (ISO 639-1 language code)."""
|
|
157
|
+
continent: str | None = None
|
|
158
|
+
"""Continent the results must come from (e.g., "Europe", "Asia")."""
|
|
159
|
+
region: str | None = None
|
|
160
|
+
"""Region or subcontinent the results must come from (e.g., "Southern Africa", "Caribbean")."""
|
|
161
|
+
country: str | None = None
|
|
162
|
+
"""Country the results must come from."""
|
|
163
|
+
sector: str | None = None
|
|
164
|
+
"""GICS Sector the results must relate to (e.g., "Energy", "Information Technology")."""
|
|
165
|
+
industry_group: str | None = None
|
|
166
|
+
"""GICS Industry group the results must relate to (e.g., "Automobiles & Components", "Insurance")."""
|
|
167
|
+
industry: str | None = None
|
|
168
|
+
"""GICS Industry the results must relate to (e.g., "Consumer Finance", "Passenger Airlines")."""
|
|
169
|
+
sub_industry: str | None = None
|
|
170
|
+
"""GICS Sub-industry classification of the content's subject."""
|
|
171
|
+
iab_tier_1: str | None = None
|
|
172
|
+
"""IAB Tier 1 category for the content."""
|
|
173
|
+
iab_tier_2: str | None = None
|
|
174
|
+
"""IAB Tier 2 category for the content."""
|
|
175
|
+
iab_tier_3: str | None = None
|
|
176
|
+
"""IAB Tier 3 category for the content."""
|
|
177
|
+
iab_tier_4: str | None = None
|
|
178
|
+
"""IAB Tier 4 category for the content."""
|
|
135
179
|
|
|
136
180
|
_FIELDS = [
|
|
137
181
|
"question",
|
|
@@ -147,17 +191,28 @@ class Search:
|
|
|
147
191
|
"autogenerate_expansions",
|
|
148
192
|
"publish_start",
|
|
149
193
|
"publish_end",
|
|
150
|
-
"include_netlocs",
|
|
151
|
-
"exclude_netlocs",
|
|
152
194
|
"visited_start",
|
|
153
195
|
"visited_end",
|
|
154
196
|
"certain",
|
|
155
|
-
"
|
|
156
|
-
"
|
|
197
|
+
"include_netlocs",
|
|
198
|
+
"exclude_netlocs",
|
|
157
199
|
"include_companies",
|
|
158
200
|
"exclude_companies",
|
|
159
201
|
"include_docs",
|
|
160
202
|
"exclude_docs",
|
|
203
|
+
"brand_safety",
|
|
204
|
+
"language",
|
|
205
|
+
"continent",
|
|
206
|
+
"region",
|
|
207
|
+
"country",
|
|
208
|
+
"sector",
|
|
209
|
+
"industry_group",
|
|
210
|
+
"industry",
|
|
211
|
+
"sub_industry",
|
|
212
|
+
"iab_tier_1",
|
|
213
|
+
"iab_tier_2",
|
|
214
|
+
"iab_tier_3",
|
|
215
|
+
"iab_tier_4",
|
|
161
216
|
]
|
|
162
217
|
|
|
163
218
|
def __str__(self) -> str:
|
|
@@ -220,7 +275,7 @@ class Search:
|
|
|
220
275
|
Examples
|
|
221
276
|
--------
|
|
222
277
|
>>> search = Search(
|
|
223
|
-
... question="What is Python?", n_results=5,
|
|
278
|
+
... question="What is Python?", n_results=5, language="en", publish_start="2023-01-01"
|
|
224
279
|
... )
|
|
225
280
|
>>> search.to_dict()["question"]
|
|
226
281
|
'What is Python?'
|
|
@@ -255,7 +310,7 @@ class Search:
|
|
|
255
310
|
"""
|
|
256
311
|
return cls(**{field: data.get(field) for field in cls._FIELDS})
|
|
257
312
|
|
|
258
|
-
def
|
|
313
|
+
def write_json(self, path: str) -> None:
|
|
259
314
|
"""
|
|
260
315
|
Save the current Search instance to a JSON file.
|
|
261
316
|
|
|
@@ -274,16 +329,16 @@ class Search:
|
|
|
274
329
|
Examples
|
|
275
330
|
--------
|
|
276
331
|
>>> search = Search(
|
|
277
|
-
... question="What is Python?", n_results=5,
|
|
332
|
+
... question="What is Python?", n_results=5, language="en", publish_start="2023-01-01"
|
|
278
333
|
... )
|
|
279
|
-
>>> search.
|
|
334
|
+
>>> search.write_json("search.json")
|
|
280
335
|
"""
|
|
281
336
|
data = json_dumps(self.to_dict())
|
|
282
337
|
with open(path, "w") as f:
|
|
283
338
|
f.write(data)
|
|
284
339
|
|
|
285
340
|
@classmethod
|
|
286
|
-
def
|
|
341
|
+
def read_json(cls, path: str) -> Search:
|
|
287
342
|
"""
|
|
288
343
|
Load a Search instance from a JSON file.
|
|
289
344
|
|
|
@@ -299,7 +354,7 @@ class Search:
|
|
|
299
354
|
Returns
|
|
300
355
|
-------
|
|
301
356
|
Search
|
|
302
|
-
An
|
|
357
|
+
An instance of the Search class initialized with the loaded parameters.
|
|
303
358
|
|
|
304
359
|
Raises
|
|
305
360
|
------
|
|
@@ -309,10 +364,10 @@ class Search:
|
|
|
309
364
|
Save and load a Search instance:
|
|
310
365
|
|
|
311
366
|
>>> search = Search(
|
|
312
|
-
... question="What is Python?", n_results=3,
|
|
367
|
+
... question="What is Python?", n_results=3, language="en", publish_start="2023-01-01"
|
|
313
368
|
... )
|
|
314
|
-
>>> search.
|
|
315
|
-
>>> loaded_search = Search.
|
|
369
|
+
>>> search.write_json("search.json")
|
|
370
|
+
>>> loaded_search = Search.read_json("search.json")
|
|
316
371
|
>>> print(loaded_search.question)
|
|
317
372
|
What is Python?
|
|
318
373
|
"""
|