nosible-0.1.8-py3-none-any.whl → nosible-0.1.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nosible/classes/result.py +65 -106
- nosible/classes/result_set.py +119 -113
- nosible/classes/search.py +68 -89
- nosible/classes/search_set.py +27 -12
- nosible/classes/snippet.py +57 -74
- nosible/classes/snippet_set.py +62 -63
- nosible/classes/web_page.py +39 -103
- nosible/nosible_client.py +224 -224
- nosible/utils/json_tools.py +51 -2
- nosible/utils/question_builder.py +131 -0
- nosible/utils/rate_limiter.py +30 -24
- {nosible-0.1.8.dist-info → nosible-0.1.9.dist-info}/METADATA +9 -45
- nosible-0.1.9.dist-info/RECORD +17 -0
- nosible-0.1.8.dist-info/RECORD +0 -16
- {nosible-0.1.8.dist-info → nosible-0.1.9.dist-info}/WHEEL +0 -0
- {nosible-0.1.8.dist-info → nosible-0.1.9.dist-info}/licenses/LICENSE +0 -0
- {nosible-0.1.8.dist-info → nosible-0.1.9.dist-info}/top_level.txt +0 -0
nosible/classes/result_set.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from collections.abc import Iterator
+from dataclasses import dataclass, field
 
 import duckdb
 import pandas as pd
@@ -11,6 +12,7 @@ from nosible.classes.result import Result
 from nosible.utils.json_tools import json_dumps, json_loads
 
 
+@dataclass(frozen=True)
 class ResultSet(Iterator[Result]):
     """
     Container class for managing and processing a sequence of Result objects.
@@ -57,33 +59,10 @@ class ResultSet(Iterator[Result]):
         "url_hash",
     ]
 
-
-
-
-
-    def _as_dicts(self):
-        """
-        Convert the ResultSet to a list of dictionaries.
-
-        Returns
-        -------
-        list of dict
-            List of dictionaries representing each Result.
-
-        """
-        # dataclass.asdict handles nested structures too
-        return [r.to_dict() for r in self.results]
-
-    def _as_columns(self):
-        """
-        Convert the ResultSet to a dictionary of lists, suitable for DataFrame creation.
-
-        Returns
-        -------
-        dict
-            Dictionary where keys are field names and values are lists of field values.
-        """
-        return {f: [getattr(r, f) for r in self.results] for f in self._FIELDS}
+    results: list[Result] = field(default_factory=list)
+    """ List of Result objects contained in this ResultSet."""
+    _index: int = field(default=0, init=False, repr=False, compare=False)
+    """ Internal index for iteration over results."""
 
     def __len__(self) -> int:
         """
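The conversion to @dataclass(frozen=True) is the heart of this file's change: the removed _as_dicts/_as_columns helpers give way to declared fields, and any internal mutation must now bypass the frozen guard. A minimal standalone sketch of the same pattern (illustrative names, not the packaged code):

    from dataclasses import dataclass, field

    @dataclass(frozen=True)
    class FrozenBag:
        items: list[str] = field(default_factory=list)
        # Bookkeeping slot: excluded from __init__, repr, and comparisons.
        _index: int = field(default=0, init=False, repr=False, compare=False)

    bag = FrozenBag(["a", "b"])
    # bag._index = 1 would raise FrozenInstanceError; frozen dataclasses must
    # route writes through object.__setattr__, exactly as the new
    # __iter__/__next__ below do.
    object.__setattr__(bag, "_index", 1)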
@@ -116,8 +95,8 @@ class ResultSet(Iterator[Result]):
         >>> print(search_results)  # doctest: +NORMALIZE_WHITESPACE
         Idx | Similarity | Title
         ------------------------
-        0 | 0.95
-        1 | 0.99
+        0 | 0.95 | Example Domain
+        1 | 0.99 | OpenAI
 
         >>> empty = ResultSet([])
         >>> print(empty)
@@ -129,29 +108,18 @@ class ResultSet(Iterator[Result]):
         # Create a formatted string for each result
         lines = []
         for idx, result in enumerate(self.results):
-            similarity = f"{result.similarity:.2f}" if result.similarity is not None else "N/A"
+            similarity = f"{result.similarity:.2f}" if result.similarity is not None else " N/A"
             title = result.title or "No Title"
-            lines.append(f"{idx:>3} | {similarity:>
+            lines.append(f"{idx:>3} | {similarity:>10} | {title}")
 
-        # Add a header
-        header = "Idx | Similarity | Title"
+        # Add a header with matching column widths
+        header = f"{'Idx':>3} | {'Similarity':>10} | Title"
         separator = "-" * len(header)
         lines.insert(0, header)
         lines.insert(1, separator)
         # Join all lines into a single string
         return "\n".join(lines)
 
-    def __repr__(self) -> str:
-        """
-        Returns a string representation of the object for interactive sessions.
-
-        Returns
-        -------
-        str
-            The string representation of the object, as returned by `__str__()`.
-        """
-        return self.__str__()
-
     def __iter__(self) -> ResultSet:
         """
        Reset iteration and return self.
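The __str__ fix is plain column alignment: the header is now built with the same width specs as the rows, so the table lines up. A quick standalone illustration:

    header = f"{'Idx':>3} | {'Similarity':>10} | Title"
    row = f"{0:>3} | {0.95:>10.2f} | Example Domain"
    print(header)             # Idx | Similarity | Title
    print("-" * len(header))  # ------------------------
    print(row)                #   0 |       0.95 | Example Domain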
@@ -161,7 +129,7 @@ class ResultSet(Iterator[Result]):
         ResultSet
             Iterator over the ResultSet instance.
         """
-        self._index = 0
+        object.__setattr__(self, "_index", 0)
         return self
 
     def __next__(self) -> Result:
@@ -179,10 +147,29 @@ class ResultSet(Iterator[Result]):
         """
         if self._index < len(self.results):
             item = self.results[self._index]
-            self._index += 1
+            object.__setattr__(self, "_index", self._index + 1)
             return item
         raise StopIteration
 
+    def __eq__(self, value):
+        """
+        Comapre set of url_hashes to determine equality.
+        Two ResultSet instances are considered equal if they contain the same set of url_hashes.
+
+        Parameters
+        ----------
+        value : ResultSet
+            The ResultSet instance to compare against.
+        Returns
+        -------
+        bool
+            True if both ResultSet instances contain the same set of url_hashes, False otherwise.
+        """
+        if not isinstance(value, ResultSet):
+            return False
+        # Compare the sets of url_hashes
+        return {r.url_hash for r in self.results} == {r.url_hash for r in value.results}
+
     def __enter__(self) -> ResultSet:
         """
         Enters the runtime context related to this object.
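Note the semantics of the new __eq__: it compares sets of url_hash values, so ordering and duplicate hashes are ignored, and a == b does not imply len(a) == len(b). A sketch of the behavior (it assumes, as the analyze doctest later suggests, that Result fields are optional keyword arguments):

    a = ResultSet([Result(url_hash="h1"), Result(url_hash="h2")])
    b = ResultSet([Result(url_hash="h2"), Result(url_hash="h1"), Result(url_hash="h1")])
    assert a == b           # both sides reduce to the set {"h1", "h2"}
    assert len(a) != len(b)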
@@ -217,29 +204,9 @@ class ResultSet(Iterator[Result]):
         if 0 <= key < len(self.results):
             return self.results[key]
         raise IndexError(f"Index {key} out of range for ResultSet with length {len(self.results)}.")
+        raise IndexError(f"Index {key} out of range for ResultSet with length {len(self.results)}.")
 
-    def __setitem__(self, key: int, value: Result) -> None:
-        """
-        Set a Result at a specific index.
-
-        Parameters
-        ----------
-        key : int
-            Index to set the result at.
-        value : Result
-            Result to set at the specified index.
-
-        Raises
-        ------
-        IndexError
-            If index is out of range.
-        """
-        if 0 <= key < len(self.results):
-            self.results[key] = value
-        else:
-            raise IndexError(f"Index {key} out of range for ResultSet with length {len(self.results)}.")
-
-    def __add__(self, other: ResultSet) -> ResultSet:
+    def __add__(self, other: ResultSet | Result) -> ResultSet:
         """
         Concatenate two ResultSet instances.
 
@@ -265,9 +232,12 @@ class ResultSet(Iterator[Result]):
         >>> len(combined)
         2
         """
-        if
-
-
+        if isinstance(other, ResultSet):
+            return ResultSet(self.results + other.results)
+        if isinstance(other, Result):
+            # If other is a single Result, create a new ResultSet with it
+            return ResultSet(self.results.append(other))
+        raise TypeError("Can only concatenate ResultSet with another ResultSet.")
 
     def __sub__(self, other: ResultSet) -> ResultSet:
         """
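Two details in this hunk are worth flagging rather than silently smoothing over. First, the added second raise IndexError in __getitem__ sits directly after an identical raise and is unreachable. Second, in the new Result branch of __add__, list.append returns None, so ResultSet(self.results.append(other)) constructs a ResultSet around None while mutating self.results in place. A hedged sketch of the presumably intended branch (not what 0.1.9 ships):

    # Build a fresh list instead of relying on list.append, which mutates
    # in place and returns None.
    if isinstance(other, Result):
        return ResultSet([*self.results, other])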
@@ -321,7 +291,7 @@ class ResultSet(Iterator[Result]):
         ----------
         query : str
             The search string to rank within these results.
-        top_k : int
+        top_k : int
             Number of top results to return.
 
         Returns
@@ -435,6 +405,39 @@ class ResultSet(Iterator[Result]):
         ...     summary = results.analyze(by="language")
         ...     print(summary)
         {'en': 100}
+        >>> import polars as pl
+        >>> from nosible.classes.result_set import Result, ResultSet
+
+        # -- date grouping (published) --------------------------------------------
+        >>> data = [
+        ...     {"published": "2021-01-15", "netloc": "a.com", "author": "", "language": "en", "similarity": 0.5},
+        ...     {"published": "2021-02-20", "netloc": "a.com", "author": "", "language": "en", "similarity": 0.8},
+        ...     {"published": "2021-02-25", "netloc": "b.org", "author": "", "language": "fr", "similarity": 0.2},
+        ... ]
+        >>> results = ResultSet([Result(**d) for d in data])
+        >>> results.analyze(by="published")  # doctest: +NORMALIZE_WHITESPACE
+        {'2021-01': 1, '2021-02': 2}
+
+        # -- numeric stats (similarity) ------------------------------------------
+        >>> stats = results.analyze(by="similarity")
+        >>> set(stats) == {"count", "null_count", "mean", "std", "min", "25%", "50%", "75%", "max"}
+        True
+        >>> round(stats["mean"], 2)
+        0.5
+
+        # -- categorical counts (language) --------------------------------------
+        >>> results.analyze(by="language")
+        {'en': 2, 'fr': 1}
+
+        # -- author special case ------------------------------------------------
+        # empty author strings get mapped to "Author Unknown"
+        >>> results.analyze(by="author")
+        {'Author Unknown': 3}
+
+        # -- invalid field -------------------------------------------------------
+        >>> results.analyze(by="foobar")  # doctest: +IGNORE_EXCEPTION_DETAIL
+        Traceback (most recent call last):
+        ValueError: Cannot analyze by 'foobar' - not a valid field.
         """
         # Convert to Polars DataFrame
         df: pl.DataFrame = self.to_polars()
@@ -451,7 +454,7 @@ class ResultSet(Iterator[Result]):
         # Handle author unknown
         if by == "author":
             df = df.with_columns(
-                pl.when(pl.col("author")
+                pl.when(pl.col("author") == "")
                 .then(pl.lit("Author Unknown"))
                 .otherwise(pl.col("author"))
                 .alias("author")
@@ -464,7 +467,7 @@ class ResultSet(Iterator[Result]):
             # Extract year-month
             df = df.with_columns(pl.col(by).dt.strftime("%Y-%m").alias("year_month"))
             # Count per month
-            vc = df.
+            vc = df.group_by("year_month").agg(pl.count().alias("count")).sort("year_month")
             rows = vc.rows()
             if not rows:
                 return {}
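The month bucketing is ordinary polars group-and-count; a self-contained sketch of the same pipeline (pl.count() is the spelling this release uses, while newer polars versions prefer pl.len()):

    import polars as pl

    df = pl.DataFrame({"published": ["2021-01-15", "2021-02-20", "2021-02-25"]})
    vc = (
        df.with_columns(pl.col("published").str.to_date().dt.strftime("%Y-%m").alias("year_month"))
        .group_by("year_month")
        .agg(pl.count().alias("count"))
        .sort("year_month")
    )
    print(dict(vc.rows()))  # {'2021-01': 1, '2021-02': 2}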
@@ -477,13 +480,15 @@ class ResultSet(Iterator[Result]):
                 result[month] = cnt
             return result
 
+        # Numeric stats for similarity
+        if by == "similarity":
+            desc_df = df["similarity"].describe()
+            # print({row[0]: float(row[1]) for row in desc_df.rows()})
+            return {row[0]: float(row[1]) for row in desc_df.rows()}
+
         # Non-date: analyze numeric vs. categorical
         series = df[by]
-
-        # Numeric analysis: descriptive stats
-        if dtype in (pl.Float64, pl.Float32, pl.Int64, pl.Int32):
-            desc_df = series.describe()
-            return {row[0]: float(row[1]) for row in desc_df.rows()}
+
         # Categorical/value counts
         vc = series.value_counts()
         _, count_col = vc.columns
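Series.describe() in polars returns a two-column frame of (statistic, value) pairs, which is what makes the {row[0]: float(row[1]) ...} comprehension work; a tiny sketch:

    import polars as pl

    s = pl.Series("similarity", [0.5, 0.8, 0.2])
    stats = {row[0]: float(row[1]) for row in s.describe().rows()}
    print(stats["mean"])  # 0.5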
@@ -502,11 +507,11 @@ class ResultSet(Iterator[Result]):
         Parameters
         ----------
         file_path : str or None, optional
-            Path to save the CSV file.
+            Path to save the CSV file.
         delimiter : str, optional
-            Delimiter to use in the CSV file.
+            Delimiter to use in the CSV file.
         encoding : str, optional
-            Encoding for the CSV file.
+            Encoding for the CSV file.
 
         Returns
         -------
@@ -566,7 +571,7 @@ class ResultSet(Iterator[Result]):
         >>> "url" in df.columns
         True
         """
-        return pl.DataFrame(self._as_dicts())
+        return pl.DataFrame(self.to_dicts())
 
     def to_pandas(self) -> pd.DataFrame:
         """
@@ -602,7 +607,7 @@ class ResultSet(Iterator[Result]):
         except Exception as e:
             raise RuntimeError(f"Failed to convert search results to Pandas DataFrame: {e}") from e
 
-    def to_json(self, file_path: str | None = None) -> str:
+    def to_json(self, file_path: str | None = None) -> str | bytes:
         """
         Serialize the search results to a JSON string and optionally write to disk.
 
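The widened str | bytes return type on to_json is consistent with the json_tools rework elsewhere in this release (+51 -2): byte-oriented serializers such as orjson return bytes where the stdlib returns str, so a json_dumps wrapper built on one would propagate bytes to the caller. Which backend json_tools actually wraps is not visible in this diff; a sketch of the distinction:

    import json

    assert isinstance(json.dumps({"a": 1}), str)   # stdlib: str out
    # If json_tools wraps orjson instead (an assumption, not confirmed here):
    # import orjson
    # assert isinstance(orjson.dumps({"a": 1}), bytes)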
@@ -637,7 +642,7 @@ class ResultSet(Iterator[Result]):
         True
         """
         try:
-            json_bytes = json_dumps(self._as_dicts())
+            json_bytes = json_dumps(self.to_dicts())
             if file_path:
                 try:
                     with open(file_path, "w") as f:
@@ -684,9 +689,9 @@ class ResultSet(Iterator[Result]):
         True
         """
         try:
-            return
+            return [result.to_dict() for result in self.results]
         except Exception as e:
-            raise RuntimeError(f"Failed to convert results to list of
+            raise RuntimeError(f"Failed to convert results to list of dictionaries: {e}") from e
 
     def to_dict(self) -> dict:
         """
@@ -738,7 +743,6 @@ class ResultSet(Iterator[Result]):
         ----------
         file_path : str or None, optional
             Path to save the NDJSON file. If None, returns the NDJSON string.
-            Default is None.
 
         Returns
         -------
@@ -766,18 +770,22 @@ class ResultSet(Iterator[Result]):
         >>> path.endswith(".ndjson")
         True
         """
-
-
-
-
-
-
-
-
-
-
-
-
+
+        ndjson_lines = []
+        for result in self.results:
+            try:
+                ndjson_lines.append(json_dumps(result.to_dict()))
+            except Exception as e:
+                raise RuntimeError(f"Failed to serialize Result to NDJSON: {e}") from e
+
+        if file_path:
+            try:
+                with open(file_path, "w", encoding="utf-8") as f:
+                    f.write("\n".join(ndjson_lines) + "\n")
+                return file_path
+            except Exception as e:
+                raise RuntimeError(f"Failed to write NDJSON to '{file_path}': {e}") from e
+        return "\n".join(ndjson_lines) + "\n"
 
     def to_parquet(self, file_path: str | None = None) -> str:
         """
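The rewritten to_ndjson emits one JSON object per line, and the format round-trips with a plain line-wise parse. A minimal standalone sketch using stdlib json in place of the package's json_dumps:

    import json

    records = [{"url": "https://example.com"}, {"url": "https://openai.com"}]
    ndjson = "\n".join(json.dumps(r) for r in records) + "\n"

    # Read it back: one json.loads per non-empty line.
    assert [json.loads(line) for line in ndjson.splitlines() if line] == records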
@@ -789,7 +797,7 @@ class ResultSet(Iterator[Result]):
         Parameters
         ----------
         file_path : str or None, optional
-            Path to save the Parquet file.
+            Path to save the Parquet file.
 
         Returns
         -------
@@ -830,7 +838,7 @@ class ResultSet(Iterator[Result]):
         Parameters
         ----------
         file_path : str or None, optional
-            Path to save the Arrow IPC file.
+            Path to save the Arrow IPC file.
 
         Returns
         -------
@@ -872,9 +880,9 @@ class ResultSet(Iterator[Result]):
         Parameters
         ----------
         file_path : str or None, optional
-            Path to save the DuckDB file.
+            Path to save the DuckDB file.
         table_name : str, optional
-            Name of the table to write the results to.
+            Name of the table to write the results to.
 
         Returns
         -------
@@ -1006,11 +1014,6 @@ class ResultSet(Iterator[Result]):
         --------
         >>> import json
         >>> from nosible import ResultSet
-        >>> # Suppose 'data.json' contains:
-        >>> # [
-        >>> #     {"url": "https://example.com", "title": "Example Domain"},
-        >>> #     {"url": "https://openai.com", "title": "OpenAI"}
-        >>> # ]
         >>> with open("data.json", "w") as f:
         ...     json.dump(
         ...         [
@@ -1097,20 +1100,23 @@ class ResultSet(Iterator[Result]):
 
     @classmethod
     def from_pandas(cls, df: pd.DataFrame) -> ResultSet:
-        """
+        """
+        Create a ResultSet instance from a pandas DataFrame.
         This class method converts a given pandas DataFrame to a Polars DataFrame
         and then constructs a ResultSet object from it. This is useful for
         integrating with workflows that use pandas for data manipulation.
+
         Parameters
         ----------
         df : pandas.DataFrame
-            DataFrame containing the search result fields. Each row should represent
-
-
+            DataFrame containing the search result fields. Each row should represent a single search result, with
+            columns corresponding to the expected fields of ResultSet.
+
         Returns
         -------
         ResultSet
             An instance of ResultSet containing the data from the input DataFrame.
+
         Examples
         --------
         >>> data = [{"url": "https://example.com", "title": "Example"}]
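A hedged usage sketch of the clarified from_pandas contract (column names taken from the doctest above; the full field set is whatever Result defines):

    import pandas as pd
    from nosible import ResultSet

    df = pd.DataFrame([{"url": "https://example.com", "title": "Example"}])
    results = ResultSet.from_pandas(df)
    assert len(results) == 1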
nosible/classes/search.py
CHANGED
@@ -1,13 +1,15 @@
 from __future__ import annotations
 
+from dataclasses import asdict, dataclass
 from typing import TYPE_CHECKING
 
-from nosible.utils.json_tools import json_dumps, json_loads
+from nosible.utils.json_tools import json_dumps, json_loads, print_dict
 
 if TYPE_CHECKING:
     from nosible.classes.search_set import SearchSet
 
 
+@dataclass(init=True, repr=True, eq=True)
 class Search:
     """
     Represents the parameters for a search operation.
@@ -31,45 +33,38 @@ class Search:
         Number of context documents to retrieve.
     algorithm : str, optional
         Search algorithm to use.
-    output_type : str, optional
-        Type of output to produce.
     autogenerate_expansions : bool, default=False
         Do you want to generate expansions automatically using a LLM?
     publish_start : str, optional
-        Start date for published
+        Start date for when the document was published (ISO format).
     publish_end : str, optional
-        End date for published
-    include_netlocs : list of str, optional
-        List of netlocs (domains) to include in the search.
-    exclude_netlocs : list of str, optional
-        List of netlocs (domains) to exclude from the search.
+        End date for when the document was published (ISO format).
     visited_start : str, optional
-        Start date for visited
+        Start date for when the document was visited by NOSIBLE (ISO format).
     visited_end : str, optional
-        End date for visited
+        End date for when the document was visited by NOSIBLE (ISO format).
     certain : bool, optional
-
-    include_languages : list of str, optional
-        Languages to include in the search (Max: 50).
-    exclude_languages : list of str, optional
-        Languages to exclude from the search (Max: 50).
+        Only include documents where we are 100% sure of the date.
     include_netlocs : list of str, optional
-
+        List of netlocs (domains) to include in the search. (Max 50)
     exclude_netlocs : list of str, optional
-
+        List of netlocs (domains) to exclude in the search. (Max 50)
+    include_languages : list of str, optional
+        Languages to include in the search. (Max 50, ISO 639-1 language codes).
+    exclude_languages : list of str, optional
+        Language codes to exclude in the search (Max 50, ISO 639-1 language codes).
     include_companies : list of str, optional
-
+        Google KG IDs of public companies to require (Max 50).
     exclude_companies : list of str, optional
-
+        Google KG IDs of public companies to forbid (Max 50).
     include_docs : list of str, optional
-
+        URL hashes of docs to include (Max 50).
     exclude_docs : list of str, optional
-
+        URL hashes of docs to exclude (Max 50).
 
     Examples
     --------
     Create a search with specific parameters:
-
     >>> search = Search(
     ...     question="What is Python?",
     ...     n_results=5,
@@ -82,6 +77,49 @@ class Search:
     What is Python?
     """
 
+    question: str | None = None
+    """The main search question or query."""
+    expansions: list[str] | None = None
+    """List of query expansions or related terms."""
+    sql_filter: str | None = None
+    """Additional SQL filter to apply to the search."""
+    n_results: int | None = None
+    """Number of results to return."""
+    n_probes: int | None = None
+    """Number of probe queries to use."""
+    n_contextify: int | None = None
+    """Number of context documents to retrieve."""
+    algorithm: str | None = None
+    """Search algorithm to use."""
+    autogenerate_expansions: bool = False
+    """Do you want to generate expansions automatically using a LLM?"""
+    publish_start: str | None = None
+    """Start date for when the document was published."""
+    publish_end: str | None = None
+    """End date for when the document was published."""
+    visited_start: str | None = None
+    """Start date for when the document was visited by NOSIBLE."""
+    visited_end: str | None = None
+    """End date for when the document was visited by NOSIBLE."""
+    certain: bool | None = None
+    """Only include documents where we are 100% sure of the date."""
+    include_netlocs: list[str] | None = None
+    """List of netlocs (domains) to include in the search (Max 50)."""
+    exclude_netlocs: list[str] | None = None
+    """List of netlocs (domains) to exclude in the search (Max 50)."""
+    include_languages: list[str] | None = None
+    """Languages to include in the search. (Max 50)"""
+    exclude_languages: list[str] | None = None
+    """Language codes to exclude in the search (Max 50)"""
+    include_companies: list[str] | None = None
+    """Google KG IDs of public companies to require (Max 50)."""
+    exclude_companies: list[str] | None = None
+    """Google KG IDs of public companies to forbid (Max 50)."""
+    include_docs: list[str] | None = None
+    """URL hashes of docs to include (Max 50)."""
+    exclude_docs: list[str] | None = None
+    """URL hashes of docs to exclude (Max 50)."""
+
     _FIELDS = [
         "question",
         "expansions",
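With the dataclass conversion, __init__, __repr__, and __eq__ are generated rather than hand-written, and every parameter keeps its old default (None, or False for autogenerate_expansions). A quick sketch of what that buys (import path assumed to match the package's other doctests):

    from nosible import Search

    a = Search(question="What is Python?", n_results=5)
    b = Search(question="What is Python?", n_results=5)
    assert a == b                  # field-wise equality from @dataclass(eq=True)
    assert a.expansions is None    # untouched parameters default to None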
@@ -90,7 +128,6 @@ class Search:
         "n_probes",
         "n_contextify",
         "algorithm",
-        "output_type",
         "autogenerate_expansions",
         "publish_start",
         "publish_end",
@@ -107,67 +144,17 @@ class Search:
         "exclude_docs",
     ]
 
-    def __init__(
-        self,
-        question: str = None,
-        expansions: list[str] = None,
-        sql_filter: str = None,
-        n_results: int = None,
-        n_probes: int = None,
-        n_contextify: int = None,
-        algorithm: str = None,
-        output_type: str = None,
-        autogenerate_expansions: bool = False,
-        publish_start: str = None,
-        publish_end: str = None,
-        include_netlocs: list[str] = None,
-        exclude_netlocs: list[str] = None,
-        visited_start: str = None,
-        visited_end: str = None,
-        certain: bool = None,
-        include_languages: list[str] = None,
-        exclude_languages: list[str] = None,
-        include_companies: list[str] = None,
-        exclude_companies: list[str] = None,
-        include_docs: list[str] = None,
-        exclude_docs: list[str] = None,
-    ) -> None:
-        self.question = question
-        self.expansions = expansions
-        self.sql_filter = sql_filter
-        self.n_results = n_results
-        self.n_probes = n_probes
-        self.n_contextify = n_contextify
-        self.algorithm = algorithm
-        self.output_type = output_type
-        self.autogenerate_expansions = autogenerate_expansions
-        self.publish_start = publish_start
-        self.publish_end = publish_end
-        self.include_netlocs = include_netlocs
-        self.exclude_netlocs = exclude_netlocs
-        self.visited_start = visited_start
-        self.visited_end = visited_end
-        self.certain = certain
-        self.include_languages = include_languages
-        self.exclude_languages = exclude_languages
-        self.include_companies = include_companies
-        self.exclude_companies = exclude_companies
-        self.include_docs = include_docs
-        self.exclude_docs = exclude_docs
-
     def __str__(self) -> str:
         """
         Return a readable string representation of the search parameters.
         Only non-None fields are shown, each on its own line for clarity.
+
+        Returns
+        -------
+        str
+            A string representation of the Search instance, showing only the
         """
-        attrs = []
-        for attr in self._FIELDS:
-            value = getattr(self, attr)
-            if value is not None:
-                attrs.append(f" {attr} = {value!r}")
-        if not attrs:
-            return "Search()"
-        return "Search(\n" + ",\n".join(attrs) + "\n)"
+        return print_dict(self.to_dict())
 
     def __add__(self, other: Search) -> SearchSet:
         """
@@ -222,7 +209,7 @@ class Search:
         >>> search.to_dict()["question"]
         'What is Python?'
         """
-        return
+        return asdict(self, dict_factory=dict)
 
     @classmethod
     def from_dict(cls, data: dict) -> Search:
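dataclasses.asdict recursively converts field values, and dict_factory=dict is already its default, so it is spelled out here purely for explicitness. A sketch of the resulting to_dict shape on a stand-in dataclass (illustrative, not the packaged class):

    from dataclasses import asdict, dataclass

    @dataclass
    class MiniSearch:
        question: str | None = None
        n_results: int | None = None

    assert asdict(MiniSearch(question="What is Python?")) == {
        "question": "What is Python?",
        "n_results": None,
    }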
@@ -267,10 +254,6 @@ class Search:
 
         Raises
         ------
-        IOError
-            If the file cannot be written.
-        TypeError
-            If serialization of the search parameters fails.
 
         Examples
         --------
@@ -304,10 +287,6 @@ class Search:
 
         Raises
         ------
-        IOError
-            If the file cannot be read.
-        json.JSONDecodeError
-            If the file content is not valid JSON.
 
         Examples
         --------