nosible 0.1.9__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nosible/classes/result.py +21 -8
- nosible/classes/result_set.py +46 -26
- nosible/classes/search.py +16 -0
- nosible/nosible_client.py +346 -38
- nosible/utils/json_tools.py +8 -7
- {nosible-0.1.9.dist-info → nosible-0.2.2.dist-info}/METADATA +97 -17
- nosible-0.2.2.dist-info/RECORD +16 -0
- nosible/utils/question_builder.py +0 -131
- nosible-0.1.9.dist-info/RECORD +0 -17
- {nosible-0.1.9.dist-info → nosible-0.2.2.dist-info}/WHEEL +0 -0
- {nosible-0.1.9.dist-info → nosible-0.2.2.dist-info}/licenses/LICENSE +0 -0
- {nosible-0.1.9.dist-info → nosible-0.2.2.dist-info}/top_level.txt +0 -0
nosible/classes/result.py
CHANGED
|
@@ -3,9 +3,8 @@ from __future__ import annotations
|
|
|
3
3
|
from dataclasses import asdict, dataclass
|
|
4
4
|
from typing import TYPE_CHECKING
|
|
5
5
|
|
|
6
|
-
from openai import OpenAI
|
|
7
|
-
|
|
8
6
|
from nosible.classes.web_page import WebPageData
|
|
7
|
+
from nosible.utils.json_tools import print_dict
|
|
9
8
|
|
|
10
9
|
if TYPE_CHECKING:
|
|
11
10
|
from nosible.classes.result_set import ResultSet
|
|
@@ -102,11 +101,21 @@ class Result:
|
|
|
102
101
|
0.99 | Example Domain
|
|
103
102
|
>>> result = Result(title=None, similarity=None)
|
|
104
103
|
>>> print(str(result))
|
|
105
|
-
|
|
104
|
+
{
|
|
105
|
+
"url": null,
|
|
106
|
+
"title": null,
|
|
107
|
+
"description": null,
|
|
108
|
+
"netloc": null,
|
|
109
|
+
"published": null,
|
|
110
|
+
"visited": null,
|
|
111
|
+
"author": null,
|
|
112
|
+
"content": null,
|
|
113
|
+
"language": null,
|
|
114
|
+
"similarity": null,
|
|
115
|
+
"url_hash": null
|
|
116
|
+
}
|
|
106
117
|
"""
|
|
107
|
-
|
|
108
|
-
title = self.title or "No Title"
|
|
109
|
-
return f"{similarity:>6} | {title}"
|
|
118
|
+
return print_dict(self.to_dict())
|
|
110
119
|
|
|
111
120
|
def __getitem__(self, key: str) -> str | float | bool | None:
|
|
112
121
|
"""
|
|
@@ -295,12 +304,12 @@ class Result:
|
|
|
295
304
|
|
|
296
305
|
The response must be a float in [-1.0, 1.0]. No other text must be returned.
|
|
297
306
|
"""
|
|
298
|
-
|
|
307
|
+
from openai import OpenAI
|
|
299
308
|
llm_client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key=client.llm_api_key)
|
|
300
309
|
|
|
301
310
|
# Call the chat completions endpoint.
|
|
302
311
|
resp = llm_client.chat.completions.create(
|
|
303
|
-
model=
|
|
312
|
+
model=client.sentiment_model, messages=[{"role": "user", "content": prompt.strip()}], temperature=0.7
|
|
304
313
|
)
|
|
305
314
|
|
|
306
315
|
raw = resp.choices[0].message.content
|
|
@@ -335,6 +344,8 @@ class Result:
|
|
|
335
344
|
exclude_languages: list = None,
|
|
336
345
|
include_companies: list = None,
|
|
337
346
|
exclude_companies: list = None,
|
|
347
|
+
include_docs: list = None,
|
|
348
|
+
exclude_docs: list = None,
|
|
338
349
|
) -> ResultSet:
|
|
339
350
|
"""
|
|
340
351
|
Find similar search results based on the content or metadata of this Result.
|
|
@@ -429,6 +440,8 @@ class Result:
|
|
|
429
440
|
exclude_languages=exclude_languages,
|
|
430
441
|
include_companies=include_companies,
|
|
431
442
|
exclude_companies=exclude_companies,
|
|
443
|
+
include_docs=include_docs,
|
|
444
|
+
exclude_docs=exclude_docs,
|
|
432
445
|
)
|
|
433
446
|
return client.search(search=s)
|
|
434
447
|
except Exception as e:
|
nosible/classes/result_set.py
CHANGED
|
@@ -2,15 +2,15 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
from collections.abc import Iterator
|
|
4
4
|
from dataclasses import dataclass, field
|
|
5
|
-
|
|
6
|
-
import duckdb
|
|
7
|
-
import pandas as pd
|
|
8
|
-
import polars as pl
|
|
9
|
-
from tantivy import Document, Index, SchemaBuilder
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
10
6
|
|
|
11
7
|
from nosible.classes.result import Result
|
|
12
8
|
from nosible.utils.json_tools import json_dumps, json_loads
|
|
13
9
|
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
import pandas as pd
|
|
12
|
+
import polars as pl
|
|
13
|
+
|
|
14
14
|
|
|
15
15
|
@dataclass(frozen=True)
|
|
16
16
|
class ResultSet(Iterator[Result]):
|
|
@@ -182,29 +182,34 @@ class ResultSet(Iterator[Result]):
|
|
|
182
182
|
# Setup if required
|
|
183
183
|
return self
|
|
184
184
|
|
|
185
|
-
def __getitem__(self, key: int) -> Result:
|
|
185
|
+
def __getitem__(self, key: int | slice) -> Result | ResultSet:
|
|
186
186
|
"""
|
|
187
|
-
Get a Result by index.
|
|
187
|
+
Get a Result by index or a list of Results by slice.
|
|
188
188
|
|
|
189
189
|
Parameters
|
|
190
190
|
----------
|
|
191
|
-
key : int
|
|
192
|
-
Index of the result to retrieve.
|
|
191
|
+
key : int or slice
|
|
192
|
+
Index or slice of the result(s) to retrieve.
|
|
193
193
|
|
|
194
194
|
Returns
|
|
195
195
|
-------
|
|
196
|
-
Result
|
|
197
|
-
|
|
196
|
+
Result or ResultSet
|
|
197
|
+
A single Result if `key` is an integer, or a ResultSet containing the sliced results if `key` is a slice.
|
|
198
198
|
|
|
199
199
|
Raises
|
|
200
200
|
------
|
|
201
201
|
IndexError
|
|
202
202
|
If index is out of range.
|
|
203
|
+
TypeError
|
|
204
|
+
If key is not an integer or slice.
|
|
203
205
|
"""
|
|
204
|
-
if
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
206
|
+
if isinstance(key, int):
|
|
207
|
+
if 0 <= key < len(self.results):
|
|
208
|
+
return self.results[key]
|
|
209
|
+
raise IndexError(f"Index {key} out of range for ResultSet with length {len(self.results)}.")
|
|
210
|
+
if isinstance(key, slice):
|
|
211
|
+
return ResultSet(self.results[key])
|
|
212
|
+
raise TypeError("ResultSet indices must be integers or slices.")
|
|
208
213
|
|
|
209
214
|
def __add__(self, other: ResultSet | Result) -> ResultSet:
|
|
210
215
|
"""
|
|
@@ -285,12 +290,13 @@ class ResultSet(Iterator[Result]):
|
|
|
285
290
|
|
|
286
291
|
def find_in_search_results(self, query: str, top_k: int = 10) -> ResultSet:
|
|
287
292
|
"""
|
|
288
|
-
|
|
293
|
+
This allows you to search within the results of a search using BM25 scoring by
|
|
294
|
+
performing an in-memory search over a ResultSet collection using Tantivy.
|
|
289
295
|
|
|
290
296
|
Parameters
|
|
291
297
|
----------
|
|
292
298
|
query : str
|
|
293
|
-
The search string to
|
|
299
|
+
The search string you want to find within these results.
|
|
294
300
|
top_k : int
|
|
295
301
|
Number of top results to return.
|
|
296
302
|
|
|
@@ -316,6 +322,8 @@ class ResultSet(Iterator[Result]):
|
|
|
316
322
|
Document returned
|
|
317
323
|
Document returned
|
|
318
324
|
"""
|
|
325
|
+
from tantivy import Document, Index, SchemaBuilder
|
|
326
|
+
|
|
319
327
|
# Build the Tantivy schema
|
|
320
328
|
schema_builder = SchemaBuilder()
|
|
321
329
|
# Int for doc retrieval.
|
|
@@ -439,6 +447,9 @@ class ResultSet(Iterator[Result]):
|
|
|
439
447
|
Traceback (most recent call last):
|
|
440
448
|
ValueError: Cannot analyze by 'foobar' - not a valid field.
|
|
441
449
|
"""
|
|
450
|
+
import pandas as pd
|
|
451
|
+
import polars as pl
|
|
452
|
+
|
|
442
453
|
# Convert to Polars DataFrame
|
|
443
454
|
df: pl.DataFrame = self.to_polars()
|
|
444
455
|
|
|
@@ -467,7 +478,7 @@ class ResultSet(Iterator[Result]):
|
|
|
467
478
|
# Extract year-month
|
|
468
479
|
df = df.with_columns(pl.col(by).dt.strftime("%Y-%m").alias("year_month"))
|
|
469
480
|
# Count per month
|
|
470
|
-
vc = df.group_by("year_month").agg(pl.
|
|
481
|
+
vc = df.group_by("year_month").agg(pl.len().alias("count")).sort("year_month")
|
|
471
482
|
rows = vc.rows()
|
|
472
483
|
if not rows:
|
|
473
484
|
return {}
|
|
@@ -571,6 +582,10 @@ class ResultSet(Iterator[Result]):
|
|
|
571
582
|
>>> "url" in df.columns
|
|
572
583
|
True
|
|
573
584
|
"""
|
|
585
|
+
# Lazy import for runtime, but allow static type checking
|
|
586
|
+
|
|
587
|
+
import polars as pl
|
|
588
|
+
|
|
574
589
|
return pl.DataFrame(self.to_dicts())
|
|
575
590
|
|
|
576
591
|
def to_pandas(self) -> pd.DataFrame:
|
|
@@ -911,7 +926,7 @@ class ResultSet(Iterator[Result]):
|
|
|
911
926
|
import duckdb
|
|
912
927
|
|
|
913
928
|
# Convert to Polars DataFrame and then to Arrow Table
|
|
914
|
-
df = self.to_polars()
|
|
929
|
+
df = self.to_polars() # noqa: F841
|
|
915
930
|
# Connect to DuckDB and write the Arrow Table to a table
|
|
916
931
|
con = duckdb.connect(out)
|
|
917
932
|
# Write the DataFrame to the specified table name, replacing if exists
|
|
@@ -964,6 +979,8 @@ class ResultSet(Iterator[Result]):
|
|
|
964
979
|
>>> results[0].title
|
|
965
980
|
'Example Domain'
|
|
966
981
|
"""
|
|
982
|
+
import polars as pl
|
|
983
|
+
|
|
967
984
|
try:
|
|
968
985
|
df = pl.read_csv(file_path)
|
|
969
986
|
except Exception as e:
|
|
@@ -1124,6 +1141,8 @@ class ResultSet(Iterator[Result]):
|
|
|
1124
1141
|
>>> print(len(df))
|
|
1125
1142
|
1
|
|
1126
1143
|
"""
|
|
1144
|
+
import polars as pl
|
|
1145
|
+
|
|
1127
1146
|
pl_df = pl.from_pandas(df)
|
|
1128
1147
|
return cls.from_polars(pl_df)
|
|
1129
1148
|
|
|
@@ -1239,6 +1258,8 @@ class ResultSet(Iterator[Result]):
|
|
|
1239
1258
|
>>> results[0].title
|
|
1240
1259
|
'Example Domain'
|
|
1241
1260
|
"""
|
|
1261
|
+
import polars as pl
|
|
1262
|
+
|
|
1242
1263
|
try:
|
|
1243
1264
|
df = pl.read_parquet(file_path)
|
|
1244
1265
|
except Exception as e:
|
|
@@ -1288,6 +1309,8 @@ class ResultSet(Iterator[Result]):
|
|
|
1288
1309
|
>>> results[0].title
|
|
1289
1310
|
'Example Domain'
|
|
1290
1311
|
"""
|
|
1312
|
+
import polars as pl
|
|
1313
|
+
|
|
1291
1314
|
try:
|
|
1292
1315
|
df = pl.read_ipc(file_path)
|
|
1293
1316
|
except Exception as e:
|
|
@@ -1340,7 +1363,11 @@ class ResultSet(Iterator[Result]):
|
|
|
1340
1363
|
>>> loaded[0].title
|
|
1341
1364
|
'Example Domain'
|
|
1342
1365
|
"""
|
|
1366
|
+
import polars as pl
|
|
1367
|
+
|
|
1343
1368
|
try:
|
|
1369
|
+
import duckdb
|
|
1370
|
+
|
|
1344
1371
|
con = duckdb.connect(file_path, read_only=True)
|
|
1345
1372
|
except Exception as e:
|
|
1346
1373
|
raise RuntimeError(f"Failed to connect to DuckDB file '{file_path}': {e}") from e
|
|
@@ -1492,10 +1519,3 @@ class ResultSet(Iterator[Result]):
|
|
|
1492
1519
|
"""
|
|
1493
1520
|
# TODO: cleanup handles, sessions, etc.
|
|
1494
1521
|
pass
|
|
1495
|
-
|
|
1496
|
-
|
|
1497
|
-
if __name__ == "__main__":
|
|
1498
|
-
import doctest
|
|
1499
|
-
|
|
1500
|
-
doctest.testmod(optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
|
|
1501
|
-
print("All tests passed!")
|
nosible/classes/search.py
CHANGED
|
@@ -33,6 +33,12 @@ class Search:
|
|
|
33
33
|
Number of context documents to retrieve.
|
|
34
34
|
algorithm : str, optional
|
|
35
35
|
Search algorithm to use.
|
|
36
|
+
min_similarity : float
|
|
37
|
+
Results must have at least this similarity score.
|
|
38
|
+
must_include: list of str
|
|
39
|
+
Only results mentioning these strings will be included.
|
|
40
|
+
must_exclude : list of str
|
|
41
|
+
Any result mentioning these strings will be excluded.
|
|
36
42
|
autogenerate_expansions : bool, default=False
|
|
37
43
|
Do you want to generate expansions automatically using a LLM?
|
|
38
44
|
publish_start : str, optional
|
|
@@ -65,6 +71,7 @@ class Search:
|
|
|
65
71
|
Examples
|
|
66
72
|
--------
|
|
67
73
|
Create a search with specific parameters:
|
|
74
|
+
|
|
68
75
|
>>> search = Search(
|
|
69
76
|
... question="What is Python?",
|
|
70
77
|
... n_results=5,
|
|
@@ -91,6 +98,12 @@ class Search:
|
|
|
91
98
|
"""Number of context documents to retrieve."""
|
|
92
99
|
algorithm: str | None = None
|
|
93
100
|
"""Search algorithm to use."""
|
|
101
|
+
min_similarity: float | None = None
|
|
102
|
+
"""Results must have at least this similarity score."""
|
|
103
|
+
must_include: list[str] | None = None
|
|
104
|
+
"""Only results mentioning these strings will be included."""
|
|
105
|
+
must_exclude: list[str] | None = None
|
|
106
|
+
"""Any result mentioning these strings will be excluded."""
|
|
94
107
|
autogenerate_expansions: bool = False
|
|
95
108
|
"""Do you want to generate expansions automatically using a LLM?"""
|
|
96
109
|
publish_start: str | None = None
|
|
@@ -128,6 +141,9 @@ class Search:
|
|
|
128
141
|
"n_probes",
|
|
129
142
|
"n_contextify",
|
|
130
143
|
"algorithm",
|
|
144
|
+
"min_similarity",
|
|
145
|
+
"must_include",
|
|
146
|
+
"must_exclude",
|
|
131
147
|
"autogenerate_expansions",
|
|
132
148
|
"publish_start",
|
|
133
149
|
"publish_end",
|