nosible 0.1.9__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nosible/classes/result.py CHANGED
@@ -3,9 +3,8 @@ from __future__ import annotations
3
3
  from dataclasses import asdict, dataclass
4
4
  from typing import TYPE_CHECKING
5
5
 
6
- from openai import OpenAI
7
-
8
6
  from nosible.classes.web_page import WebPageData
7
+ from nosible.utils.json_tools import print_dict
9
8
 
10
9
  if TYPE_CHECKING:
11
10
  from nosible.classes.result_set import ResultSet
@@ -102,11 +101,21 @@ class Result:
102
101
  0.99 | Example Domain
103
102
  >>> result = Result(title=None, similarity=None)
104
103
  >>> print(str(result))
105
- N/A | No Title
104
+ {
105
+ "url": null,
106
+ "title": null,
107
+ "description": null,
108
+ "netloc": null,
109
+ "published": null,
110
+ "visited": null,
111
+ "author": null,
112
+ "content": null,
113
+ "language": null,
114
+ "similarity": null,
115
+ "url_hash": null
116
+ }
106
117
  """
107
- similarity = f"{self.similarity:.2f}" if self.similarity is not None else "N/A"
108
- title = self.title or "No Title"
109
- return f"{similarity:>6} | {title}"
118
+ return print_dict(self.to_dict())
110
119
 
111
120
  def __getitem__(self, key: str) -> str | float | bool | None:
112
121
  """
@@ -295,12 +304,12 @@ class Result:
295
304
 
296
305
  The response must be a float in [-1.0, 1.0]. No other text must be returned.
297
306
  """
298
-
307
+ from openai import OpenAI
299
308
  llm_client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key=client.llm_api_key)
300
309
 
301
310
  # Call the chat completions endpoint.
302
311
  resp = llm_client.chat.completions.create(
303
- model="openai/gpt-4o", messages=[{"role": "user", "content": prompt.strip()}], temperature=0.7
312
+ model=client.sentiment_model, messages=[{"role": "user", "content": prompt.strip()}], temperature=0.7
304
313
  )
305
314
 
306
315
  raw = resp.choices[0].message.content
@@ -335,6 +344,8 @@ class Result:
335
344
  exclude_languages: list = None,
336
345
  include_companies: list = None,
337
346
  exclude_companies: list = None,
347
+ include_docs: list = None,
348
+ exclude_docs: list = None,
338
349
  ) -> ResultSet:
339
350
  """
340
351
  Find similar search results based on the content or metadata of this Result.
@@ -429,6 +440,8 @@ class Result:
429
440
  exclude_languages=exclude_languages,
430
441
  include_companies=include_companies,
431
442
  exclude_companies=exclude_companies,
443
+ include_docs=include_docs,
444
+ exclude_docs=exclude_docs,
432
445
  )
433
446
  return client.search(search=s)
434
447
  except Exception as e:
@@ -2,15 +2,15 @@ from __future__ import annotations
2
2
 
3
3
  from collections.abc import Iterator
4
4
  from dataclasses import dataclass, field
5
-
6
- import duckdb
7
- import pandas as pd
8
- import polars as pl
9
- from tantivy import Document, Index, SchemaBuilder
5
+ from typing import TYPE_CHECKING
10
6
 
11
7
  from nosible.classes.result import Result
12
8
  from nosible.utils.json_tools import json_dumps, json_loads
13
9
 
10
+ if TYPE_CHECKING:
11
+ import pandas as pd
12
+ import polars as pl
13
+
14
14
 
15
15
  @dataclass(frozen=True)
16
16
  class ResultSet(Iterator[Result]):
@@ -182,29 +182,34 @@ class ResultSet(Iterator[Result]):
182
182
  # Setup if required
183
183
  return self
184
184
 
185
- def __getitem__(self, key: int) -> Result:
185
+ def __getitem__(self, key: int | slice) -> Result | ResultSet:
186
186
  """
187
- Get a Result by index.
187
+ Get a Result by index, or a ResultSet of Results by slice.
188
188
 
189
189
  Parameters
190
190
  ----------
191
- key : int
192
- Index of the result to retrieve.
191
+ key : int or slice
192
+ Index or slice of the result(s) to retrieve.
193
193
 
194
194
  Returns
195
195
  -------
196
- Result
197
- The Result at the specified index.
196
+ Result or ResultSet
197
+ A single Result if `key` is an integer, or a ResultSet containing the sliced results if `key` is a slice.
198
198
 
199
199
  Raises
200
200
  ------
201
201
  IndexError
202
202
  If index is out of range.
203
+ TypeError
204
+ If key is not an integer or slice.
203
205
  """
204
- if 0 <= key < len(self.results):
205
- return self.results[key]
206
- raise IndexError(f"Index {key} out of range for ResultSet with length {len(self.results)}.")
207
- raise IndexError(f"Index {key} out of range for ResultSet with length {len(self.results)}.")
206
+ if isinstance(key, int):
207
+ if 0 <= key < len(self.results):
208
+ return self.results[key]
209
+ raise IndexError(f"Index {key} out of range for ResultSet with length {len(self.results)}.")
210
+ if isinstance(key, slice):
211
+ return ResultSet(self.results[key])
212
+ raise TypeError("ResultSet indices must be integers or slices.")
208
213
 
209
214
  def __add__(self, other: ResultSet | Result) -> ResultSet:
210
215
  """
@@ -285,12 +290,13 @@ class ResultSet(Iterator[Result]):
285
290
 
286
291
  def find_in_search_results(self, query: str, top_k: int = 10) -> ResultSet:
287
292
  """
288
- Perform an in-memory search over a ResultSet collection using Tantivy.
293
+ This allows you to search within the results of a search using BM25 scoring by
294
+ performing an in-memory search over a ResultSet collection using Tantivy.
289
295
 
290
296
  Parameters
291
297
  ----------
292
298
  query : str
293
- The search string to rank within these results.
299
+ The search string you want to find within these results.
294
300
  top_k : int
295
301
  Number of top results to return.
296
302
 
@@ -316,6 +322,8 @@ class ResultSet(Iterator[Result]):
316
322
  Document returned
317
323
  Document returned
318
324
  """
325
+ from tantivy import Document, Index, SchemaBuilder
326
+
319
327
  # Build the Tantivy schema
320
328
  schema_builder = SchemaBuilder()
321
329
  # Int for doc retrieval.
@@ -439,6 +447,9 @@ class ResultSet(Iterator[Result]):
439
447
  Traceback (most recent call last):
440
448
  ValueError: Cannot analyze by 'foobar' - not a valid field.
441
449
  """
450
+ import pandas as pd
451
+ import polars as pl
452
+
442
453
  # Convert to Polars DataFrame
443
454
  df: pl.DataFrame = self.to_polars()
444
455
 
@@ -467,7 +478,7 @@ class ResultSet(Iterator[Result]):
467
478
  # Extract year-month
468
479
  df = df.with_columns(pl.col(by).dt.strftime("%Y-%m").alias("year_month"))
469
480
  # Count per month
470
- vc = df.group_by("year_month").agg(pl.count().alias("count")).sort("year_month")
481
+ vc = df.group_by("year_month").agg(pl.len().alias("count")).sort("year_month")
471
482
  rows = vc.rows()
472
483
  if not rows:
473
484
  return {}
@@ -571,6 +582,10 @@ class ResultSet(Iterator[Result]):
571
582
  >>> "url" in df.columns
572
583
  True
573
584
  """
585
+ # Lazy import for runtime, but allow static type checking
586
+
587
+ import polars as pl
588
+
574
589
  return pl.DataFrame(self.to_dicts())
575
590
 
576
591
  def to_pandas(self) -> pd.DataFrame:
@@ -911,7 +926,7 @@ class ResultSet(Iterator[Result]):
911
926
  import duckdb
912
927
 
913
928
  # Convert to Polars DataFrame and then to Arrow Table
914
- df = self.to_polars()
929
+ df = self.to_polars() # noqa: F841
915
930
  # Connect to DuckDB and write the Arrow Table to a table
916
931
  con = duckdb.connect(out)
917
932
  # Write the DataFrame to the specified table name, replacing if exists
@@ -964,6 +979,8 @@ class ResultSet(Iterator[Result]):
964
979
  >>> results[0].title
965
980
  'Example Domain'
966
981
  """
982
+ import polars as pl
983
+
967
984
  try:
968
985
  df = pl.read_csv(file_path)
969
986
  except Exception as e:
@@ -1124,6 +1141,8 @@ class ResultSet(Iterator[Result]):
1124
1141
  >>> print(len(df))
1125
1142
  1
1126
1143
  """
1144
+ import polars as pl
1145
+
1127
1146
  pl_df = pl.from_pandas(df)
1128
1147
  return cls.from_polars(pl_df)
1129
1148
 
@@ -1239,6 +1258,8 @@ class ResultSet(Iterator[Result]):
1239
1258
  >>> results[0].title
1240
1259
  'Example Domain'
1241
1260
  """
1261
+ import polars as pl
1262
+
1242
1263
  try:
1243
1264
  df = pl.read_parquet(file_path)
1244
1265
  except Exception as e:
@@ -1288,6 +1309,8 @@ class ResultSet(Iterator[Result]):
1288
1309
  >>> results[0].title
1289
1310
  'Example Domain'
1290
1311
  """
1312
+ import polars as pl
1313
+
1291
1314
  try:
1292
1315
  df = pl.read_ipc(file_path)
1293
1316
  except Exception as e:
@@ -1340,7 +1363,11 @@ class ResultSet(Iterator[Result]):
1340
1363
  >>> loaded[0].title
1341
1364
  'Example Domain'
1342
1365
  """
1366
+ import polars as pl
1367
+
1343
1368
  try:
1369
+ import duckdb
1370
+
1344
1371
  con = duckdb.connect(file_path, read_only=True)
1345
1372
  except Exception as e:
1346
1373
  raise RuntimeError(f"Failed to connect to DuckDB file '{file_path}': {e}") from e
@@ -1492,10 +1519,3 @@ class ResultSet(Iterator[Result]):
1492
1519
  """
1493
1520
  # TODO: cleanup handles, sessions, etc.
1494
1521
  pass
1495
-
1496
-
1497
- if __name__ == "__main__":
1498
- import doctest
1499
-
1500
- doctest.testmod(optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
1501
- print("All tests passed!")
nosible/classes/search.py CHANGED
@@ -33,6 +33,12 @@ class Search:
33
33
  Number of context documents to retrieve.
34
34
  algorithm : str, optional
35
35
  Search algorithm to use.
36
+ min_similarity : float
37
+ Results must have at least this similarity score.
38
+ must_include : list of str
39
+ Only results mentioning these strings will be included.
40
+ must_exclude : list of str
41
+ Any result mentioning these strings will be excluded.
36
42
  autogenerate_expansions : bool, default=False
37
43
  Do you want to generate expansions automatically using a LLM?
38
44
  publish_start : str, optional
@@ -65,6 +71,7 @@ class Search:
65
71
  Examples
66
72
  --------
67
73
  Create a search with specific parameters:
74
+
68
75
  >>> search = Search(
69
76
  ... question="What is Python?",
70
77
  ... n_results=5,
@@ -91,6 +98,12 @@ class Search:
91
98
  """Number of context documents to retrieve."""
92
99
  algorithm: str | None = None
93
100
  """Search algorithm to use."""
101
+ min_similarity: float | None = None
102
+ """Results must have at least this similarity score."""
103
+ must_include: list[str] | None = None
104
+ """Only results mentioning these strings will be included."""
105
+ must_exclude: list[str] | None = None
106
+ """Any result mentioning these strings will be excluded."""
94
107
  autogenerate_expansions: bool = False
95
108
  """Do you want to generate expansions automatically using a LLM?"""
96
109
  publish_start: str | None = None
@@ -128,6 +141,9 @@ class Search:
128
141
  "n_probes",
129
142
  "n_contextify",
130
143
  "algorithm",
144
+ "min_similarity",
145
+ "must_include",
146
+ "must_exclude",
131
147
  "autogenerate_expansions",
132
148
  "publish_start",
133
149
  "publish_end",