nosible 0.3.6__tar.gz → 0.3.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. {nosible-0.3.6/src/nosible.egg-info → nosible-0.3.10}/PKG-INFO +2 -2
  2. {nosible-0.3.6 → nosible-0.3.10}/pyproject.toml +3 -3
  3. {nosible-0.3.6 → nosible-0.3.10}/src/nosible/classes/result.py +17 -17
  4. {nosible-0.3.6 → nosible-0.3.10}/src/nosible/classes/result_set.py +5 -0
  5. {nosible-0.3.6 → nosible-0.3.10}/src/nosible/classes/snippet.py +0 -7
  6. {nosible-0.3.6 → nosible-0.3.10}/src/nosible/classes/web_page.py +0 -2
  7. {nosible-0.3.6 → nosible-0.3.10}/src/nosible/nosible_client.py +72 -41
  8. {nosible-0.3.6 → nosible-0.3.10}/src/nosible/utils/rate_limiter.py +9 -77
  9. {nosible-0.3.6 → nosible-0.3.10/src/nosible.egg-info}/PKG-INFO +2 -2
  10. {nosible-0.3.6 → nosible-0.3.10}/src/nosible.egg-info/requires.txt +1 -1
  11. {nosible-0.3.6 → nosible-0.3.10}/tests/test_01_nosible.py +2 -13
  12. {nosible-0.3.6 → nosible-0.3.10}/tests/test_02_results.py +30 -3
  13. {nosible-0.3.6 → nosible-0.3.10}/tests/test_04_snippets.py +1 -2
  14. {nosible-0.3.6 → nosible-0.3.10}/LICENSE +0 -0
  15. {nosible-0.3.6 → nosible-0.3.10}/README.md +0 -0
  16. {nosible-0.3.6 → nosible-0.3.10}/setup.cfg +0 -0
  17. {nosible-0.3.6 → nosible-0.3.10}/setup.py +0 -0
  18. {nosible-0.3.6 → nosible-0.3.10}/src/nosible/__init__.py +0 -0
  19. {nosible-0.3.6 → nosible-0.3.10}/src/nosible/classes/search.py +0 -0
  20. {nosible-0.3.6 → nosible-0.3.10}/src/nosible/classes/search_set.py +0 -0
  21. {nosible-0.3.6 → nosible-0.3.10}/src/nosible/classes/snippet_set.py +0 -0
  22. {nosible-0.3.6 → nosible-0.3.10}/src/nosible/utils/json_tools.py +0 -0
  23. {nosible-0.3.6 → nosible-0.3.10}/src/nosible.egg-info/SOURCES.txt +0 -0
  24. {nosible-0.3.6 → nosible-0.3.10}/src/nosible.egg-info/dependency_links.txt +0 -0
  25. {nosible-0.3.6 → nosible-0.3.10}/src/nosible.egg-info/top_level.txt +0 -0
  26. {nosible-0.3.6 → nosible-0.3.10}/tests/test_03_search_searchset.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nosible
3
- Version: 0.3.6
3
+ Version: 0.3.10
4
4
  Summary: Python client for the NOSIBLE Search API
5
5
  Home-page: https://github.com/NosibleAI/nosible-py
6
6
  Author: Stuart Reid, Matthew Dicks, Richard Taylor, Gareth Warburton
@@ -31,7 +31,7 @@ Requires-Dist: polars
31
31
  Requires-Dist: duckdb
32
32
  Requires-Dist: openai
33
33
  Requires-Dist: tantivy
34
- Requires-Dist: pyrate-limiter
34
+ Requires-Dist: pyrate-limiter<4
35
35
  Requires-Dist: tenacity
36
36
  Requires-Dist: cryptography
37
37
  Requires-Dist: pyarrow
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "nosible"
3
- version = "0.3.6"
3
+ version = "0.3.10"
4
4
  description = "Python client for the NOSIBLE Search API"
5
5
  readme = { file = "README.md", content-type = "text/markdown" }
6
6
  requires-python = ">=3.9"
@@ -16,7 +16,7 @@ dependencies = [
16
16
  "duckdb",
17
17
  "openai",
18
18
  "tantivy",
19
- "pyrate-limiter",
19
+ "pyrate-limiter<4",
20
20
  "tenacity",
21
21
  "cryptography",
22
22
  "pyarrow",
@@ -61,5 +61,5 @@ dev-dependencies = [
61
61
  "pytest-doctestplus",
62
62
  "pytest-xdist",
63
63
  "urllib3==1.26.15",
64
- "hishel",
64
+ "hishel[async]",
65
65
  ]
@@ -36,6 +36,8 @@ class Result:
36
36
  The author of the content.
37
37
  content : str, optional
38
38
  The main content or body of the search result.
39
+ best_chunk : str, optional
40
+ The best snippet of text that matches your question from the search result.
39
41
  language : str, optional
40
42
  The language code of the content (e.g., 'en' for English).
41
43
  similarity : float, optional
@@ -105,6 +107,8 @@ class Result:
105
107
  """The author of the content."""
106
108
  content: str | None = None
107
109
  """The main content or body of the search result."""
110
+ best_chunk: str | None = None
111
+ """The best snippet of text that matches your question from the search result."""
108
112
  language: str | None = None
109
113
  """The language code of the content (e.g., 'en' for English)."""
110
114
  similarity: float | None = None
@@ -150,23 +154,14 @@ class Result:
150
154
  >>> result = Result(title="Example Domain", similarity=0.9876)
151
155
  >>> print(str(result))
152
156
  0.99 | Example Domain
153
- >>> result = Result(title=None, similarity=None)
154
- >>> print(str(result))
155
- {
156
- "url": null,
157
- "title": null,
158
- "description": null,
159
- "netloc": null,
160
- "published": null,
161
- "visited": null,
162
- "author": null,
163
- "content": null,
164
- "language": null,
165
- "similarity": null,
166
- "url_hash": null
167
- }
168
157
  """
169
- return print_dict(self.to_dict())
158
+ # Get the full dictionary
159
+ data = self.to_dict()
160
+
161
+ # Create a new dictionary excluding keys where the value is None
162
+ clean_data = {k: v for k, v in data.items() if v is not None}
163
+
164
+ return print_dict(clean_data)
170
165
 
171
166
  def __getitem__(self, key: str) -> str | float | bool | None:
172
167
  """
@@ -519,6 +514,11 @@ class Result:
519
514
  try:
520
515
  from nosible import Search
521
516
 
517
+ # Exclude the original doc from the new search.
518
+ exclude_docs_list = list(exclude_docs) if exclude_docs else []
519
+ if self.url_hash and self.url_hash not in exclude_docs_list:
520
+ exclude_docs_list.append(self.url_hash)
521
+
522
522
  s = Search(
523
523
  question=self.title,
524
524
  expansions=[],
@@ -537,7 +537,7 @@ class Result:
537
537
  include_companies=include_companies,
538
538
  exclude_companies=exclude_companies,
539
539
  include_docs=include_docs,
540
- exclude_docs=exclude_docs,
540
+ exclude_docs=exclude_docs_list,
541
541
  brand_safety=brand_safety,
542
542
  language=language,
543
543
  continent=continent,
@@ -54,6 +54,7 @@ class ResultSet(Iterator[Result]):
54
54
  "visited",
55
55
  "author",
56
56
  "content",
57
+ "best_chunk",
57
58
  "language",
58
59
  "similarity",
59
60
  "url_hash",
@@ -1004,6 +1005,7 @@ class ResultSet(Iterator[Result]):
1004
1005
  visited=row.get("visited"),
1005
1006
  author=row.get("author"),
1006
1007
  content=row.get("content"),
1008
+ best_chunk=row.get("best_chunk"),
1007
1009
  language=row.get("language"),
1008
1010
  similarity=row.get("similarity"),
1009
1011
  url_hash=row.get("url_hash"),
@@ -1113,6 +1115,7 @@ class ResultSet(Iterator[Result]):
1113
1115
  visited=row.get("visited"),
1114
1116
  author=row.get("author"),
1115
1117
  content=row.get("content"),
1118
+ best_chunk=row.get("best_chunk"),
1116
1119
  language=row.get("language"),
1117
1120
  similarity=row.get("semantics", {}).get("similarity", row.get("similarity")),
1118
1121
  url_hash=row.get("url_hash"),
@@ -1212,6 +1215,7 @@ class ResultSet(Iterator[Result]):
1212
1215
  visited=data.get("visited"),
1213
1216
  author=data.get("author"),
1214
1217
  content=data.get("content"),
1218
+ best_chunk=data.get("best_chunk"),
1215
1219
  language=data.get("language"),
1216
1220
  similarity=data.get("similarity"),
1217
1221
  url_hash=data.get("url_hash"),
@@ -1449,6 +1453,7 @@ class ResultSet(Iterator[Result]):
1449
1453
  visited=d.get("visited"),
1450
1454
  author=d.get("author"),
1451
1455
  content=d.get("content"),
1456
+ best_chunk=d.get("best_chunk"),
1452
1457
  language=d.get("language"),
1453
1458
  similarity=d.get("similarity", d.get("semantics", {}).get("similarity")),
1454
1459
  url_hash=d.get("url_hash"),
@@ -30,18 +30,13 @@ class Snippet:
30
30
  The words in the snippet.
31
31
  links : list or None
32
32
  List of links associated with the snippet.
33
- companies : list or None
34
- List of companies mentioned in the snippet.
35
-
36
33
 
37
34
  Examples
38
35
  --------
39
36
  >>> snippet = Snippet(content="Example snippet", language="en")
40
37
  >>> print(snippet.content)
41
38
  Example snippet
42
-
43
39
  """
44
-
45
40
  content: str = field(default=None, repr=True, compare=True)
46
41
  """The text content of the snippet."""
47
42
  images: list = field(default=None, repr=True, compare=False)
@@ -62,8 +57,6 @@ class Snippet:
62
57
  """The words in the snippet."""
63
58
  links: list = field(default=None, repr=False, compare=False)
64
59
  """List of links associated with the snippet."""
65
- companies: list = field(default=None, repr=False, compare=False)
66
- """List of companies mentioned in the snippet."""
67
60
 
68
61
  def __str__(self):
69
62
  """
@@ -40,8 +40,6 @@ class WebPageData:
40
40
  {'description': 'Example'}
41
41
  """
42
42
 
43
- companies: list = None
44
- """A list of companies mentioned in the webpage, if applicable. (GKIDS)"""
45
43
  full_text: str = None
46
44
  """The full text content of the webpage."""
47
45
  languages: dict = None
@@ -29,7 +29,7 @@ from nosible.classes.search_set import SearchSet
29
29
  from nosible.classes.snippet_set import SnippetSet
30
30
  from nosible.classes.web_page import WebPageData
31
31
  from nosible.utils.json_tools import json_loads
32
- from nosible.utils.rate_limiter import PLAN_RATE_LIMITS, RateLimiter, _rate_limited
32
+ from nosible.utils.rate_limiter import RateLimiter, _rate_limited
33
33
 
34
34
  # Set up a module‐level logger.
35
35
  logger = logging.getLogger(__name__)
@@ -202,11 +202,6 @@ class Nosible:
202
202
  logging.getLogger("httpx").setLevel(logging.WARNING)
203
203
  logging.getLogger("httpcore").setLevel(logging.WARNING)
204
204
 
205
- self._limiters = {
206
- endpoint: [RateLimiter(calls, period) for calls, period in buckets]
207
- for endpoint, buckets in PLAN_RATE_LIMITS[self._get_user_plan()].items()
208
- }
209
-
210
205
  # Define retry decorator
211
206
  self._post = retry(
212
207
  reraise=True,
@@ -230,7 +225,34 @@ class Nosible:
230
225
  self._executor = ThreadPoolExecutor(max_workers=self.concurrency)
231
226
 
232
227
  # Headers
233
- self.headers = {"Accept-Encoding": "gzip", "Content-Type": "application/json", "api-key": self.nosible_api_key}
228
+ self.headers = {
229
+ "Accept-Encoding": "gzip",
230
+ "Content-Type": "application/json",
231
+ "api-key": self.nosible_api_key
232
+ }
233
+
234
+ # Wrap _get_limits with retry.
235
+ self._get_limits = retry(
236
+ reraise=True,
237
+ stop=stop_after_attempt(self.retries) | stop_after_delay(self.timeout),
238
+ wait=wait_exponential(multiplier=1, min=1, max=20),
239
+ retry=retry_if_exception_type(httpx.RequestError),
240
+ before_sleep=before_sleep_log(self.logger, logging.WARNING),
241
+ )(self._get_limits)
242
+
243
+ raw_limits = self._get_limits()
244
+
245
+ # Map API query_type -> your decorator endpoint keys
246
+ mapped_limits = {
247
+ "fast": raw_limits.get("fast", []),
248
+ "bulk": raw_limits.get("slow", []),
249
+ "scrape-url": raw_limits.get("visit", []),
250
+ }
251
+
252
+ self._limiters = {
253
+ endpoint: [RateLimiter(calls, period) for calls, period in buckets]
254
+ for endpoint, buckets in mapped_limits.items()
255
+ }
234
256
 
235
257
  # Filters
236
258
  self.publish_start = publish_start
@@ -1522,7 +1544,6 @@ class Nosible:
1522
1544
 
1523
1545
  response_data = data["response"]
1524
1546
  return WebPageData(
1525
- companies=response_data.get("companies"),
1526
1547
  full_text=response_data.get("full_text"),
1527
1548
  languages=response_data.get("languages"),
1528
1549
  metadata=response_data.get("metadata"),
@@ -1603,7 +1624,6 @@ class Nosible:
1603
1624
 
1604
1625
  return filtered
1605
1626
 
1606
-
1607
1627
  def close(self):
1608
1628
  """
1609
1629
  Close the Nosible client, shutting down the HTTP session
@@ -1703,41 +1723,52 @@ class Nosible:
1703
1723
 
1704
1724
  return response
1705
1725
 
1706
- def _get_user_plan(self) -> str:
1726
+ def _get_limits(self) -> dict[str, list[tuple[int, float]]]:
1727
+ """
1728
+ TODO
1707
1729
  """
1708
- Determine the user's subscription plan from the API key.
1730
+ url = "https://www.nosible.ai/search/v2/limits"
1731
+ resp = self._session.get(
1732
+ url=url,
1733
+ headers=self.headers,
1734
+ timeout=self.timeout,
1735
+ follow_redirects=True,
1736
+ )
1709
1737
 
1710
- The `nosible_api_key` is expected to start with a plan prefix followed by
1711
- a pipe (`|`) and any additional data. This method splits on the first
1712
- pipe character, validates the prefix against supported plans, and returns it.
1738
+ if resp.status_code == 401:
1739
+ raise ValueError("Your API key is not valid.")
1740
+ if resp.status_code == 429:
1741
+ raise ValueError("You have hit your rate limit.")
1742
+ if resp.status_code == 409:
1743
+ raise ValueError("Too many concurrent searches.")
1744
+ if resp.status_code == 502:
1745
+ raise ValueError("NOSIBLE is currently restarting.")
1746
+ if resp.status_code == 504:
1747
+ raise ValueError("NOSIBLE is currently overloaded.")
1713
1748
 
1714
- Returns
1715
- -------
1716
- str
1717
- The plan you are currently on.
1749
+ resp.raise_for_status()
1718
1750
 
1719
- Raises
1720
- ------
1721
- ValueError
1722
- If the extracted prefix is not one of the recognized plan names.
1751
+ try:
1752
+ data = resp.json()
1753
+ except Exception as e:
1754
+ raise ValueError("Invalid JSON response from /limits") from e
1723
1755
 
1724
- Examples
1725
- --------
1726
- >>> nos = Nosible(nosible_api_key="test+|xyz") # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
1727
- Traceback (most recent call last):
1728
- ...
1729
- ValueError: Your API key is not valid: test+ is not a valid plan prefix.
1730
- """
1731
- # Split off anything after the first '|'
1732
- prefix = (self.nosible_api_key or "").split("|", 1)[0]
1756
+ limits_list = data.get("limits")
1757
+ if not isinstance(limits_list, list):
1758
+ raise ValueError(f"Invalid /limits response shape: {data!r}")
1759
+
1760
+ grouped: dict[str, list[tuple[int, float]]] = {}
1761
+ for item in limits_list:
1762
+ query_type = item.get("query_type")
1763
+ duration = item.get("duration_seconds")
1764
+ limit = item.get("limit")
1733
1765
 
1734
- # Map prefixes -> plan names
1735
- plans = {"test", "self", "basic", "pro", "pro+", "bus", "bus+", "ent", "chat", "cons", "stup", "busn", "prod"}
1766
+ if query_type is None or duration is None or limit is None:
1767
+ raise ValueError(f"Invalid limit entry: {item!r}")
1736
1768
 
1737
- if prefix not in plans:
1738
- raise ValueError(f"Your API key is not valid: {prefix} is not a valid plan prefix.")
1769
+ grouped.setdefault(str(query_type), []).append((int(limit), float(duration)))
1739
1770
 
1740
- return prefix
1771
+ return grouped
1741
1772
 
1742
1773
  def _generate_expansions(self, question: Union[str, Search]) -> list:
1743
1774
  """
@@ -2039,13 +2070,13 @@ class Nosible:
2039
2070
 
2040
2071
  if include_docs:
2041
2072
  # Assume these are URL hashes, e.g. "ENNmqkF1mGNhVhvhmbUEs4U2"
2042
- doc_hashes = ", ".join(f"'{doc}'" for doc in include_docs)
2043
- clauses.append(f"doc_hash IN ({doc_hashes})")
2073
+ docs = ", ".join(f"'{doc}'" for doc in include_docs)
2074
+ clauses.append(f"doc IN ({docs})")
2044
2075
 
2045
2076
  if exclude_docs:
2046
2077
  # Assume these are URL hashes, e.g. "ENNmqkF1mGNhVhvhmbUEs4U2"
2047
- doc_hashes = ", ".join(f"'{doc}'" for doc in exclude_docs)
2048
- clauses.append(f"doc_hash NOT IN ({doc_hashes})")
2078
+ docs = ", ".join(f"'{doc}'" for doc in exclude_docs)
2079
+ clauses.append(f"doc NOT IN ({docs})")
2049
2080
 
2050
2081
  # Join everything
2051
2082
  if clauses:
@@ -2092,7 +2123,7 @@ class Nosible:
2092
2123
  "netloc",
2093
2124
  "language",
2094
2125
  "companies"
2095
- "doc_hash",
2126
+ "doc",
2096
2127
  ]
2097
2128
  import polars as pl # Lazy import
2098
2129
 
@@ -4,81 +4,10 @@ import time
4
4
 
5
5
  from pyrate_limiter import Limiter, Rate
6
6
  from pyrate_limiter.buckets.in_memory_bucket import InMemoryBucket
7
- from pyrate_limiter.exceptions import BucketFullException
7
+ from pyrate_limiter.exceptions import BucketFullException, LimiterDelayException
8
8
 
9
9
  log = logging.getLogger(__name__)
10
10
 
11
- PLAN_RATE_LIMITS = {
12
- "test": {
13
- # Per minute limit, then per month.
14
- "scrape-url": [(60, 60), (300, 24 * 3600 * 30)],
15
- "bulk": [(60, 60), (300, 24 * 3600 * 30)],
16
- "fast": [(60, 60), (3000, 24 * 3600 * 30)],
17
- },
18
- "basic": {
19
- "scrape-url": [(60, 60), (1400, 24 * 3600 * 30)],
20
- "bulk": [(60, 60), (1400, 24 * 3600 * 30)],
21
- "fast": [(60, 60), (14_000, 24 * 3600 * 30)],
22
- },
23
- "pro": {
24
- "scrape-url": [(60, 60), (6700, 24 * 3600 * 30)],
25
- "bulk": [(60, 60), (6700, 24 * 3600 * 30)],
26
- "fast": [(60, 60), (67_000, 24 * 3600 * 30)],
27
- },
28
- "pro+": {
29
- "scrape-url": [(60, 60), (32_000, 24 * 3600 * 30)],
30
- "bulk": [(60, 60), (32_000, 24 * 3600 * 30)],
31
- "fast": [(60, 60), (320_000, 24 * 3600 * 30)],
32
- },
33
- "bus": {
34
- "scrape-url": [(60, 60), (200_000, 24 * 3600 * 30)],
35
- "bulk": [(60, 60), (200_000, 24 * 3600 * 30)],
36
- "fast": [(60, 60), (2_000_000, 24 * 3600 * 30)],
37
- },
38
- "bus+": {
39
- "scrape-url": [(60, 60), (500_000, 24 * 3600 * 30)],
40
- "bulk": [(60, 60), (500_000, 24 * 3600 * 30)],
41
- "fast": [(120, 60), (5_000_000, 24 * 3600 * 30)],
42
- },
43
- "ent": {
44
- "scrape-url": [(60, 60), (1_500_000, 24 * 3600 * 30)],
45
- "bulk": [(60, 60), (1_500_000, 24 * 3600 * 30)],
46
- "fast": [(360, 60), (15_000_000, 24 * 3600 * 30)],
47
- },
48
- "prod": {
49
- "scrape-url": [(60, 60), (1_500_000, 24 * 3600 * 30)],
50
- "bulk": [(60, 60), (1_500_000, 24 * 3600 * 30)],
51
- "fast": [(360, 60), (15_000_000, 24 * 3600 * 30)],
52
- },
53
- # This plan is used for testing in the package
54
- "chat": {
55
- "scrape-url": [(60, 60), (1_500_000, 24 * 3600 * 30)],
56
- "bulk": [(60, 60), (1_500_000, 24 * 3600 * 30)],
57
- "fast": [(360, 60), (15_000_000, 24 * 3600 * 30)],
58
- },
59
- "self": {
60
- "scrape-url": [(6000, 60), (1_500_000, 24 * 3600 * 30)],
61
- "bulk": [(6000, 60), (1_500_000, 24 * 3600 * 30)],
62
- "fast": [(36_000, 60), (15_000_000, 24 * 3600 * 30)],
63
- },
64
- "cons": {
65
- "scrape-url": [(60, 60), (3000, 24 * 3600 * 30)],
66
- "bulk": [(60, 60), (3000, 24 * 3600 * 30)],
67
- "fast": [(120, 60), (30_000, 24 * 3600 * 30)],
68
- },
69
- "stup": {
70
- "scrape-url": [(60, 60), (30_000, 24 * 3600 * 30)],
71
- "bulk": [(60, 60), (30_000, 24 * 3600 * 30)],
72
- "fast": [(360, 60), (300_000, 24 * 3600 * 30)],
73
- },
74
- # This plan is used for testing in the package
75
- "busn": {
76
- "scrape-url": [(60, 60), (300_000, 24 * 3600 * 30)],
77
- "bulk": [(60, 60), (300_000, 24 * 3600 * 30)],
78
- "fast": [(360, 60), (3_000_000, 24 * 3600 * 30)],
79
- },
80
- }
81
-
82
11
 
83
12
  def _rate_limited(endpoint):
84
13
  """
@@ -133,7 +62,7 @@ class RateLimiter:
133
62
 
134
63
  # Build our bucket
135
64
  bucket = InMemoryBucket([Rate(max_calls, period_ms)])
136
- self._limiter = Limiter(bucket)
65
+ self._limiter = Limiter(bucket, max_delay=1000)
137
66
 
138
67
  def acquire(self) -> None:
139
68
  """
@@ -177,7 +106,7 @@ class RateLimiter:
177
106
  # Ensure at least a small sleep if rounding to zero
178
107
  time.sleep(wait_s)
179
108
 
180
- def try_acquire(self) -> bool:
109
+ def try_acquire(self, name: str = None) -> bool:
181
110
  """
182
111
  Attempt to acquire a slot without blocking.
183
112
 
@@ -196,8 +125,11 @@ class RateLimiter:
196
125
  >>> rl.try_acquire()
197
126
  False
198
127
  """
128
+ key = name if name else self._GLOBAL_KEY
129
+
199
130
  try:
200
- self._limiter.try_acquire(self._GLOBAL_KEY)
131
+ self._limiter.try_acquire(key)
201
132
  return True
202
- except BucketFullException:
203
- return False
133
+ except (BucketFullException, LimiterDelayException):
134
+ # Return False instead of crashing when the limit is hit
135
+ return False
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nosible
3
- Version: 0.3.6
3
+ Version: 0.3.10
4
4
  Summary: Python client for the NOSIBLE Search API
5
5
  Home-page: https://github.com/NosibleAI/nosible-py
6
6
  Author: Stuart Reid, Matthew Dicks, Richard Taylor, Gareth Warburton
@@ -31,7 +31,7 @@ Requires-Dist: polars
31
31
  Requires-Dist: duckdb
32
32
  Requires-Dist: openai
33
33
  Requires-Dist: tantivy
34
- Requires-Dist: pyrate-limiter
34
+ Requires-Dist: pyrate-limiter<4
35
35
  Requires-Dist: tenacity
36
36
  Requires-Dist: cryptography
37
37
  Requires-Dist: pyarrow
@@ -2,7 +2,7 @@ polars
2
2
  duckdb
3
3
  openai
4
4
  tantivy
5
- pyrate-limiter
5
+ pyrate-limiter<4
6
6
  tenacity
7
7
  cryptography
8
8
  pyarrow
@@ -1,12 +1,9 @@
1
- import json
2
1
  import pytest
3
- import os
4
- import time
5
2
  import re
6
3
 
7
- import polars as pl
4
+ import pytest
8
5
 
9
- from nosible import Nosible, Result, ResultSet, Search, Snippet, SnippetSet
6
+ from nosible import Nosible, ResultSet, Search, SnippetSet
10
7
  from nosible.classes.search_set import SearchSet
11
8
  from nosible.classes.web_page import WebPageData
12
9
 
@@ -75,11 +72,6 @@ def test_close_idempotent():
75
72
  nos.close()
76
73
 
77
74
 
78
- def test_invalid_api_key():
79
- with pytest.raises(ValueError):
80
- Nosible(nosible_api_key="test+|xyz")
81
-
82
-
83
75
  def test_llm_key_required_for_expansions():
84
76
  nos = Nosible(llm_api_key=None)
85
77
  nos.llm_api_key = None
@@ -92,9 +84,6 @@ def test_validate_sql():
92
84
  assert not Nosible()._validate_sql(sql="SELECT * FROM missing_table")
93
85
 
94
86
 
95
- # —— Your additional tests —— #
96
-
97
-
98
87
  def test_search_minimal(search_data):
99
88
  # from your snippet: isinstance(search_data, ResultSet)
100
89
  assert isinstance(search_data, ResultSet)
@@ -1,5 +1,5 @@
1
1
  import pytest
2
- from polars.dependencies import pandas as pd
2
+ import pandas as pd
3
3
  from nosible import Result, ResultSet
4
4
 
5
5
 
@@ -84,8 +84,6 @@ def test_resultset_to_dict(search_data):
84
84
  assert "published" in res
85
85
  assert "similarity" in res
86
86
  assert res["url_hash"] == key
87
- # results_copy_from_dict = ResultSet.from_dict(results_dict)
88
- # assert results == results_copy_from_dict
89
87
 
90
88
 
91
89
  # to_dicts
@@ -149,3 +147,32 @@ def test_resultset_getitem(search_data):
149
147
  _ = search_data[len(search_data)] # Out of range index
150
148
  with pytest.raises(TypeError):
151
149
  _ = search_data["invalid"] # Invalid type for index
150
+
151
+
152
+ def test_similar_excludes_current_document():
153
+ """
154
+ Test that the similar method properly excludes the current document from search results.
155
+
156
+ This test creates a Nosible client, performs a fast search, takes the first result,
157
+ and verifies that calling similar() on that result excludes it from the returned results.
158
+ """
159
+ from nosible import Nosible
160
+
161
+ # Create a Nosible client (similar to test_01_nosible.py)
162
+ with Nosible(concurrency=1) as nos:
163
+ # Perform a search to get some results
164
+ search_results = nos.fast_search(question="Hedge funds seek to expand into private credit", n_results=10)
165
+
166
+ # Get the first result
167
+ first_result = search_results[0]
168
+
169
+ # Call similar() on the first result
170
+ similar_results = first_result.similar(client=nos, n_results=10)
171
+
172
+ # Verify that the first result is NOT in the similar results
173
+ # We check by comparing URL hashes
174
+ similar_hashes = [r.url_hash for r in similar_results if r.url_hash]
175
+ assert first_result.url_hash not in similar_hashes, f"Original result URL hash {first_result.url_hash} should not be in similar results"
176
+
177
+ # Also verify that similar results were actually returned (should be non-empty)
178
+ assert len(similar_results) >= 0, "Similar results should be returned (may be empty if no similar docs found)"
@@ -1,5 +1,4 @@
1
- from nosible import Snippet, SnippetSet, WebPageData
2
- import pytest
1
+ from nosible import Snippet, SnippetSet
3
2
 
4
3
 
5
4
  def test_snippet_initialization(snippets_data):
File without changes
File without changes
File without changes
File without changes