nosible 0.1.8__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nosible/classes/search.py CHANGED
@@ -1,13 +1,15 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from dataclasses import asdict, dataclass
3
4
  from typing import TYPE_CHECKING
4
5
 
5
- from nosible.utils.json_tools import json_dumps, json_loads
6
+ from nosible.utils.json_tools import json_dumps, json_loads, print_dict
6
7
 
7
8
  if TYPE_CHECKING:
8
9
  from nosible.classes.search_set import SearchSet
9
10
 
10
11
 
12
+ @dataclass(init=True, repr=True, eq=True)
11
13
  class Search:
12
14
  """
13
15
  Represents the parameters for a search operation.
@@ -31,40 +33,40 @@ class Search:
31
33
  Number of context documents to retrieve.
32
34
  algorithm : str, optional
33
35
  Search algorithm to use.
34
- output_type : str, optional
35
- Type of output to produce.
36
+ min_similarity : float
37
+ Results must have at least this similarity score.
38
+ must_include : list of str
39
+ Only results mentioning these strings will be included.
40
+ must_exclude : list of str
41
+ Any result mentioning these strings will be excluded.
36
42
  autogenerate_expansions : bool, default=False
37
43
  Do you want to generate expansions automatically using a LLM?
38
44
  publish_start : str, optional
39
- Start date for published documents (ISO format).
45
+ Start date for when the document was published (ISO format).
40
46
  publish_end : str, optional
41
- End date for published documents (ISO format).
42
- include_netlocs : list of str, optional
43
- List of netlocs (domains) to include in the search.
44
- exclude_netlocs : list of str, optional
45
- List of netlocs (domains) to exclude from the search.
47
+ End date for when the document was published (ISO format).
46
48
  visited_start : str, optional
47
- Start date for visited documents (ISO format).
49
+ Start date for when the document was visited by NOSIBLE (ISO format).
48
50
  visited_end : str, optional
49
- End date for visited documents (ISO format).
51
+ End date for when the document was visited by NOSIBLE (ISO format).
50
52
  certain : bool, optional
51
- Whether to only include certain results.
52
- include_languages : list of str, optional
53
- Languages to include in the search (Max: 50).
54
- exclude_languages : list of str, optional
55
- Languages to exclude from the search (Max: 50).
53
+ Only include documents where we are 100% sure of the date.
56
54
  include_netlocs : list of str, optional
57
- Only include results from these domains (Max: 50).
55
+ List of netlocs (domains) to include in the search. (Max 50)
58
56
  exclude_netlocs : list of str, optional
59
- Exclude results from these domains (Max: 50).
57
+ List of netlocs (domains) to exclude in the search. (Max 50)
58
+ include_languages : list of str, optional
59
+ Languages to include in the search (Max 50, ISO 639-1 language codes).
60
+ exclude_languages : list of str, optional
61
+ Language codes to exclude in the search (Max 50, ISO 639-1 language codes).
60
62
  include_companies : list of str, optional
61
- Companies to include in the search (Max: 50).
63
+ Google KG IDs of public companies to require (Max 50).
62
64
  exclude_companies : list of str, optional
63
- Companies to exclude from the search (Max: 50).
65
+ Google KG IDs of public companies to forbid (Max 50).
64
66
  include_docs : list of str, optional
65
- Document IDs to include in the search (Max: 50).
67
+ URL hashes of docs to include (Max 50).
66
68
  exclude_docs : list of str, optional
67
- Document IDs to exclude from the search (Max: 50).
69
+ URL hashes of docs to exclude (Max 50).
68
70
 
69
71
  Examples
70
72
  --------
@@ -82,6 +84,55 @@ class Search:
82
84
  What is Python?
83
85
  """
84
86
 
87
+ question: str | None = None
88
+ """The main search question or query."""
89
+ expansions: list[str] | None = None
90
+ """List of query expansions or related terms."""
91
+ sql_filter: str | None = None
92
+ """Additional SQL filter to apply to the search."""
93
+ n_results: int | None = None
94
+ """Number of results to return."""
95
+ n_probes: int | None = None
96
+ """Number of probe queries to use."""
97
+ n_contextify: int | None = None
98
+ """Number of context documents to retrieve."""
99
+ algorithm: str | None = None
100
+ """Search algorithm to use."""
101
+ min_similarity: float | None = None
102
+ """Results must have at least this similarity score."""
103
+ must_include: list[str] | None = None
104
+ """Only results mentioning these strings will be included."""
105
+ must_exclude: list[str] | None = None
106
+ """Any result mentioning these strings will be excluded."""
107
+ autogenerate_expansions: bool = False
108
+ """Do you want to generate expansions automatically using a LLM?"""
109
+ publish_start: str | None = None
110
+ """Start date for when the document was published."""
111
+ publish_end: str | None = None
112
+ """End date for when the document was published."""
113
+ visited_start: str | None = None
114
+ """Start date for when the document was visited by NOSIBLE."""
115
+ visited_end: str | None = None
116
+ """End date for when the document was visited by NOSIBLE."""
117
+ certain: bool | None = None
118
+ """Only include documents where we are 100% sure of the date."""
119
+ include_netlocs: list[str] | None = None
120
+ """List of netlocs (domains) to include in the search (Max 50)."""
121
+ exclude_netlocs: list[str] | None = None
122
+ """List of netlocs (domains) to exclude in the search (Max 50)."""
123
+ include_languages: list[str] | None = None
124
+ """Languages to include in the search. (Max 50)"""
125
+ exclude_languages: list[str] | None = None
126
+ """Language codes to exclude in the search (Max 50)"""
127
+ include_companies: list[str] | None = None
128
+ """Google KG IDs of public companies to require (Max 50)."""
129
+ exclude_companies: list[str] | None = None
130
+ """Google KG IDs of public companies to forbid (Max 50)."""
131
+ include_docs: list[str] | None = None
132
+ """URL hashes of docs to include (Max 50)."""
133
+ exclude_docs: list[str] | None = None
134
+ """URL hashes of docs to exclude (Max 50)."""
135
+
85
136
  _FIELDS = [
86
137
  "question",
87
138
  "expansions",
@@ -90,7 +141,9 @@ class Search:
90
141
  "n_probes",
91
142
  "n_contextify",
92
143
  "algorithm",
93
- "output_type",
144
+ "min_similarity",
145
+ "must_include",
146
+ "must_exclude",
94
147
  "autogenerate_expansions",
95
148
  "publish_start",
96
149
  "publish_end",
@@ -107,67 +160,17 @@ class Search:
107
160
  "exclude_docs",
108
161
  ]
109
162
 
110
- def __init__(
111
- self,
112
- question: str = None,
113
- expansions: list[str] = None,
114
- sql_filter: str = None,
115
- n_results: int = None,
116
- n_probes: int = None,
117
- n_contextify: int = None,
118
- algorithm: str = None,
119
- output_type: str = None,
120
- autogenerate_expansions: bool = False,
121
- publish_start: str = None,
122
- publish_end: str = None,
123
- include_netlocs: list[str] = None,
124
- exclude_netlocs: list[str] = None,
125
- visited_start: str = None,
126
- visited_end: str = None,
127
- certain: bool = None,
128
- include_languages: list[str] = None,
129
- exclude_languages: list[str] = None,
130
- include_companies: list[str] = None,
131
- exclude_companies: list[str] = None,
132
- include_docs: list[str] = None,
133
- exclude_docs: list[str] = None,
134
- ) -> None:
135
- self.question = question
136
- self.expansions = expansions
137
- self.sql_filter = sql_filter
138
- self.n_results = n_results
139
- self.n_probes = n_probes
140
- self.n_contextify = n_contextify
141
- self.algorithm = algorithm
142
- self.output_type = output_type
143
- self.autogenerate_expansions = autogenerate_expansions
144
- self.publish_start = publish_start
145
- self.publish_end = publish_end
146
- self.include_netlocs = include_netlocs
147
- self.exclude_netlocs = exclude_netlocs
148
- self.visited_start = visited_start
149
- self.visited_end = visited_end
150
- self.certain = certain
151
- self.include_languages = include_languages
152
- self.exclude_languages = exclude_languages
153
- self.include_companies = include_companies
154
- self.exclude_companies = exclude_companies
155
- self.include_docs = include_docs
156
- self.exclude_docs = exclude_docs
157
-
158
163
  def __str__(self) -> str:
159
164
  """
160
165
  Return a readable string representation of the search parameters.
161
166
  Only non-None fields are shown, each on its own line for clarity.
167
+
168
+ Returns
169
+ -------
170
+ str
171
+ A string representation of the Search instance, showing only the non-None fields.
162
172
  """
163
- attrs = []
164
- for attr in self._FIELDS:
165
- value = getattr(self, attr)
166
- if value is not None:
167
- attrs.append(f" {attr} = {value!r}")
168
- if not attrs:
169
- return "Search()"
170
- return "Search(\n" + ",\n".join(attrs) + "\n)"
173
+ return print_dict(self.to_dict())
171
174
 
172
175
  def __add__(self, other: Search) -> SearchSet:
173
176
  """
@@ -222,7 +225,7 @@ class Search:
222
225
  >>> search.to_dict()["question"]
223
226
  'What is Python?'
224
227
  """
225
- return {field: getattr(self, field) for field in self._FIELDS}
228
+ return asdict(self, dict_factory=dict)
226
229
 
227
230
  @classmethod
228
231
  def from_dict(cls, data: dict) -> Search:
@@ -267,10 +270,6 @@ class Search:
267
270
 
268
271
  Raises
269
272
  ------
270
- IOError
271
- If the file cannot be written.
272
- TypeError
273
- If serialization of the search parameters fails.
274
273
 
275
274
  Examples
276
275
  --------
@@ -304,10 +303,6 @@ class Search:
304
303
 
305
304
  Raises
306
305
  ------
307
- IOError
308
- If the file cannot be read.
309
- json.JSONDecodeError
310
- If the file content is not valid JSON.
311
306
 
312
307
  Examples
313
308
  --------
@@ -1,9 +1,11 @@
1
1
  from collections.abc import Iterator
2
+ from dataclasses import dataclass, field
2
3
 
3
4
  from nosible.classes.search import Search
4
5
  from nosible.utils.json_tools import json_dumps, json_loads
5
6
 
6
7
 
8
+ @dataclass()
7
9
  class SearchSet(Iterator[Search]):
8
10
  """
9
11
  Manages an iterable collection of Search objects.
@@ -32,9 +34,10 @@ class SearchSet(Iterator[Search]):
32
34
  What is AI?
33
35
  """
34
36
 
35
- def __init__(self, searches: list[Search] = None) -> None:
36
- self.searches = searches or []
37
- self._index = 0
37
+ searches: list[Search] = field(default_factory=list)
38
+ """ A list of Search objects in the collection."""
39
+ _index: int = field(default=0, init=False, repr=False, compare=False)
40
+ """ Internal index for iteration over searches."""
38
41
 
39
42
  def __iter__(self) -> "SearchSet":
40
43
  """
@@ -199,7 +202,7 @@ class SearchSet(Iterator[Search]):
199
202
  """
200
203
  del self.searches[index]
201
204
 
202
- def to_list(self) -> list[dict]:
205
+ def to_dicts(self) -> list[dict]:
203
206
  """
204
207
  Convert all Search objects in the collection to a list of dictionaries.
205
208
 
@@ -219,7 +222,7 @@ class SearchSet(Iterator[Search]):
219
222
  >>> s1 = Search(question="What is Python?", n_results=3)
220
223
  >>> s2 = Search(question="What is PEP8?", n_results=2)
221
224
  >>> searches = SearchSet([s1, s2])
222
- >>> searches.to_list()[1]["question"]
225
+ >>> searches.to_dicts()[1]["question"]
223
226
  'What is PEP8?'
224
227
  """
225
228
  return [s.to_dict() for s in self.searches]
@@ -242,6 +245,10 @@ class SearchSet(Iterator[Search]):
242
245
  str
243
246
  A JSON string representation of the SearchSet collection if no path is provided.
244
247
 
248
+ Raises
249
+ ------
250
+ RuntimeError
251
+ If there is an error during serialization or file writing.
245
252
 
246
253
  Examples
247
254
  --------
@@ -251,14 +258,22 @@ class SearchSet(Iterator[Search]):
251
258
  >>> json_str = searches.to_json()
252
259
  >>> isinstance(json_str, str)
253
260
  True
254
- >>> searches.to_json("searches.json") # The file 'searches.json' will contain both search queries in JSON format.
261
+ >>> searches.to_json(
262
+ ... "searches.json"
263
+ ... ) # The file 'searches.json' will contain both search queries in JSON format.
255
264
  """
256
- data = json_dumps(self.to_list())
257
- if path:
258
- with open(path, "w") as f:
259
- f.write(data)
260
- return None
261
- return data
265
+ try:
266
+ json_bytes = json_dumps(self.to_dicts())
267
+ if path:
268
+ try:
269
+ with open(path, "w") as f:
270
+ f.write(json_bytes)
271
+ return None
272
+ except Exception as e:
273
+ raise RuntimeError(f"Failed to write JSON to '{path}': {e}") from e
274
+ return json_bytes
275
+ except Exception as e:
276
+ raise RuntimeError(f"Failed to serialize results to JSON: {e}") from e
262
277
 
263
278
  @classmethod
264
279
  def from_json(cls, path: str) -> "SearchSet":
@@ -1,6 +1,9 @@
1
- from nosible.utils.json_tools import json_dumps
1
+ from dataclasses import asdict, dataclass, field
2
2
 
3
+ from nosible.utils.json_tools import json_dumps, print_dict
3
4
 
5
+
6
+ @dataclass(init=True, repr=True, eq=True, frozen=True)
4
7
  class Snippet:
5
8
  """
6
9
  A class representing a snippet of text, typically extracted from a web page.
@@ -25,6 +28,11 @@ class Snippet:
25
28
  Hash of the URL from which the snippet was extracted.
26
29
  words : str or None
27
30
  The words in the snippet.
31
+ links : list or None
32
+ List of links associated with the snippet.
33
+ companies : list or None
34
+ List of companies mentioned in the snippet.
35
+
28
36
 
29
37
  Examples
30
38
  --------
@@ -34,67 +42,28 @@ class Snippet:
34
42
 
35
43
  """
36
44
 
37
- def __init__(
38
- self,
39
- *,
40
- companies: list = None,
41
- content: str = None,
42
- images: list = None,
43
- language: str = None,
44
- next_snippet_hash: str = None,
45
- prev_snippet_hash: str = None,
46
- snippet_hash: str = None,
47
- statistics: dict = None,
48
- url_hash: str = None,
49
- words: str = None,
50
- ):
51
- """
52
- Initialize a Snippet instance.
53
-
54
- Parameters
55
- ----------
56
- companies : list, optional
57
- A list of companies mentioned in the snippet, if applicable. (GKIDS)
58
- content : str
59
- The text content of the snippet.
60
- images : list, optional
61
- List of image URLs associated with the snippet.
62
- language : str, optional
63
- The language of the snippet.
64
- snippet_hash : str, optional
65
- A unique hash for the snippet.
66
- statistics : dict, optional
67
- Statistical information about the snippet (e.g., word count).
68
- words : str, optional
69
- The words in the snippet.
70
-
71
- Examples
72
- --------
73
- >>> snippet = Snippet(content="Example snippet", language="en")
74
- >>> print(snippet.content)
75
- Example snippet
76
- """
77
- self.companies = companies or []
78
- self.content = content
79
- self.images = images
80
- self.language = language
81
- self.snippet_hash = snippet_hash
82
- self.statistics = statistics
83
- self.words = words
84
- self.url_hash = url_hash
85
- self.next_snippet_hash = next_snippet_hash
86
- self.prev_snippet_hash = prev_snippet_hash
87
-
88
- def __repr__(self):
89
- """
90
- Returns a string representation of the Snippet object.
91
-
92
- Returns
93
- -------
94
- str
95
- A string representation of the Snippet.
96
- """
97
- return f"Snippet(content={self.content[:30]}, language={self.language}, snippet_hash={self.snippet_hash})"
45
+ content: str = field(default=None, repr=True, compare=True)
46
+ """The text content of the snippet."""
47
+ images: list = field(default=None, repr=True, compare=False)
48
+ """List of image URLs associated with the snippet."""
49
+ language: str = field(default=None, repr=True, compare=False)
50
+ """The language of the snippet."""
51
+ next_snippet_hash: str = field(default=None, repr=True, compare=False)
52
+ """Hash of the next snippet in sequence."""
53
+ prev_snippet_hash: str = field(default=None, repr=True, compare=False)
54
+ """Hash of the previous snippet in sequence."""
55
+ snippet_hash: str = field(default=None, repr=True, compare=True)
56
+ """A unique hash for the snippet."""
57
+ statistics: dict = field(default=None, repr=False, compare=False)
58
+ """Statistical information about the snippet."""
59
+ url_hash: str = field(default=None, repr=True, compare=False)
60
+ """Hash of the URL from which the snippet was extracted."""
61
+ words: str = field(default=None, repr=False, compare=False)
62
+ """The words in the snippet."""
63
+ links: list = field(default=None, repr=False, compare=False)
64
+ """List of links associated with the snippet."""
65
+ companies: list = field(default=None, repr=False, compare=False)
66
+ """List of companies mentioned in the snippet."""
98
67
 
99
68
  def __str__(self):
100
69
  """
@@ -105,7 +74,7 @@ class Snippet:
105
74
  str
106
75
  A string representation of the Snippet.
107
76
  """
108
- return f"Snippet: {self.content}"
77
+ return print_dict(self.to_dict())
109
78
 
110
79
  def __getitem__(self, key: str):
111
80
  """
@@ -146,17 +115,31 @@ class Snippet:
146
115
  >>> isinstance(snippet_dict, dict)
147
116
  True
148
117
  """
149
- return {
150
- "content": self.content,
151
- "images": self.images,
152
- "language": self.language,
153
- "snippet_hash": self.snippet_hash,
154
- "statistics": self.statistics,
155
- "words": self.words,
156
- "url_hash": self.url_hash,
157
- "next_snippet_hash": self.next_snippet_hash,
158
- "prev_snippet_hash": self.prev_snippet_hash,
159
- }
118
+ return asdict(self, dict_factory=dict)
119
+
120
+ @classmethod
121
+ def from_dict(cls, data: dict) -> "Snippet":
122
+ """
123
+ Create a Snippet instance from a dictionary.
124
+
125
+ Parameters
126
+ ----------
127
+ data : dict
128
+ Dictionary containing snippet data.
129
+
130
+ Returns
131
+ -------
132
+ Snippet
133
+ An instance of Snippet populated with the provided data.
134
+
135
+ Examples
136
+ --------
137
+ >>> snippet_data = {"content": "Example snippet", "snippet_hash": "hash1"}
138
+ >>> snippet = Snippet.from_dict(snippet_data)
139
+ >>> isinstance(snippet, Snippet)
140
+ True
141
+ """
142
+ return cls(**data)
160
143
 
161
144
  def to_json(self) -> str:
162
145
  """