nosible 0.1.8__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nosible/classes/result.py +69 -106
- nosible/classes/result_set.py +121 -115
- nosible/classes/search.py +83 -88
- nosible/classes/search_set.py +27 -12
- nosible/classes/snippet.py +57 -74
- nosible/classes/snippet_set.py +62 -63
- nosible/classes/web_page.py +39 -103
- nosible/nosible_client.py +551 -234
- nosible/utils/json_tools.py +58 -8
- nosible/utils/question_builder.py +131 -0
- nosible/utils/rate_limiter.py +30 -24
- {nosible-0.1.8.dist-info → nosible-0.2.1.dist-info}/METADATA +27 -49
- nosible-0.2.1.dist-info/RECORD +17 -0
- nosible-0.1.8.dist-info/RECORD +0 -16
- {nosible-0.1.8.dist-info → nosible-0.2.1.dist-info}/WHEEL +0 -0
- {nosible-0.1.8.dist-info → nosible-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {nosible-0.1.8.dist-info → nosible-0.2.1.dist-info}/top_level.txt +0 -0
nosible/classes/snippet_set.py
CHANGED
|
@@ -1,9 +1,13 @@
|
|
|
1
|
-
from
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterator
|
|
4
|
+
from dataclasses import dataclass, field
|
|
2
5
|
|
|
3
6
|
from nosible.classes.snippet import Snippet
|
|
4
7
|
from nosible.utils.json_tools import json_dumps
|
|
5
8
|
|
|
6
9
|
|
|
10
|
+
@dataclass()
|
|
7
11
|
class SnippetSet(Iterator[Snippet]):
|
|
8
12
|
"""
|
|
9
13
|
An iterator and container for a collection of Snippet objects.
|
|
@@ -19,64 +23,50 @@ class SnippetSet(Iterator[Snippet]):
|
|
|
19
23
|
Examples
|
|
20
24
|
--------
|
|
21
25
|
>>> snippets_data = {
|
|
22
|
-
... "hash1": {"content": "Example snippet", "snippet_hash": "hash1"}
|
|
26
|
+
... "hash1": {"content": "Example snippet", "snippet_hash": "hash1"},
|
|
27
|
+
... "hash2": {"content": "Another snippet", "snippet_hash": "hash2"},
|
|
23
28
|
... }
|
|
24
|
-
>>> snippets = SnippetSet(snippets_data)
|
|
29
|
+
>>> snippets = SnippetSet().from_dict(snippets_data)
|
|
25
30
|
>>> for snippet in snippets:
|
|
26
31
|
... print(snippet.content)
|
|
27
32
|
Example snippet
|
|
33
|
+
Another snippet
|
|
28
34
|
"""
|
|
29
35
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
language=value.get("language", ""),
|
|
40
|
-
next_snippet_hash=value.get("next_snippet_hash", ""),
|
|
41
|
-
prev_snippet_hash=value.get("prev_snippet_hash", ""),
|
|
42
|
-
snippet_hash=key,
|
|
43
|
-
statistics=value.get("statistics", {}),
|
|
44
|
-
url_hash=value.get("url_hash", ""),
|
|
45
|
-
words=value.get("words", ""),
|
|
46
|
-
)
|
|
47
|
-
)
|
|
48
|
-
|
|
49
|
-
self._index = 0
|
|
50
|
-
|
|
51
|
-
def __iter__(self):
|
|
52
|
-
"""
|
|
53
|
-
Initialize the iterator.
|
|
36
|
+
snippets: list[Snippet] = field(default_factory=list)
|
|
37
|
+
""" List of `Snippet` objects contained in this SnippetSet."""
|
|
38
|
+
_index: int = field(default=0, init=False, repr=False, compare=False)
|
|
39
|
+
""" Internal index for iteration over snippets."""
|
|
40
|
+
|
|
41
|
+
def __iter__(self) -> SnippetSet:
|
|
42
|
+
"""
|
|
43
|
+
Reset iteration and return self.
|
|
44
|
+
|
|
54
45
|
Returns
|
|
55
46
|
-------
|
|
56
|
-
|
|
57
|
-
|
|
47
|
+
SnippetSet
|
|
48
|
+
Iterator over the SnippetSet instance.
|
|
58
49
|
"""
|
|
59
|
-
self
|
|
50
|
+
object.__setattr__(self, "_index", 0)
|
|
60
51
|
return self
|
|
61
52
|
|
|
62
53
|
def __next__(self) -> Snippet:
|
|
63
54
|
"""
|
|
64
|
-
Returns the next
|
|
55
|
+
Returns the next Snippet in the sequence.
|
|
65
56
|
|
|
66
57
|
Returns
|
|
67
58
|
-------
|
|
68
|
-
|
|
69
|
-
The next
|
|
70
|
-
|
|
59
|
+
Snippet
|
|
60
|
+
The next Snippet object in the sequence.
|
|
71
61
|
Raises
|
|
72
62
|
------
|
|
73
63
|
StopIteration
|
|
74
|
-
If
|
|
64
|
+
If the end of the sequence is reached.
|
|
75
65
|
"""
|
|
76
|
-
if self._index < len(self.
|
|
77
|
-
|
|
78
|
-
self._index
|
|
79
|
-
return
|
|
66
|
+
if self._index < len(self.snippets):
|
|
67
|
+
item = self.snippets[self._index]
|
|
68
|
+
object.__setattr__(self, "_index", self._index + 1)
|
|
69
|
+
return item
|
|
80
70
|
raise StopIteration
|
|
81
71
|
|
|
82
72
|
def __len__(self) -> int:
|
|
@@ -88,7 +78,7 @@ class SnippetSet(Iterator[Snippet]):
|
|
|
88
78
|
int
|
|
89
79
|
The number of snippets.
|
|
90
80
|
"""
|
|
91
|
-
return len(self.
|
|
81
|
+
return len(self.snippets)
|
|
92
82
|
|
|
93
83
|
def __getitem__(self, index: int) -> Snippet:
|
|
94
84
|
"""
|
|
@@ -109,9 +99,9 @@ class SnippetSet(Iterator[Snippet]):
|
|
|
109
99
|
IndexError
|
|
110
100
|
If the index is out of range.
|
|
111
101
|
"""
|
|
112
|
-
if
|
|
113
|
-
|
|
114
|
-
|
|
102
|
+
if 0 <= index < len(self.snippets):
|
|
103
|
+
return self.snippets[index]
|
|
104
|
+
raise IndexError(f"Index {index} out of range for SnippetSet of length {len(self.snippets)}.")
|
|
115
105
|
|
|
116
106
|
def __str__(self):
|
|
117
107
|
"""
|
|
@@ -122,17 +112,6 @@ class SnippetSet(Iterator[Snippet]):
|
|
|
122
112
|
"""
|
|
123
113
|
return "\n".join(str(s) for s in self)
|
|
124
114
|
|
|
125
|
-
def __repr__(self):
|
|
126
|
-
"""
|
|
127
|
-
Returns a string representation of the SnippetSet object.
|
|
128
|
-
|
|
129
|
-
Returns
|
|
130
|
-
-------
|
|
131
|
-
str
|
|
132
|
-
A string representation of the SnippetSet.
|
|
133
|
-
"""
|
|
134
|
-
return f"SnippetSet(snippets={len(self._snippets)})"
|
|
135
|
-
|
|
136
115
|
def to_dict(self) -> dict:
|
|
137
116
|
"""
|
|
138
117
|
Convert the SnippetSet to a dictionary representation.
|
|
@@ -144,15 +123,13 @@ class SnippetSet(Iterator[Snippet]):
|
|
|
144
123
|
|
|
145
124
|
Examples
|
|
146
125
|
--------
|
|
147
|
-
>>> snippets_data = {
|
|
148
|
-
|
|
149
|
-
... }
|
|
150
|
-
>>> snippets = SnippetSet(snippets_data)
|
|
126
|
+
>>> snippets_data = {"hash1": {"content": "Example snippet", "snippet_hash": "hash1"}}
|
|
127
|
+
>>> snippets = SnippetSet().from_dict(snippets_data)
|
|
151
128
|
>>> snippets_dict = snippets.to_dict()
|
|
152
129
|
>>> isinstance(snippets_dict, dict)
|
|
153
130
|
True
|
|
154
131
|
"""
|
|
155
|
-
return {s.snippet_hash: s.to_dict() for s in self.
|
|
132
|
+
return {s.snippet_hash: s.to_dict() for s in self.snippets} if self.snippets else {}
|
|
156
133
|
|
|
157
134
|
def to_json(self) -> str:
|
|
158
135
|
"""
|
|
@@ -165,12 +142,34 @@ class SnippetSet(Iterator[Snippet]):
|
|
|
165
142
|
|
|
166
143
|
Examples
|
|
167
144
|
--------
|
|
168
|
-
>>> snippets_data = {
|
|
169
|
-
|
|
170
|
-
... }
|
|
171
|
-
>>> snippets = SnippetSet(snippets_data)
|
|
145
|
+
>>> snippets_data = {"hash1": {"content": "Example snippet", "snippet_hash": "hash1"}}
|
|
146
|
+
>>> snippets = SnippetSet().from_dict(snippets_data)
|
|
172
147
|
>>> json_str = snippets.to_json()
|
|
173
148
|
>>> isinstance(json_str, str)
|
|
174
149
|
True
|
|
175
150
|
"""
|
|
176
151
|
return json_dumps(self.to_dict())
|
|
152
|
+
|
|
153
|
+
@classmethod
|
|
154
|
+
def from_dict(cls, data: dict) -> SnippetSet:
|
|
155
|
+
"""
|
|
156
|
+
Create a SnippetSet instance from a dictionary.
|
|
157
|
+
|
|
158
|
+
Parameters
|
|
159
|
+
----------
|
|
160
|
+
data : dict
|
|
161
|
+
Dictionary containing snippet data.
|
|
162
|
+
|
|
163
|
+
Returns
|
|
164
|
+
-------
|
|
165
|
+
SnippetSet
|
|
166
|
+
An instance of SnippetSet populated with the provided data.
|
|
167
|
+
|
|
168
|
+
Examples
|
|
169
|
+
--------
|
|
170
|
+
>>> snippets_data = {"hash1": {"content": "Example snippet", "snippet_hash": "hash1"}}
|
|
171
|
+
>>> snippets = SnippetSet.from_dict(snippets_data)
|
|
172
|
+
>>> isinstance(snippets, SnippetSet)
|
|
173
|
+
True
|
|
174
|
+
"""
|
|
175
|
+
return cls([Snippet.from_dict(s) for s in data.values()])
|
nosible/classes/web_page.py
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
1
|
+
from dataclasses import asdict, dataclass, field
|
|
2
|
+
|
|
1
3
|
from nosible.classes.snippet_set import SnippetSet
|
|
2
4
|
from nosible.utils.json_tools import json_dumps, json_loads
|
|
3
5
|
|
|
4
6
|
|
|
7
|
+
@dataclass(init=True, repr=True, eq=True, frozen=True)
|
|
5
8
|
class WebPageData:
|
|
6
9
|
"""
|
|
7
10
|
A data container for all extracted and processed information about a web page.
|
|
@@ -36,64 +39,26 @@ class WebPageData:
|
|
|
36
39
|
{'description': 'Example'}
|
|
37
40
|
"""
|
|
38
41
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
A list of companies mentioned in the webpage, if applicable. (GKIDS)
|
|
60
|
-
full_text : str, optional
|
|
61
|
-
The full text content of the webpage.
|
|
62
|
-
languages : dict, optional
|
|
63
|
-
Detected languages and their probabilities or counts.
|
|
64
|
-
metadata : dict, optional
|
|
65
|
-
Metadata extracted from the webpage (e.g., description, keywords).
|
|
66
|
-
page : dict, optional
|
|
67
|
-
Page-specific details such as title, canonical URL, etc.
|
|
68
|
-
request : dict, optional
|
|
69
|
-
Information about the HTTP request/response.
|
|
70
|
-
snippets : list, optional
|
|
71
|
-
Extracted text snippets or highlights from the page.
|
|
72
|
-
statistics : dict, optional
|
|
73
|
-
Statistical information about the page (e.g., word count).
|
|
74
|
-
structured : list, optional
|
|
75
|
-
Structured data (e.g., schema.org, OpenGraph).
|
|
76
|
-
url_tree : dict, optional
|
|
77
|
-
Hierarchical representation of the URL structure.
|
|
78
|
-
|
|
79
|
-
Examples
|
|
80
|
-
--------
|
|
81
|
-
>>> data = WebPageData(full_text="Example Domain", languages={"en": 1})
|
|
82
|
-
>>> data.languages
|
|
83
|
-
{'en': 1}
|
|
84
|
-
"""
|
|
85
|
-
self.companies = companies or []
|
|
86
|
-
if snippets is None:
|
|
87
|
-
snippets = {}
|
|
88
|
-
self.full_text = full_text
|
|
89
|
-
self.languages = languages or {}
|
|
90
|
-
self.metadata = metadata or {}
|
|
91
|
-
self.page = page or {}
|
|
92
|
-
self.request = request or {}
|
|
93
|
-
self.snippets = SnippetSet(snippets)
|
|
94
|
-
self.statistics = statistics or {}
|
|
95
|
-
self.structured = structured or []
|
|
96
|
-
self.url_tree = url_tree or {}
|
|
42
|
+
companies: list = None
|
|
43
|
+
"""A list of companies mentioned in the webpage, if applicable. (GKIDS)"""
|
|
44
|
+
full_text: str = None
|
|
45
|
+
"""The full text content of the webpage."""
|
|
46
|
+
languages: dict = None
|
|
47
|
+
"""Detected languages and their probabilities or counts."""
|
|
48
|
+
metadata: dict = None
|
|
49
|
+
"""Metadata extracted from the webpage (e.g., description, keywords)."""
|
|
50
|
+
page: dict = None
|
|
51
|
+
"""Page-specific details such as title, canonical URL, etc."""
|
|
52
|
+
request: dict = None
|
|
53
|
+
"""Information about the HTTP request/response."""
|
|
54
|
+
snippets: SnippetSet = field(init=True, default_factory=SnippetSet)
|
|
55
|
+
"""Extracted text snippets or highlights from the page."""
|
|
56
|
+
statistics: dict = None
|
|
57
|
+
"""Statistical information about the page (e.g., word count)."""
|
|
58
|
+
structured: list = None
|
|
59
|
+
"""Structured data (e.g., schema.org, OpenGraph)."""
|
|
60
|
+
url_tree: dict = None
|
|
61
|
+
"""Hierarchical representation of the URL structure."""
|
|
97
62
|
|
|
98
63
|
def __str__(self):
|
|
99
64
|
"""Return a string representation of the WebPageData.
|
|
@@ -109,24 +74,6 @@ class WebPageData:
|
|
|
109
74
|
f"statistics={self.statistics}, structured={self.structured}, url_tree={self.url_tree})"
|
|
110
75
|
)
|
|
111
76
|
|
|
112
|
-
def __repr__(self):
|
|
113
|
-
"""
|
|
114
|
-
Return a JSON-formatted string representation of the WebPageData instance.
|
|
115
|
-
|
|
116
|
-
Returns
|
|
117
|
-
-------
|
|
118
|
-
str
|
|
119
|
-
JSON string representing the WebPageData for easy readability and debugging.
|
|
120
|
-
|
|
121
|
-
Examples
|
|
122
|
-
--------
|
|
123
|
-
>>> data = WebPageData(languages={"en": 1}, metadata={"description": "Example"})
|
|
124
|
-
>>> repr_str = repr(data)
|
|
125
|
-
>>> isinstance(repr_str, str)
|
|
126
|
-
True
|
|
127
|
-
"""
|
|
128
|
-
return json_dumps(self.to_dict())
|
|
129
|
-
|
|
130
77
|
def to_dict(self) -> dict:
|
|
131
78
|
"""
|
|
132
79
|
Convert the WebPageData instance to a dictionary.
|
|
@@ -145,18 +92,10 @@ class WebPageData:
|
|
|
145
92
|
>>> d["languages"] == {"en": 1}
|
|
146
93
|
True
|
|
147
94
|
"""
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
"metadata": self.metadata,
|
|
153
|
-
"page": self.page,
|
|
154
|
-
"request": self.request,
|
|
155
|
-
"snippets": self.snippets.to_dict(),
|
|
156
|
-
"statistics": self.statistics,
|
|
157
|
-
"structured": self.structured,
|
|
158
|
-
"url_tree": self.url_tree,
|
|
159
|
-
}
|
|
95
|
+
data = asdict(self)
|
|
96
|
+
# snippets is still a SnippetSet instance, so convert it:
|
|
97
|
+
data["snippets"] = self.snippets.to_dict()
|
|
98
|
+
return data
|
|
160
99
|
|
|
161
100
|
def to_json(self) -> str:
|
|
162
101
|
"""
|
|
@@ -225,19 +164,12 @@ class WebPageData:
|
|
|
225
164
|
>>> webpage_data.languages
|
|
226
165
|
{'en': 1}
|
|
227
166
|
"""
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
page=parsed_data.get("page"),
|
|
235
|
-
request=parsed_data.get("request"),
|
|
236
|
-
snippets=parsed_data.get("snippets", {}),
|
|
237
|
-
statistics=parsed_data.get("statistics"),
|
|
238
|
-
structured=parsed_data.get("structured"),
|
|
239
|
-
url_tree=parsed_data.get("url_tree"),
|
|
240
|
-
)
|
|
167
|
+
data_dict = json_loads(data)
|
|
168
|
+
# Handle snippets separately to avoid passing it twice
|
|
169
|
+
snippets_data = data_dict.pop("snippets", None)
|
|
170
|
+
if snippets_data is not None:
|
|
171
|
+
data_dict["snippets"] = SnippetSet.from_dict(snippets_data)
|
|
172
|
+
return cls(**data_dict)
|
|
241
173
|
|
|
242
174
|
@classmethod
|
|
243
175
|
def load(cls, path: str) -> "WebPageData":
|
|
@@ -265,5 +197,9 @@ class WebPageData:
|
|
|
265
197
|
{'en': 1}
|
|
266
198
|
"""
|
|
267
199
|
with open(path, encoding="utf-8") as f:
|
|
268
|
-
data = f.read()
|
|
269
|
-
|
|
200
|
+
data = json_loads(f.read())
|
|
201
|
+
# Handle snippets separately to avoid passing it twice
|
|
202
|
+
snippets_data = data.pop("snippets", None)
|
|
203
|
+
if snippets_data is not None:
|
|
204
|
+
data["snippets"] = SnippetSet.from_dict(snippets_data)
|
|
205
|
+
return cls(**data)
|