nosible 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nosible/classes/result.py +65 -106
- nosible/classes/result_set.py +119 -113
- nosible/classes/search.py +68 -85
- nosible/classes/search_set.py +27 -12
- nosible/classes/snippet.py +57 -74
- nosible/classes/snippet_set.py +62 -63
- nosible/classes/web_page.py +39 -103
- nosible/nosible_client.py +232 -227
- nosible/utils/json_tools.py +51 -2
- nosible/utils/question_builder.py +131 -0
- nosible/utils/rate_limiter.py +30 -24
- {nosible-0.1.7.dist-info → nosible-0.1.9.dist-info}/METADATA +9 -45
- nosible-0.1.9.dist-info/RECORD +17 -0
- nosible-0.1.7.dist-info/RECORD +0 -16
- {nosible-0.1.7.dist-info → nosible-0.1.9.dist-info}/WHEEL +0 -0
- {nosible-0.1.7.dist-info → nosible-0.1.9.dist-info}/licenses/LICENSE +0 -0
- {nosible-0.1.7.dist-info → nosible-0.1.9.dist-info}/top_level.txt +0 -0
nosible/classes/search_set.py
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
from collections.abc import Iterator
|
|
2
|
+
from dataclasses import dataclass, field
|
|
2
3
|
|
|
3
4
|
from nosible.classes.search import Search
|
|
4
5
|
from nosible.utils.json_tools import json_dumps, json_loads
|
|
5
6
|
|
|
6
7
|
|
|
8
|
+
@dataclass()
|
|
7
9
|
class SearchSet(Iterator[Search]):
|
|
8
10
|
"""
|
|
9
11
|
Manages an iterable collection of Search objects.
|
|
@@ -32,9 +34,10 @@ class SearchSet(Iterator[Search]):
|
|
|
32
34
|
What is AI?
|
|
33
35
|
"""
|
|
34
36
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
37
|
+
searches: list[Search] = field(default_factory=list)
|
|
38
|
+
""" A list of Search objects in the collection."""
|
|
39
|
+
_index: int = field(default=0, init=False, repr=False, compare=False)
|
|
40
|
+
""" Internal index for iteration over searches."""
|
|
38
41
|
|
|
39
42
|
def __iter__(self) -> "SearchSet":
|
|
40
43
|
"""
|
|
@@ -199,7 +202,7 @@ class SearchSet(Iterator[Search]):
|
|
|
199
202
|
"""
|
|
200
203
|
del self.searches[index]
|
|
201
204
|
|
|
202
|
-
def
|
|
205
|
+
def to_dicts(self) -> list[dict]:
|
|
203
206
|
"""
|
|
204
207
|
Convert all Search objects in the collection to a list of dictionaries.
|
|
205
208
|
|
|
@@ -219,7 +222,7 @@ class SearchSet(Iterator[Search]):
|
|
|
219
222
|
>>> s1 = Search(question="What is Python?", n_results=3)
|
|
220
223
|
>>> s2 = Search(question="What is PEP8?", n_results=2)
|
|
221
224
|
>>> searches = SearchSet([s1, s2])
|
|
222
|
-
>>> searches.
|
|
225
|
+
>>> searches.to_dicts()[1]["question"]
|
|
223
226
|
'What is PEP8?'
|
|
224
227
|
"""
|
|
225
228
|
return [s.to_dict() for s in self.searches]
|
|
@@ -242,6 +245,10 @@ class SearchSet(Iterator[Search]):
|
|
|
242
245
|
str
|
|
243
246
|
A JSON string representation of the SearchSet collection if no path is provided.
|
|
244
247
|
|
|
248
|
+
Raises
|
|
249
|
+
-------
|
|
250
|
+
RuntimeError
|
|
251
|
+
If there is an error during serialization or file writing.
|
|
245
252
|
|
|
246
253
|
Examples
|
|
247
254
|
--------
|
|
@@ -251,14 +258,22 @@ class SearchSet(Iterator[Search]):
|
|
|
251
258
|
>>> json_str = searches.to_json()
|
|
252
259
|
>>> isinstance(json_str, str)
|
|
253
260
|
True
|
|
254
|
-
>>> searches.to_json(
|
|
261
|
+
>>> searches.to_json(
|
|
262
|
+
... "searches.json"
|
|
263
|
+
... ) # The file 'searches.json' will contain both search queries in JSON format.
|
|
255
264
|
"""
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
265
|
+
try:
|
|
266
|
+
json_bytes = json_dumps(self.to_dicts())
|
|
267
|
+
if path:
|
|
268
|
+
try:
|
|
269
|
+
with open(path, "w") as f:
|
|
270
|
+
f.write(json_bytes)
|
|
271
|
+
return None
|
|
272
|
+
except Exception as e:
|
|
273
|
+
raise RuntimeError(f"Failed to write JSON to '{path}': {e}") from e
|
|
274
|
+
return json_bytes
|
|
275
|
+
except Exception as e:
|
|
276
|
+
raise RuntimeError(f"Failed to serialize results to JSON: {e}") from e
|
|
262
277
|
|
|
263
278
|
@classmethod
|
|
264
279
|
def from_json(cls, path: str) -> "SearchSet":
|
nosible/classes/snippet.py
CHANGED
|
@@ -1,6 +1,9 @@
|
|
|
1
|
-
from
|
|
1
|
+
from dataclasses import asdict, dataclass, field
|
|
2
2
|
|
|
3
|
+
from nosible.utils.json_tools import json_dumps, print_dict
|
|
3
4
|
|
|
5
|
+
|
|
6
|
+
@dataclass(init=True, repr=True, eq=True, frozen=True)
|
|
4
7
|
class Snippet:
|
|
5
8
|
"""
|
|
6
9
|
A class representing a snippet of text, typically extracted from a web page.
|
|
@@ -25,6 +28,11 @@ class Snippet:
|
|
|
25
28
|
Hash of the URL from which the snippet was extracted.
|
|
26
29
|
words : str or None
|
|
27
30
|
The words in the snippet.
|
|
31
|
+
links : list or None
|
|
32
|
+
List of links associated with the snippet.
|
|
33
|
+
companies : list or None
|
|
34
|
+
List of companies mentioned in the snippet.
|
|
35
|
+
|
|
28
36
|
|
|
29
37
|
Examples
|
|
30
38
|
--------
|
|
@@ -34,67 +42,28 @@ class Snippet:
|
|
|
34
42
|
|
|
35
43
|
"""
|
|
36
44
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
The text content of the snippet.
|
|
60
|
-
images : list, optional
|
|
61
|
-
List of image URLs associated with the snippet.
|
|
62
|
-
language : str, optional
|
|
63
|
-
The language of the snippet.
|
|
64
|
-
snippet_hash : str, optional
|
|
65
|
-
A unique hash for the snippet.
|
|
66
|
-
statistics : dict, optional
|
|
67
|
-
Statistical information about the snippet (e.g., word count).
|
|
68
|
-
words : str, optional
|
|
69
|
-
The words in the snippet.
|
|
70
|
-
|
|
71
|
-
Examples
|
|
72
|
-
--------
|
|
73
|
-
>>> snippet = Snippet(content="Example snippet", language="en")
|
|
74
|
-
>>> print(snippet.content)
|
|
75
|
-
Example snippet
|
|
76
|
-
"""
|
|
77
|
-
self.companies = companies or []
|
|
78
|
-
self.content = content
|
|
79
|
-
self.images = images
|
|
80
|
-
self.language = language
|
|
81
|
-
self.snippet_hash = snippet_hash
|
|
82
|
-
self.statistics = statistics
|
|
83
|
-
self.words = words
|
|
84
|
-
self.url_hash = url_hash
|
|
85
|
-
self.next_snippet_hash = next_snippet_hash
|
|
86
|
-
self.prev_snippet_hash = prev_snippet_hash
|
|
87
|
-
|
|
88
|
-
def __repr__(self):
|
|
89
|
-
"""
|
|
90
|
-
Returns a string representation of the Snippet object.
|
|
91
|
-
|
|
92
|
-
Returns
|
|
93
|
-
-------
|
|
94
|
-
str
|
|
95
|
-
A string representation of the Snippet.
|
|
96
|
-
"""
|
|
97
|
-
return f"Snippet(content={self.content[:30]}, language={self.language}, snippet_hash={self.snippet_hash})"
|
|
45
|
+
content: str = field(default=None, repr=True, compare=True)
|
|
46
|
+
"""The text content of the snippet."""
|
|
47
|
+
images: list = field(default=None, repr=True, compare=False)
|
|
48
|
+
"""List of image URLs associated with the snippet."""
|
|
49
|
+
language: str = field(default=None, repr=True, compare=False)
|
|
50
|
+
"""The language of the snippet."""
|
|
51
|
+
next_snippet_hash: str = field(default=None, repr=True, compare=False)
|
|
52
|
+
"""Hash of the next snippet in sequence."""
|
|
53
|
+
prev_snippet_hash: str = field(default=None, repr=True, compare=False)
|
|
54
|
+
"""Hash of the previous snippet in sequence."""
|
|
55
|
+
snippet_hash: str = field(default=None, repr=True, compare=True)
|
|
56
|
+
"""A unique hash for the snippet."""
|
|
57
|
+
statistics: dict = field(default=None, repr=False, compare=False)
|
|
58
|
+
"""Statistical information about the snippet."""
|
|
59
|
+
url_hash: str = field(default=None, repr=True, compare=False)
|
|
60
|
+
"""Hash of the URL from which the snippet was extracted."""
|
|
61
|
+
words: str = field(default=None, repr=False, compare=False)
|
|
62
|
+
"""The words in the snippet."""
|
|
63
|
+
links: list = field(default=None, repr=False, compare=False)
|
|
64
|
+
"""List of links associated with the snippet."""
|
|
65
|
+
companies: list = field(default=None, repr=False, compare=False)
|
|
66
|
+
"""List of companies mentioned in the snippet."""
|
|
98
67
|
|
|
99
68
|
def __str__(self):
|
|
100
69
|
"""
|
|
@@ -105,7 +74,7 @@ class Snippet:
|
|
|
105
74
|
str
|
|
106
75
|
A string representation of the Snippet.
|
|
107
76
|
"""
|
|
108
|
-
return
|
|
77
|
+
return print_dict(self.to_dict())
|
|
109
78
|
|
|
110
79
|
def __getitem__(self, key: str):
|
|
111
80
|
"""
|
|
@@ -146,17 +115,31 @@ class Snippet:
|
|
|
146
115
|
>>> isinstance(snippet_dict, dict)
|
|
147
116
|
True
|
|
148
117
|
"""
|
|
149
|
-
return
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
118
|
+
return asdict(self, dict_factory=dict)
|
|
119
|
+
|
|
120
|
+
@classmethod
|
|
121
|
+
def from_dict(cls, data: dict) -> "Snippet":
|
|
122
|
+
"""
|
|
123
|
+
Create a Snippet instance from a dictionary.
|
|
124
|
+
|
|
125
|
+
Parameters
|
|
126
|
+
----------
|
|
127
|
+
data : dict
|
|
128
|
+
Dictionary containing snippet data.
|
|
129
|
+
|
|
130
|
+
Returns
|
|
131
|
+
-------
|
|
132
|
+
Snippet
|
|
133
|
+
An instance of Snippet populated with the provided data.
|
|
134
|
+
|
|
135
|
+
Examples
|
|
136
|
+
--------
|
|
137
|
+
>>> snippet_data = {"content": "Example snippet", "snippet_hash": "hash1"}
|
|
138
|
+
>>> snippet = Snippet.from_dict(snippet_data)
|
|
139
|
+
>>> isinstance(snippet, Snippet)
|
|
140
|
+
True
|
|
141
|
+
"""
|
|
142
|
+
return cls(**data)
|
|
160
143
|
|
|
161
144
|
def to_json(self) -> str:
|
|
162
145
|
"""
|
nosible/classes/snippet_set.py
CHANGED
|
@@ -1,9 +1,13 @@
|
|
|
1
|
-
from
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterator
|
|
4
|
+
from dataclasses import dataclass, field
|
|
2
5
|
|
|
3
6
|
from nosible.classes.snippet import Snippet
|
|
4
7
|
from nosible.utils.json_tools import json_dumps
|
|
5
8
|
|
|
6
9
|
|
|
10
|
+
@dataclass()
|
|
7
11
|
class SnippetSet(Iterator[Snippet]):
|
|
8
12
|
"""
|
|
9
13
|
An iterator and container for a collection of Snippet objects.
|
|
@@ -19,64 +23,50 @@ class SnippetSet(Iterator[Snippet]):
|
|
|
19
23
|
Examples
|
|
20
24
|
--------
|
|
21
25
|
>>> snippets_data = {
|
|
22
|
-
... "hash1": {"content": "Example snippet", "snippet_hash": "hash1"}
|
|
26
|
+
... "hash1": {"content": "Example snippet", "snippet_hash": "hash1"},
|
|
27
|
+
... "hash2": {"content": "Another snippet", "snippet_hash": "hash2"},
|
|
23
28
|
... }
|
|
24
|
-
>>> snippets = SnippetSet(snippets_data)
|
|
29
|
+
>>> snippets = SnippetSet().from_dict(snippets_data)
|
|
25
30
|
>>> for snippet in snippets:
|
|
26
31
|
... print(snippet.content)
|
|
27
32
|
Example snippet
|
|
33
|
+
Another snippet
|
|
28
34
|
"""
|
|
29
35
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
language=value.get("language", ""),
|
|
40
|
-
next_snippet_hash=value.get("next_snippet_hash", ""),
|
|
41
|
-
prev_snippet_hash=value.get("prev_snippet_hash", ""),
|
|
42
|
-
snippet_hash=key,
|
|
43
|
-
statistics=value.get("statistics", {}),
|
|
44
|
-
url_hash=value.get("url_hash", ""),
|
|
45
|
-
words=value.get("words", ""),
|
|
46
|
-
)
|
|
47
|
-
)
|
|
48
|
-
|
|
49
|
-
self._index = 0
|
|
50
|
-
|
|
51
|
-
def __iter__(self):
|
|
52
|
-
"""
|
|
53
|
-
Initialize the iterator.
|
|
36
|
+
snippets: list[Snippet] = field(default_factory=list)
|
|
37
|
+
""" List of `Snippet` objects contained in this SnippetSet."""
|
|
38
|
+
_index: int = field(default=0, init=False, repr=False, compare=False)
|
|
39
|
+
""" Internal index for iteration over snippets."""
|
|
40
|
+
|
|
41
|
+
def __iter__(self) -> SnippetSet:
|
|
42
|
+
"""
|
|
43
|
+
Reset iteration and return self.
|
|
44
|
+
|
|
54
45
|
Returns
|
|
55
46
|
-------
|
|
56
|
-
|
|
57
|
-
|
|
47
|
+
SnippetSet
|
|
48
|
+
Iterator over the SnippetSet instance.
|
|
58
49
|
"""
|
|
59
|
-
self
|
|
50
|
+
object.__setattr__(self, "_index", 0)
|
|
60
51
|
return self
|
|
61
52
|
|
|
62
53
|
def __next__(self) -> Snippet:
|
|
63
54
|
"""
|
|
64
|
-
Returns the next
|
|
55
|
+
Returns the next Snippet in the sequence.
|
|
65
56
|
|
|
66
57
|
Returns
|
|
67
58
|
-------
|
|
68
|
-
|
|
69
|
-
The next
|
|
70
|
-
|
|
59
|
+
Snippet
|
|
60
|
+
The next Snippet object in the sequence.
|
|
71
61
|
Raises
|
|
72
62
|
------
|
|
73
63
|
StopIteration
|
|
74
|
-
If
|
|
64
|
+
If the end of the sequence is reached.
|
|
75
65
|
"""
|
|
76
|
-
if self._index < len(self.
|
|
77
|
-
|
|
78
|
-
self._index
|
|
79
|
-
return
|
|
66
|
+
if self._index < len(self.snippets):
|
|
67
|
+
item = self.snippets[self._index]
|
|
68
|
+
object.__setattr__(self, "_index", self._index + 1)
|
|
69
|
+
return item
|
|
80
70
|
raise StopIteration
|
|
81
71
|
|
|
82
72
|
def __len__(self) -> int:
|
|
@@ -88,7 +78,7 @@ class SnippetSet(Iterator[Snippet]):
|
|
|
88
78
|
int
|
|
89
79
|
The number of snippets.
|
|
90
80
|
"""
|
|
91
|
-
return len(self.
|
|
81
|
+
return len(self.snippets)
|
|
92
82
|
|
|
93
83
|
def __getitem__(self, index: int) -> Snippet:
|
|
94
84
|
"""
|
|
@@ -109,9 +99,9 @@ class SnippetSet(Iterator[Snippet]):
|
|
|
109
99
|
IndexError
|
|
110
100
|
If the index is out of range.
|
|
111
101
|
"""
|
|
112
|
-
if
|
|
113
|
-
|
|
114
|
-
|
|
102
|
+
if 0 <= index < len(self.snippets):
|
|
103
|
+
return self.snippets[index]
|
|
104
|
+
raise IndexError(f"Index {index} out of range for SnippetSet of length {len(self.snippets)}.")
|
|
115
105
|
|
|
116
106
|
def __str__(self):
|
|
117
107
|
"""
|
|
@@ -122,17 +112,6 @@ class SnippetSet(Iterator[Snippet]):
|
|
|
122
112
|
"""
|
|
123
113
|
return "\n".join(str(s) for s in self)
|
|
124
114
|
|
|
125
|
-
def __repr__(self):
|
|
126
|
-
"""
|
|
127
|
-
Returns a string representation of the SnippetSet object.
|
|
128
|
-
|
|
129
|
-
Returns
|
|
130
|
-
-------
|
|
131
|
-
str
|
|
132
|
-
A string representation of the SnippetSet.
|
|
133
|
-
"""
|
|
134
|
-
return f"SnippetSet(snippets={len(self._snippets)})"
|
|
135
|
-
|
|
136
115
|
def to_dict(self) -> dict:
|
|
137
116
|
"""
|
|
138
117
|
Convert the SnippetSet to a dictionary representation.
|
|
@@ -144,15 +123,13 @@ class SnippetSet(Iterator[Snippet]):
|
|
|
144
123
|
|
|
145
124
|
Examples
|
|
146
125
|
--------
|
|
147
|
-
>>> snippets_data = {
|
|
148
|
-
|
|
149
|
-
... }
|
|
150
|
-
>>> snippets = SnippetSet(snippets_data)
|
|
126
|
+
>>> snippets_data = {"hash1": {"content": "Example snippet", "snippet_hash": "hash1"}}
|
|
127
|
+
>>> snippets = SnippetSet().from_dict(snippets_data)
|
|
151
128
|
>>> snippets_dict = snippets.to_dict()
|
|
152
129
|
>>> isinstance(snippets_dict, dict)
|
|
153
130
|
True
|
|
154
131
|
"""
|
|
155
|
-
return {s.snippet_hash: s.to_dict() for s in self.
|
|
132
|
+
return {s.snippet_hash: s.to_dict() for s in self.snippets} if self.snippets else {}
|
|
156
133
|
|
|
157
134
|
def to_json(self) -> str:
|
|
158
135
|
"""
|
|
@@ -165,12 +142,34 @@ class SnippetSet(Iterator[Snippet]):
|
|
|
165
142
|
|
|
166
143
|
Examples
|
|
167
144
|
--------
|
|
168
|
-
>>> snippets_data = {
|
|
169
|
-
|
|
170
|
-
... }
|
|
171
|
-
>>> snippets = SnippetSet(snippets_data)
|
|
145
|
+
>>> snippets_data = {"hash1": {"content": "Example snippet", "snippet_hash": "hash1"}}
|
|
146
|
+
>>> snippets = SnippetSet().from_dict(snippets_data)
|
|
172
147
|
>>> json_str = snippets.to_json()
|
|
173
148
|
>>> isinstance(json_str, str)
|
|
174
149
|
True
|
|
175
150
|
"""
|
|
176
151
|
return json_dumps(self.to_dict())
|
|
152
|
+
|
|
153
|
+
@classmethod
|
|
154
|
+
def from_dict(cls, data: dict) -> SnippetSet:
|
|
155
|
+
"""
|
|
156
|
+
Create a SnippetSet instance from a dictionary.
|
|
157
|
+
|
|
158
|
+
Parameters
|
|
159
|
+
----------
|
|
160
|
+
data : dict
|
|
161
|
+
Dictionary containing snippet data.
|
|
162
|
+
|
|
163
|
+
Returns
|
|
164
|
+
-------
|
|
165
|
+
SnippetSet
|
|
166
|
+
An instance of SnippetSet populated with the provided data.
|
|
167
|
+
|
|
168
|
+
Examples
|
|
169
|
+
--------
|
|
170
|
+
>>> snippets_data = {"hash1": {"content": "Example snippet", "snippet_hash": "hash1"}}
|
|
171
|
+
>>> snippets = SnippetSet.from_dict(snippets_data)
|
|
172
|
+
>>> isinstance(snippets, SnippetSet)
|
|
173
|
+
True
|
|
174
|
+
"""
|
|
175
|
+
return cls([Snippet.from_dict(s) for s in data.values()])
|
nosible/classes/web_page.py
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
1
|
+
from dataclasses import asdict, dataclass, field
|
|
2
|
+
|
|
1
3
|
from nosible.classes.snippet_set import SnippetSet
|
|
2
4
|
from nosible.utils.json_tools import json_dumps, json_loads
|
|
3
5
|
|
|
4
6
|
|
|
7
|
+
@dataclass(init=True, repr=True, eq=True, frozen=True)
|
|
5
8
|
class WebPageData:
|
|
6
9
|
"""
|
|
7
10
|
A data container for all extracted and processed information about a web page.
|
|
@@ -36,64 +39,26 @@ class WebPageData:
|
|
|
36
39
|
{'description': 'Example'}
|
|
37
40
|
"""
|
|
38
41
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
A list of companies mentioned in the webpage, if applicable. (GKIDS)
|
|
60
|
-
full_text : str, optional
|
|
61
|
-
The full text content of the webpage.
|
|
62
|
-
languages : dict, optional
|
|
63
|
-
Detected languages and their probabilities or counts.
|
|
64
|
-
metadata : dict, optional
|
|
65
|
-
Metadata extracted from the webpage (e.g., description, keywords).
|
|
66
|
-
page : dict, optional
|
|
67
|
-
Page-specific details such as title, canonical URL, etc.
|
|
68
|
-
request : dict, optional
|
|
69
|
-
Information about the HTTP request/response.
|
|
70
|
-
snippets : list, optional
|
|
71
|
-
Extracted text snippets or highlights from the page.
|
|
72
|
-
statistics : dict, optional
|
|
73
|
-
Statistical information about the page (e.g., word count).
|
|
74
|
-
structured : list, optional
|
|
75
|
-
Structured data (e.g., schema.org, OpenGraph).
|
|
76
|
-
url_tree : dict, optional
|
|
77
|
-
Hierarchical representation of the URL structure.
|
|
78
|
-
|
|
79
|
-
Examples
|
|
80
|
-
--------
|
|
81
|
-
>>> data = WebPageData(full_text="Example Domain", languages={"en": 1})
|
|
82
|
-
>>> data.languages
|
|
83
|
-
{'en': 1}
|
|
84
|
-
"""
|
|
85
|
-
self.companies = companies or []
|
|
86
|
-
if snippets is None:
|
|
87
|
-
snippets = {}
|
|
88
|
-
self.full_text = full_text
|
|
89
|
-
self.languages = languages or {}
|
|
90
|
-
self.metadata = metadata or {}
|
|
91
|
-
self.page = page or {}
|
|
92
|
-
self.request = request or {}
|
|
93
|
-
self.snippets = SnippetSet(snippets)
|
|
94
|
-
self.statistics = statistics or {}
|
|
95
|
-
self.structured = structured or []
|
|
96
|
-
self.url_tree = url_tree or {}
|
|
42
|
+
companies: list = None
|
|
43
|
+
"""A list of companies mentioned in the webpage, if applicable. (GKIDS)"""
|
|
44
|
+
full_text: str = None
|
|
45
|
+
"""The full text content of the webpage."""
|
|
46
|
+
languages: dict = None
|
|
47
|
+
"""Detected languages and their probabilities or counts."""
|
|
48
|
+
metadata: dict = None
|
|
49
|
+
"""Metadata extracted from the webpage (e.g., description, keywords)."""
|
|
50
|
+
page: dict = None
|
|
51
|
+
"""Page-specific details such as title, canonical URL, etc."""
|
|
52
|
+
request: dict = None
|
|
53
|
+
"""Information about the HTTP request/response."""
|
|
54
|
+
snippets: SnippetSet = field(init=True, default_factory=SnippetSet)
|
|
55
|
+
"""Extracted text snippets or highlights from the page."""
|
|
56
|
+
statistics: dict = None
|
|
57
|
+
"""Statistical information about the page (e.g., word count)."""
|
|
58
|
+
structured: list = None
|
|
59
|
+
"""Structured data (e.g., schema.org, OpenGraph)."""
|
|
60
|
+
url_tree: dict = None
|
|
61
|
+
"""Hierarchical representation of the URL structure."""
|
|
97
62
|
|
|
98
63
|
def __str__(self):
|
|
99
64
|
"""Return a string representation of the WebPageData.
|
|
@@ -109,24 +74,6 @@ class WebPageData:
|
|
|
109
74
|
f"statistics={self.statistics}, structured={self.structured}, url_tree={self.url_tree})"
|
|
110
75
|
)
|
|
111
76
|
|
|
112
|
-
def __repr__(self):
|
|
113
|
-
"""
|
|
114
|
-
Return a JSON-formatted string representation of the WebPageData instance.
|
|
115
|
-
|
|
116
|
-
Returns
|
|
117
|
-
-------
|
|
118
|
-
str
|
|
119
|
-
JSON string representing the WebPageData for easy readability and debugging.
|
|
120
|
-
|
|
121
|
-
Examples
|
|
122
|
-
--------
|
|
123
|
-
>>> data = WebPageData(languages={"en": 1}, metadata={"description": "Example"})
|
|
124
|
-
>>> repr_str = repr(data)
|
|
125
|
-
>>> isinstance(repr_str, str)
|
|
126
|
-
True
|
|
127
|
-
"""
|
|
128
|
-
return json_dumps(self.to_dict())
|
|
129
|
-
|
|
130
77
|
def to_dict(self) -> dict:
|
|
131
78
|
"""
|
|
132
79
|
Convert the WebPageData instance to a dictionary.
|
|
@@ -145,18 +92,10 @@ class WebPageData:
|
|
|
145
92
|
>>> d["languages"] == {"en": 1}
|
|
146
93
|
True
|
|
147
94
|
"""
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
"metadata": self.metadata,
|
|
153
|
-
"page": self.page,
|
|
154
|
-
"request": self.request,
|
|
155
|
-
"snippets": self.snippets.to_dict(),
|
|
156
|
-
"statistics": self.statistics,
|
|
157
|
-
"structured": self.structured,
|
|
158
|
-
"url_tree": self.url_tree,
|
|
159
|
-
}
|
|
95
|
+
data = asdict(self)
|
|
96
|
+
# asdict() recurses into the nested SnippetSet dataclass, so replace that entry with the hash-keyed mapping from to_dict():
|
|
97
|
+
data["snippets"] = self.snippets.to_dict()
|
|
98
|
+
return data
|
|
160
99
|
|
|
161
100
|
def to_json(self) -> str:
|
|
162
101
|
"""
|
|
@@ -225,19 +164,12 @@ class WebPageData:
|
|
|
225
164
|
>>> webpage_data.languages
|
|
226
165
|
{'en': 1}
|
|
227
166
|
"""
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
page=parsed_data.get("page"),
|
|
235
|
-
request=parsed_data.get("request"),
|
|
236
|
-
snippets=parsed_data.get("snippets", {}),
|
|
237
|
-
statistics=parsed_data.get("statistics"),
|
|
238
|
-
structured=parsed_data.get("structured"),
|
|
239
|
-
url_tree=parsed_data.get("url_tree"),
|
|
240
|
-
)
|
|
167
|
+
data_dict = json_loads(data)
|
|
168
|
+
# Handle snippets separately to avoid passing it twice
|
|
169
|
+
snippets_data = data_dict.pop("snippets", None)
|
|
170
|
+
if snippets_data is not None:
|
|
171
|
+
data_dict["snippets"] = SnippetSet.from_dict(snippets_data)
|
|
172
|
+
return cls(**data_dict)
|
|
241
173
|
|
|
242
174
|
@classmethod
|
|
243
175
|
def load(cls, path: str) -> "WebPageData":
|
|
@@ -265,5 +197,9 @@ class WebPageData:
|
|
|
265
197
|
{'en': 1}
|
|
266
198
|
"""
|
|
267
199
|
with open(path, encoding="utf-8") as f:
|
|
268
|
-
data = f.read()
|
|
269
|
-
|
|
200
|
+
data = json_loads(f.read())
|
|
201
|
+
# Handle snippets separately to avoid passing it twice
|
|
202
|
+
snippets_data = data.pop("snippets", None)
|
|
203
|
+
if snippets_data is not None:
|
|
204
|
+
data["snippets"] = SnippetSet.from_dict(snippets_data)
|
|
205
|
+
return cls(**data)
|