nosible 0.1.8__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,13 @@
1
- from typing import Iterator
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Iterator
4
+ from dataclasses import dataclass, field
2
5
 
3
6
  from nosible.classes.snippet import Snippet
4
7
  from nosible.utils.json_tools import json_dumps
5
8
 
6
9
 
10
+ @dataclass()
7
11
  class SnippetSet(Iterator[Snippet]):
8
12
  """
9
13
  An iterator and container for a collection of Snippet objects.
@@ -19,64 +23,50 @@ class SnippetSet(Iterator[Snippet]):
19
23
  Examples
20
24
  --------
21
25
  >>> snippets_data = {
22
- ... "hash1": {"content": "Example snippet", "snippet_hash": "hash1"}
26
+ ... "hash1": {"content": "Example snippet", "snippet_hash": "hash1"},
27
+ ... "hash2": {"content": "Another snippet", "snippet_hash": "hash2"},
23
28
  ... }
24
- >>> snippets = SnippetSet(snippets_data)
29
+ >>> snippets = SnippetSet.from_dict(snippets_data)
25
30
  >>> for snippet in snippets:
26
31
  ... print(snippet.content)
27
32
  Example snippet
33
+ Another snippet
28
34
  """
29
35
 
30
- def __init__(self, snippets: dict) -> None:
31
- self._snippets = []
32
-
33
- for key, value in snippets.items():
34
- self._snippets.append(
35
- Snippet(
36
- companies=value.get("companies", []),
37
- content=value.get("content", ""),
38
- images=value.get("images", []),
39
- language=value.get("language", ""),
40
- next_snippet_hash=value.get("next_snippet_hash", ""),
41
- prev_snippet_hash=value.get("prev_snippet_hash", ""),
42
- snippet_hash=key,
43
- statistics=value.get("statistics", {}),
44
- url_hash=value.get("url_hash", ""),
45
- words=value.get("words", ""),
46
- )
47
- )
48
-
49
- self._index = 0
50
-
51
- def __iter__(self):
52
- """
53
- Initialize the iterator.
36
+ snippets: list[Snippet] = field(default_factory=list)
37
+ """ List of `Snippet` objects contained in this SnippetSet."""
38
+ _index: int = field(default=0, init=False, repr=False, compare=False)
39
+ """ Internal index for iteration over snippets."""
40
+
41
+ def __iter__(self) -> SnippetSet:
42
+ """
43
+ Reset iteration and return self.
44
+
54
45
  Returns
55
46
  -------
56
- SnippetSet
57
- The iterator itself.
47
+ SnippetSet
48
+ Iterator over the SnippetSet instance.
58
49
  """
59
- self._index = 0
50
+ object.__setattr__(self, "_index", 0)
60
51
  return self
61
52
 
62
53
  def __next__(self) -> Snippet:
63
54
  """
64
- Returns the next Snippet object from the collection.
55
+ Returns the next Snippet in the sequence.
65
56
 
66
57
  Returns
67
58
  -------
68
- Snippet
69
- The next snippet in the sequence.
70
-
59
+ Snippet
60
+ The next Snippet object in the sequence.
71
61
  Raises
72
62
  ------
73
63
  StopIteration
74
- If there are no more snippets to return.
64
+ If the end of the sequence is reached.
75
65
  """
76
- if self._index < len(self._snippets):
77
- snippet = self._snippets[self._index]
78
- self._index += 1
79
- return snippet
66
+ if self._index < len(self.snippets):
67
+ item = self.snippets[self._index]
68
+ object.__setattr__(self, "_index", self._index + 1)
69
+ return item
80
70
  raise StopIteration
81
71
 
82
72
  def __len__(self) -> int:
@@ -88,7 +78,7 @@ class SnippetSet(Iterator[Snippet]):
88
78
  int
89
79
  The number of snippets.
90
80
  """
91
- return len(self._snippets)
81
+ return len(self.snippets)
92
82
 
93
83
  def __getitem__(self, index: int) -> Snippet:
94
84
  """
@@ -109,9 +99,9 @@ class SnippetSet(Iterator[Snippet]):
109
99
  IndexError
110
100
  If the index is out of range.
111
101
  """
112
- if index < 0 or index >= len(self._snippets):
113
- raise IndexError("Index out of range.")
114
- return self._snippets[index]
102
+ if 0 <= index < len(self.snippets):
103
+ return self.snippets[index]
104
+ raise IndexError(f"Index {index} out of range for SnippetSet of length {len(self.snippets)}.")
115
105
 
116
106
  def __str__(self):
117
107
  """
@@ -122,17 +112,6 @@ class SnippetSet(Iterator[Snippet]):
122
112
  """
123
113
  return "\n".join(str(s) for s in self)
124
114
 
125
- def __repr__(self):
126
- """
127
- Returns a string representation of the SnippetSet object.
128
-
129
- Returns
130
- -------
131
- str
132
- A string representation of the SnippetSet.
133
- """
134
- return f"SnippetSet(snippets={len(self._snippets)})"
135
-
136
115
  def to_dict(self) -> dict:
137
116
  """
138
117
  Convert the SnippetSet to a dictionary representation.
@@ -144,15 +123,13 @@ class SnippetSet(Iterator[Snippet]):
144
123
 
145
124
  Examples
146
125
  --------
147
- >>> snippets_data = {
148
- ... "hash1": {"content": "Example snippet", "snippet_hash": "hash1"}
149
- ... }
150
- >>> snippets = SnippetSet(snippets_data)
126
+ >>> snippets_data = {"hash1": {"content": "Example snippet", "snippet_hash": "hash1"}}
127
+ >>> snippets = SnippetSet.from_dict(snippets_data)
151
128
  >>> snippets_dict = snippets.to_dict()
152
129
  >>> isinstance(snippets_dict, dict)
153
130
  True
154
131
  """
155
- return {s.snippet_hash: s.to_dict() for s in self._snippets} if self._snippets else {}
132
+ return {s.snippet_hash: s.to_dict() for s in self.snippets} if self.snippets else {}
156
133
 
157
134
  def to_json(self) -> str:
158
135
  """
@@ -165,12 +142,34 @@ class SnippetSet(Iterator[Snippet]):
165
142
 
166
143
  Examples
167
144
  --------
168
- >>> snippets_data = {
169
- ... "hash1": {"content": "Example snippet", "snippet_hash": "hash1"}
170
- ... }
171
- >>> snippets = SnippetSet(snippets_data)
145
+ >>> snippets_data = {"hash1": {"content": "Example snippet", "snippet_hash": "hash1"}}
146
+ >>> snippets = SnippetSet.from_dict(snippets_data)
172
147
  >>> json_str = snippets.to_json()
173
148
  >>> isinstance(json_str, str)
174
149
  True
175
150
  """
176
151
  return json_dumps(self.to_dict())
152
+
153
+ @classmethod
154
+ def from_dict(cls, data: dict) -> SnippetSet:
155
+ """
156
+ Create a SnippetSet instance from a dictionary.
157
+
158
+ Parameters
159
+ ----------
160
+ data : dict
161
+ Dictionary containing snippet data.
162
+
163
+ Returns
164
+ -------
165
+ SnippetSet
166
+ An instance of SnippetSet populated with the provided data.
167
+
168
+ Examples
169
+ --------
170
+ >>> snippets_data = {"hash1": {"content": "Example snippet", "snippet_hash": "hash1"}}
171
+ >>> snippets = SnippetSet.from_dict(snippets_data)
172
+ >>> isinstance(snippets, SnippetSet)
173
+ True
174
+ """
175
+ return cls([Snippet.from_dict(s) for s in data.values()])
@@ -1,7 +1,10 @@
1
+ from dataclasses import asdict, dataclass, field
2
+
1
3
  from nosible.classes.snippet_set import SnippetSet
2
4
  from nosible.utils.json_tools import json_dumps, json_loads
3
5
 
4
6
 
7
+ @dataclass(init=True, repr=True, eq=True, frozen=True)
5
8
  class WebPageData:
6
9
  """
7
10
  A data container for all extracted and processed information about a web page.
@@ -36,64 +39,26 @@ class WebPageData:
36
39
  {'description': 'Example'}
37
40
  """
38
41
 
39
- def __init__(
40
- self,
41
- *,
42
- companies: list = None,
43
- full_text: str = None,
44
- languages: dict = None,
45
- metadata: dict = None,
46
- page: dict = None,
47
- request: dict = None,
48
- snippets: dict = None,
49
- statistics: dict = None,
50
- structured: list = None,
51
- url_tree: dict = None,
52
- ):
53
- """
54
- Initialize a WebPageData instance.
55
-
56
- Parameters
57
- ----------
58
- companies : list, optional
59
- A list of companies mentioned in the webpage, if applicable. (GKIDS)
60
- full_text : str, optional
61
- The full text content of the webpage.
62
- languages : dict, optional
63
- Detected languages and their probabilities or counts.
64
- metadata : dict, optional
65
- Metadata extracted from the webpage (e.g., description, keywords).
66
- page : dict, optional
67
- Page-specific details such as title, canonical URL, etc.
68
- request : dict, optional
69
- Information about the HTTP request/response.
70
- snippets : list, optional
71
- Extracted text snippets or highlights from the page.
72
- statistics : dict, optional
73
- Statistical information about the page (e.g., word count).
74
- structured : list, optional
75
- Structured data (e.g., schema.org, OpenGraph).
76
- url_tree : dict, optional
77
- Hierarchical representation of the URL structure.
78
-
79
- Examples
80
- --------
81
- >>> data = WebPageData(full_text="Example Domain", languages={"en": 1})
82
- >>> data.languages
83
- {'en': 1}
84
- """
85
- self.companies = companies or []
86
- if snippets is None:
87
- snippets = {}
88
- self.full_text = full_text
89
- self.languages = languages or {}
90
- self.metadata = metadata or {}
91
- self.page = page or {}
92
- self.request = request or {}
93
- self.snippets = SnippetSet(snippets)
94
- self.statistics = statistics or {}
95
- self.structured = structured or []
96
- self.url_tree = url_tree or {}
42
+ companies: list = None
43
+ """A list of companies mentioned in the webpage, if applicable. (GKIDS)"""
44
+ full_text: str = None
45
+ """The full text content of the webpage."""
46
+ languages: dict = None
47
+ """Detected languages and their probabilities or counts."""
48
+ metadata: dict = None
49
+ """Metadata extracted from the webpage (e.g., description, keywords)."""
50
+ page: dict = None
51
+ """Page-specific details such as title, canonical URL, etc."""
52
+ request: dict = None
53
+ """Information about the HTTP request/response."""
54
+ snippets: SnippetSet = field(init=True, default_factory=SnippetSet)
55
+ """Extracted text snippets or highlights from the page."""
56
+ statistics: dict = None
57
+ """Statistical information about the page (e.g., word count)."""
58
+ structured: list = None
59
+ """Structured data (e.g., schema.org, OpenGraph)."""
60
+ url_tree: dict = None
61
+ """Hierarchical representation of the URL structure."""
97
62
 
98
63
  def __str__(self):
99
64
  """Return a string representation of the WebPageData.
@@ -109,24 +74,6 @@ class WebPageData:
109
74
  f"statistics={self.statistics}, structured={self.structured}, url_tree={self.url_tree})"
110
75
  )
111
76
 
112
- def __repr__(self):
113
- """
114
- Return a JSON-formatted string representation of the WebPageData instance.
115
-
116
- Returns
117
- -------
118
- str
119
- JSON string representing the WebPageData for easy readability and debugging.
120
-
121
- Examples
122
- --------
123
- >>> data = WebPageData(languages={"en": 1}, metadata={"description": "Example"})
124
- >>> repr_str = repr(data)
125
- >>> isinstance(repr_str, str)
126
- True
127
- """
128
- return json_dumps(self.to_dict())
129
-
130
77
  def to_dict(self) -> dict:
131
78
  """
132
79
  Convert the WebPageData instance to a dictionary.
@@ -145,18 +92,10 @@ class WebPageData:
145
92
  >>> d["languages"] == {"en": 1}
146
93
  True
147
94
  """
148
- return {
149
- "companies": self.companies,
150
- "full_text": self.full_text,
151
- "languages": self.languages,
152
- "metadata": self.metadata,
153
- "page": self.page,
154
- "request": self.request,
155
- "snippets": self.snippets.to_dict(),
156
- "statistics": self.statistics,
157
- "structured": self.structured,
158
- "url_tree": self.url_tree,
159
- }
95
+ data = asdict(self)
96
+ # asdict() recurses into the SnippetSet dataclass, so replace that entry with the hash-keyed mapping from SnippetSet.to_dict():
97
+ data["snippets"] = self.snippets.to_dict()
98
+ return data
160
99
 
161
100
  def to_json(self) -> str:
162
101
  """
@@ -225,19 +164,12 @@ class WebPageData:
225
164
  >>> webpage_data.languages
226
165
  {'en': 1}
227
166
  """
228
- parsed_data = json_loads(data)
229
- return cls(
230
- companies=parsed_data.get("companies", []),
231
- full_text=parsed_data.get("full_text"),
232
- languages=parsed_data.get("languages"),
233
- metadata=parsed_data.get("metadata"),
234
- page=parsed_data.get("page"),
235
- request=parsed_data.get("request"),
236
- snippets=parsed_data.get("snippets", {}),
237
- statistics=parsed_data.get("statistics"),
238
- structured=parsed_data.get("structured"),
239
- url_tree=parsed_data.get("url_tree"),
240
- )
167
+ data_dict = json_loads(data)
168
+ # Convert the raw snippets mapping into a SnippetSet before constructing the instance
169
+ snippets_data = data_dict.pop("snippets", None)
170
+ if snippets_data is not None:
171
+ data_dict["snippets"] = SnippetSet.from_dict(snippets_data)
172
+ return cls(**data_dict)
241
173
 
242
174
  @classmethod
243
175
  def load(cls, path: str) -> "WebPageData":
@@ -265,5 +197,9 @@ class WebPageData:
265
197
  {'en': 1}
266
198
  """
267
199
  with open(path, encoding="utf-8") as f:
268
- data = f.read()
269
- return cls.from_json(data)
200
+ data = json_loads(f.read())
201
+ # Convert the raw snippets mapping into a SnippetSet before constructing the instance
202
+ snippets_data = data.pop("snippets", None)
203
+ if snippets_data is not None:
204
+ data["snippets"] = SnippetSet.from_dict(snippets_data)
205
+ return cls(**data)