scrapegoat-core 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,247 @@
1
+ """
2
+ """
3
+
4
+ import uuid
5
+
6
+
7
+ class HTMLNode:
8
+ """
9
+ """
10
+ VOID_TAGS = {"area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr"}
11
+
12
+ def __init__(self, raw: str, tag_type: str, has_data: bool = False, html_attributes: dict[str, any] = None, body: str = "", parent=None):
13
+ """
14
+ """
15
+ self.id = str(uuid.uuid4())
16
+ self.raw = raw
17
+ self.tag_type = tag_type
18
+ self.has_data = has_data
19
+ self.html_attributes = {"@"+k: v for k, v in (html_attributes or {}).items()}
20
+ self.body = body
21
+ self.children = []
22
+ self.retrieval_instructions = ""
23
+ self.parent = parent
24
+ self.extract_fields = None
25
+ self.extract_flags = {"ignore_children": False, "ignore_grandchildren": False}
26
+
27
+ def to_dict(self, ignore_children=False) -> str:
28
+ """
29
+ """
30
+ ignore_children = self.extract_flags["ignore_children"] or ignore_children
31
+ for child in self.children:
32
+ child.set_extract_instructions(fields=self.extract_fields, ignore_children=self.extract_flags["ignore_grandchildren"])
33
+ dict_representation = {}
34
+ if self.extract_fields:
35
+ for field in self.extract_fields:
36
+ if field[0] == "@":
37
+ dict_representation[field] = self.html_attributes.get(field, None)
38
+ else:
39
+ if field == "id":
40
+ dict_representation["id"] = self.id
41
+ elif field == "tag_type":
42
+ dict_representation["tag_type"] = self.tag_type
43
+ elif field == "has_data":
44
+ dict_representation["has_data"] = self.has_data
45
+ elif field == "html_attributes":
46
+ dict_representation["html_attributes"] = self.html_attributes
47
+ elif field == "body":
48
+ dict_representation["body"] = self.body
49
+ elif field == "children" and not ignore_children:
50
+ dict_representation["children"] = [child.to_dict() for child in self.children]
51
+ elif field == "retrieval_instructions":
52
+ dict_representation["retrieval_instructions"] = self.retrieval_instructions
53
+ elif field == "parent":
54
+ dict_representation["parent"] = self.parent.id if self.parent else None
55
+ elif field == "extract_fields":
56
+ dict_representation["extract_fields"] = self.extract_fields
57
+ elif field == "extract_flags":
58
+ dict_representation["extract_flags"] = self.extract_flags
59
+ return dict_representation
60
+ if ignore_children:
61
+ return {
62
+ "id": self.id,
63
+ "raw": self.raw,
64
+ "tag_type": self.tag_type,
65
+ "has_data": self.has_data,
66
+ "html_attributes": self.html_attributes,
67
+ "body": self.body,
68
+ "retrieval_instructions": self.retrieval_instructions,
69
+ "parent": self.parent.id if self.parent else None,
70
+ "extract_fields": self.extract_fields,
71
+ "extract_flags": self.extract_flags,
72
+ }
73
+ return {
74
+ "id": self.id,
75
+ "raw": self.raw,
76
+ "tag_type": self.tag_type,
77
+ "has_data": self.has_data,
78
+ "html_attributes": self.html_attributes,
79
+ "body": self.body,
80
+ "children": [child.to_dict() for child in self.children],
81
+ "retrieval_instructions": self.retrieval_instructions,
82
+ "parent": self.parent.id if self.parent else None,
83
+ "extract_fields": self.extract_fields,
84
+ "extract_flags": self.extract_flags,
85
+ }
86
+
87
+ def to_string(self) -> str:
88
+ """
89
+ """
90
+ return str(self.to_dict())
91
+
92
+ def __repr__(self):
93
+ """
94
+ """
95
+ return self.to_string()
96
+
97
+ def to_html(self, indent=0) -> str:
98
+ """
99
+ """
100
+ html_attribute_string = " ".join(f'{k}="{v}"' for k, v in self.html_attributes.items())
101
+ if html_attribute_string:
102
+ opening = f"<{self.tag_type} {html_attribute_string}"
103
+ else:
104
+ opening = f"<{self.tag_type}"
105
+
106
+ if self.tag_type in self.VOID_TAGS:
107
+ opening += " />"
108
+ else:
109
+ opening += ">"
110
+
111
+ text = f" {self.body}" if self.has_data else ""
112
+
113
+ pad = " " * indent
114
+ result = f"{pad}{opening}{text}\n"
115
+
116
+ for child in self.children:
117
+ result += child.to_html(indent + 1)
118
+
119
+ if self.tag_type not in self.VOID_TAGS:
120
+ result += f"{pad}</{self.tag_type}>\n"
121
+ return result
122
+
123
+ def __str__(self):
124
+ return self.to_string()
125
+
126
+ def get_parent(self):
127
+ """
128
+ """
129
+ return self.parent
130
+
131
+ def get_children(self):
132
+ """
133
+ """
134
+ return self.children
135
+
136
+ def get_ancestors(self):
137
+ """
138
+ """
139
+ ancestors = []
140
+ current = self.parent
141
+ while current:
142
+ ancestors.append(current)
143
+ current = current.parent
144
+ return ancestors
145
+
146
+ def get_descendants(self, tag_type: str = None, **html_attributes) -> list:
147
+ """
148
+ """
149
+ descendants = []
150
+ for child in self.children:
151
+ if (tag_type is None or child.tag_type == tag_type) and all(child.html_attributes.get(k) == v for k, v in html_attributes.items()):
152
+ descendants.append(child)
153
+ descendants.extend(child.get_descendants(tag_type, **html_attributes))
154
+ return descendants
155
+
156
+ def preorder_traversal(self):
157
+ """
158
+ """
159
+ yield self
160
+ for child in self.children:
161
+ yield from child.preorder_traversal()
162
+
163
+ def has_html_attribute(self, key, value=None) -> bool:
164
+ """
165
+ """
166
+ if value is None:
167
+ return key in self.html_attributes
168
+ if self.html_attributes.get(key) is None:
169
+ return False
170
+ return value in self.html_attributes.get(key)
171
+
172
+ def has_attribute(self, key, value=None) -> bool:
173
+ """
174
+ """
175
+ if key == "tag_type":
176
+ if value is None:
177
+ return self.tag_type is not None
178
+ return self.tag_type == value
179
+ if key == "id":
180
+ if value is None:
181
+ return self.id is not None
182
+ return str(self.id) == value
183
+ if key == "has_data":
184
+ if value is None:
185
+ return self.has_data
186
+ return self.has_data == value
187
+ if key == "body":
188
+ if value is None:
189
+ return self.body is not None
190
+ return self.body == value
191
+ if key == "retrieval_instructions":
192
+ if value is None:
193
+ return self.retrieval_instructions is not None
194
+ return self.retrieval_instructions == value
195
+ if key == "extract_fields":
196
+ if value is None:
197
+ return self.extract_fields is not None
198
+ return self.extract_fields == value
199
+ if key == "extract_flags":
200
+ if value is None:
201
+ return self.extract_flags is not None
202
+ return self.extract_flags == value
203
+ if key == "parent":
204
+ if value is None:
205
+ return self.parent is not None
206
+ return self.parent and str(self.parent.id) == value
207
+ if key == "children":
208
+ if value is None:
209
+ return len(self.children) > 0
210
+ return any(str(child.id) == value for child in self.children)
211
+ if key == "raw":
212
+ if value is None:
213
+ return self.raw is not None
214
+ return self.raw == value
215
+ return False
216
+
217
+ def is_descendant_of(self, tag_type) -> bool:
218
+ """
219
+ """
220
+ return any(ancestor.tag_type == tag_type for ancestor in self.get_ancestors())
221
+
222
+ def set_retrieval_instructions(self, instruction: str):
223
+ """
224
+ """
225
+ self.retrieval_instructions = instruction
226
+
227
+ def set_extract_instructions(self, fields: list=None, ignore_children=False, ignore_grandchildren=False):
228
+ """
229
+ """
230
+ self.extract_fields = fields or None
231
+ self.extract_flags = {"ignore_children": ignore_children, "ignore_grandchildren": ignore_grandchildren}
232
+
233
+ def clear_extract_instructions(self):
234
+ """
235
+ """
236
+ self.extract_fields = None
237
+ self.extract_flags = None
238
+
239
+
240
def main() -> None:
    """Module entry point; currently a no-op placeholder."""


if __name__ == "__main__":
    main()
@@ -0,0 +1,81 @@
1
+ """
2
+ """
3
+
4
+ from typing import Union
5
+ import requests
6
+
7
+ from .command import FetchCommand
8
+
9
+
10
class Sheepdog:
    """Retrieves raw HTML for a URL via an injectable getter callable."""

    # Browser-like request headers used when the caller supplies none.
    DEFAULT_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Scrapegoat)",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Accept": "*/*",
        "DNT": "1",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
    }

    def __init__(self, getter=None):
        """Store *getter* as the fetch callable; defaults to this class's
        own :meth:`getter` (bound method) when none is supplied."""
        self.getter = getter or self.getter

    def fetch(self, fetch_command: Union[str, FetchCommand]) -> str:
        """Execute *fetch_command* (a ``FetchCommand`` or plain URL string)
        using this instance's getter and return the page HTML."""
        if isinstance(fetch_command, FetchCommand):
            command = fetch_command
        else:
            # Plain strings are wrapped so execution is uniform.
            command = FetchCommand(fetch_command)
        command.set_getter(self.getter)
        return command.execute()

    def getter(self, url: str, **kwargs) -> str:
        """Default HTTP getter: GET *url* with requests and return the body.

        Raises:
            requests.HTTPError: On a non-2xx response status.
        """
        headers = kwargs.pop('headers', self.DEFAULT_HEADERS)
        response = requests.get(url, headers=headers, **kwargs)
        response.raise_for_status()
        return response.text
45
+
46
+
47
class HeadlessSheepdog(Sheepdog):
    """Sheepdog variant that renders pages in a headless Chromium browser,
    so JavaScript-generated content is present in the returned HTML."""

    def __init__(self, getter=None):
        """Same contract as :class:`Sheepdog`; defaults to the headless getter."""
        super().__init__(getter)

    def getter(self, url: str, **kwargs):
        """Fetch *url* with Playwright's headless Chromium and return the
        rendered page content.

        Raises:
            RuntimeError: If Playwright (or its browser binaries) is not
                installed.
        """
        try:
            from playwright.sync_api import sync_playwright
        except ImportError:
            raise RuntimeError("Playwright is not installed. Please install it with 'pip install playwright'")

        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)
                try:
                    page = browser.new_page()
                    page.goto(url, wait_until="domcontentloaded")
                    return page.content()
                finally:
                    # BUG FIX: the browser was previously left open.
                    browser.close()
        except Exception as e:
            if "Executable doesn't exist" in str(e):
                raise RuntimeError("Playwright browser executables are not installed. Please run 'playwright install' to install them.") from e
            # BUG FIX: any other failure was silently swallowed, making the
            # getter return None.  Re-raise so callers see the real error.
            raise
72
+
73
+
74
def main() -> None:
    """Module entry point; currently a no-op placeholder."""


if __name__ == "__main__":
    main()
@@ -0,0 +1,108 @@
1
+ """
2
+ """
3
+
4
+ import os
5
+
6
+ from .gardener import Gardener
7
+ from .goat import Goat
8
+ from .interpreter import Interpeter
9
+ from .milkmaid import Milkmaid
10
+ from .milkman import Milkman
11
+ from .sheepdog import Sheepdog
12
+
13
+
14
class Shepherd:
    """Orchestrates a full scrape: interprets a query into goatspeak,
    fetches pages, grows HTML trees, grazes them for results, and
    optionally churns/delivers those results."""

    def __init__(self, gardener=None, sheepdog=None, goat=None, milkmaid=None, milkman=None):
        """Wire up collaborators, creating defaults for any not supplied."""
        self.gardener = gardener if gardener else Gardener()
        self.interpreter = Interpeter()
        self.sheepdog = sheepdog if sheepdog else Sheepdog()
        self.goat = goat if goat else Goat()
        self.milkmaid = milkmaid if milkmaid else Milkmaid()
        self.milkman = milkman if milkman else Milkman()

    def herd(self, query: str) -> list:
        """Execute *query* (goatspeak text or a path to a .goat file),
        fetching each block's page and returning de-duplicated results."""
        goatspeak = self._convert_query_to_goatspeak(query)

        results = []

        for block in goatspeak:
            html = self.sheepdog.fetch(block.fetch_command)
            root = self.gardener.grow_tree(html)
            self._query_list_handler(block.query_list, root, results)

        # dict.fromkeys de-duplicates while preserving first-seen order.
        return list(dict.fromkeys(results))

    def _convert_query_to_goatspeak(self, query: str) -> list:
        """Interpret *query* into a list of goatspeak blocks.

        If *query* names an existing file, its contents are loaded via the
        milkman first; otherwise the string is interpreted directly.

        BUG FIX: the return annotation previously claimed ``None`` even
        though the interpreter's result is returned.  The redundant
        ``except Exception as e: raise e`` wrappers (which only obscured
        tracebacks) have been removed; exceptions propagate naturally.
        """
        if os.path.isfile(query):
            return self.interpreter.interpret(self.milkman.receive(query))
        return self.interpreter.interpret(query)

    def _query_list_handler(self, query_list: list, root, results) -> None:
        """Run every query in *query_list* against *root*, extending
        *results* in place.

        BUG FIX: annotations previously claimed ``query_list: str`` and a
        ``list`` return; this method takes a list and returns None.
        """
        for query in query_list:
            query_results = self.goat.feast(root, query.graze_commands)
            if query.churn_command:
                self.milkmaid.churn(query_results, query.churn_command)

            results.extend(query_results)

            if query.deliver_command:
                # Delivered results are flushed so they aren't re-delivered
                # (or returned) by a later query in the same block.
                self.milkman.deliver(results, query.deliver_command)
                results.clear()

    def _local_herd(self, query: str, root) -> list:
        """Like :meth:`herd`, but run against an already-built tree *root*
        instead of fetching anything."""
        goatspeak = self._convert_query_to_goatspeak(query)

        results = []

        for block in goatspeak:
            self._query_list_handler(block.query_list, root, results)

        return list(dict.fromkeys(results))

    def herd_from_node(self, query: str, root) -> list:
        """Run *query* against an existing HTML node tree."""
        return self._local_herd(query, root=root)

    def herd_from_html(self, query: str, html: str) -> list:
        """Parse *html* into a tree and run *query* against it."""
        root = self.gardener.grow_tree(html)
        return self._local_herd(query, root=root)

    def herd_from_url(self, query: str, url: str) -> list:
        """Fetch *url*, parse it, and run *query* against the result."""
        html = self.sheepdog.fetch(url)
        root = self.gardener.grow_tree(html)
        return self._local_herd(query, root=root)
100
+
101
def main() -> None:
    """Module entry point; currently a no-op placeholder."""


if __name__ == "__main__":
    main()
scrapegoat_core/cli.py ADDED
@@ -0,0 +1,38 @@
1
+ import argparse
2
+ from scrapegoat_core import Shepherd, HeadlessSheepdog
3
+
4
+
5
def main():
    """CLI entry point: execute a .goat file or a raw query string.

    Flags:
        -v/--verbose: print each resulting node to stdout.
        -j/--javascript: render pages with a headless browser first.
    """
    parser = argparse.ArgumentParser(description="Scrapegoat language executor")

    # Positional file or query arg
    parser.add_argument(
        "file_or_query",
        nargs="?",
        help="Path to a .goat file or a raw query as a string",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        help="Prints the results of the query to the console",
        action="store_true",
    )
    parser.add_argument(
        "-j",
        "--javascript",
        help="Uses a headless browser to support javascript rendered pages",
        action="store_true",
    )

    args = parser.parse_args()

    # BUG FIX: the positional argument is optional (nargs="?"), but it was
    # previously passed to herd() unchecked, crashing on None deep inside
    # the shepherd.  Fail early with a proper usage error instead.
    if args.file_or_query is None:
        parser.error("a .goat file path or a query string is required")

    if args.javascript:
        shepherd = Shepherd(sheepdog=HeadlessSheepdog())
    else:
        shepherd = Shepherd()

    nodes = shepherd.herd(args.file_or_query)

    if args.verbose:
        for node in nodes:
            print(node)
File without changes
File without changes
@@ -0,0 +1,22 @@
1
+ Metadata-Version: 2.4
2
+ Name: scrapegoat-core
3
+ Version: 1.2.0
4
+ Summary: A toolkit of functions, classes, and utilities for creating web scrapers.
5
+ Author-email: Arman Chinai <chinaiarman@gmail.com>, Lucas Angelozzi <lucasangelozzi32@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/ChinaiArman/scrapegoat
8
+ Project-URL: Documentation, https://github.com/ChinaiArman/scrapegoat/blob/main/README.md
9
+ Project-URL: Source, https://github.com/ChinaiArman/scrapegoat
10
+ Project-URL: Issues, https://github.com/ChinaiArman/scrapegoat/issues
11
+ Keywords: scraping,webscraping,automation,html,parsing
12
+ Requires-Python: >=3.12
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: requests>=2.31.0
16
+ Provides-Extra: loom
17
+ Requires-Dist: scrapegoat-loom>=1.2.0; extra == "loom"
18
+ Provides-Extra: js
19
+ Requires-Dist: playwright>=1.56.0; extra == "js"
20
+ Dynamic: license-file
21
+
22
+ # Scrapegoat SDK
@@ -0,0 +1,22 @@
1
+ scrapegoat_core/__init__.py,sha256=rdVu3-8fAdVq_kxRqx-XRU4EQgLljk6zw00YksCSn2s,761
2
+ scrapegoat_core/cli.py,sha256=MQtNOlN5waUgUlGLaDp1gG5lxeCvmIa_v7ydIrLSyjQ,986
3
+ scrapegoat_core/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ scrapegoat_core/classes/__init__.py,sha256=r6-e-57CW9DwSxsMzZnE4ujpv9xymQqwad_0_tnTG7A,982
5
+ scrapegoat_core/classes/block.py,sha256=DAtYQSp3_rg6_PVDZHxxsq7dNiadaPZzwS1P2VUnn2U,1131
6
+ scrapegoat_core/classes/command.py,sha256=7EnybkN5SxDV98J9KDZ93DNmC7Kz8PkxjNSqvpXaLwM,5726
7
+ scrapegoat_core/classes/conditions.py,sha256=YxPqHsUhyk9vdM1nv95ErWbq0FhHbfomQC64F4fkVUU,2541
8
+ scrapegoat_core/classes/gardener.py,sha256=jSOI2Mt8s_tVsRG8qE694v2qdwSqqtUQOlcPnd0tXZY,4373
9
+ scrapegoat_core/classes/goat.py,sha256=ERBfSR7mK9IFYR4i0m73gubHU8nlj54wg2FzC8K3HOc,778
10
+ scrapegoat_core/classes/interpreter.py,sha256=jWIL_ZG2ssc6oLfOlYQJO3V6GKHQbdmXfJtY85wp1gU,15161
11
+ scrapegoat_core/classes/milkmaid.py,sha256=idjDLTicQ5XDfXjR-D8qVCEDDdsn-_XWalKsNYgLVA8,376
12
+ scrapegoat_core/classes/milkman.py,sha256=dmC0zBXYr6LKtXZszVjWOGlWFUMqdoZp6Ab_xOSyXdA,510
13
+ scrapegoat_core/classes/node.py,sha256=PzkneKBAuRlQ8Mm7xkLtw7sryhIv-6ng0BnP4BE9NRY,8919
14
+ scrapegoat_core/classes/sheepdog.py,sha256=ni6Tdsf4dQbNWe69r03tlxCmUzLWHnRWoXZ6rcQuoCM,2219
15
+ scrapegoat_core/classes/shepherd.py,sha256=jEEj5kGSAKvwfeU9mmr3LRSiK6GGBSm_RIwsI1AzBXk,3084
16
+ scrapegoat_core/exceptions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
+ scrapegoat_core-1.2.0.dist-info/licenses/LICENSE,sha256=vXZ_divzwGQpeaVHa8qH-DADn9CG-XKN_j5sBUio1B0,1090
18
+ scrapegoat_core-1.2.0.dist-info/METADATA,sha256=zruVc6uG60UsC-ylbKiYaU6SFXyO6S4SmztjJcHcrk8,919
19
+ scrapegoat_core-1.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
20
+ scrapegoat_core-1.2.0.dist-info/entry_points.txt,sha256=2iISE8Mxs-d-RBU6pn8DjQs-vJ_l1OmfkvQM2VMw4fk,56
21
+ scrapegoat_core-1.2.0.dist-info/top_level.txt,sha256=qP4DpWo4p3IWss3TcRlM10DCTMsHtN--W_KZqyg8TEM,16
22
+ scrapegoat_core-1.2.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ scrapegoat = scrapegoat_core.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Arman Chinai
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ scrapegoat_core