scrapegoat-core 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ from .classes import Goat, HTMLNode, Condition, InCondition, IfCondition, Gardener, Interpeter, Command, Shepherd, Sheepdog, TokenType, Token, Tokenizer, Parser, ConditionParser, ScrapeSelectParser, ExtractParser, Milkmaid, Milkman, ChurnCommand, GrazeCommand, DeliverCommand, FetchCommand, VisitParser, FlagParser, GoatspeakBlock, Query, HeadlessSheepdog
2
+
3
# Names re-exported by the top-level package; mirrors the import above.
__all__ = [
    "Goat",
    "HTMLNode",
    "Condition",
    "InCondition",
    "IfCondition",
    "Gardener",
    "Interpeter",
    "Command",
    "Shepherd",
    "Sheepdog",
    "TokenType",
    "Token",
    "Tokenizer",
    "Parser",
    "ConditionParser",
    "ScrapeSelectParser",
    "ExtractParser",
    "Milkmaid",
    "Milkman",
    "ChurnCommand",
    "GrazeCommand",
    "DeliverCommand",
    "FetchCommand",
    "VisitParser",
    "FlagParser",
    "GoatspeakBlock",
    "Query",
    "HeadlessSheepdog",
]
@@ -0,0 +1,13 @@
1
+ from .goat import Goat
2
+ from .milkmaid import Milkmaid
3
+ from .milkman import Milkman
4
+ from .node import HTMLNode
5
+ from .conditions import Condition, InCondition, IfCondition
6
+ from .gardener import Gardener
7
+ from .interpreter import Interpeter, TokenType, Token, Tokenizer, Parser, ConditionParser, ScrapeSelectParser, ExtractParser, VisitParser, FlagParser
8
+ from .command import Command, GrazeCommand, ChurnCommand, DeliverCommand, FetchCommand
9
+ from .shepherd import Shepherd
10
+ from .sheepdog import Sheepdog, HeadlessSheepdog
11
+ from .block import GoatspeakBlock, Query
12
+
13
# Public names of the ``classes`` subpackage. Every entry must be bound by one
# of the imports above: a star-import raises AttributeError for any name in
# ``__all__`` that the module does not define, which is why "Loom" (never
# imported here) is not listed.
__all__ = [
    "Goat",
    "HTMLNode",
    "Condition",
    "InCondition",
    "IfCondition",
    "Gardener",
    "Interpeter",
    "Command",
    "GrazeCommand",
    "ChurnCommand",
    "DeliverCommand",
    "Shepherd",
    "Sheepdog",
    "TokenType",
    "Token",
    "Tokenizer",
    "Parser",
    "ConditionParser",
    "ScrapeSelectParser",
    "ExtractParser",
    "Milkmaid",
    "Milkman",
    "FetchCommand",
    "VisitParser",
    "FlagParser",
    "GoatspeakBlock",
    "Query",
    "HeadlessSheepdog",
]
@@ -0,0 +1,50 @@
1
+ """
2
+ """
3
+
4
class GoatspeakBlock:
    """Container holding one fetch command and a list of queries.

    Both values are stored exactly as passed in; nothing is validated or
    copied.
    """

    def __init__(self, fetch_command, query_list):
        """Store ``fetch_command`` and ``query_list`` on the instance."""
        self.query_list = query_list
        self.fetch_command = fetch_command

    def __repr__(self):
        """Return a debugging string showing both stored attributes."""
        fetch, queries = self.fetch_command, self.query_list
        return f"GoatspeakBlock(fetch_command={fetch}, query_list={queries})"

    def to_goat_file(self) -> None:
        """Placeholder: performs no work and returns ``None``."""
        return None
22
+
23
+
24
class Query:
    """Bundle of the commands that make up a single query.

    ``graze_commands`` is required; the fetch, churn and deliver commands are
    optional and default to ``None``.
    """

    def __init__(self, graze_commands, fetch_command=None, churn_command=None, deliver_command=None):
        """Store the four command references unchanged."""
        self.graze_commands = graze_commands
        self.fetch_command = fetch_command
        self.churn_command = churn_command
        self.deliver_command = deliver_command

    def __repr__(self):
        """Return a debugging string listing all four stored commands."""
        parts = (
            f"graze_commands={self.graze_commands}",
            f"fetch_command={self.fetch_command}",
            f"churn_command={self.churn_command}",
            f"deliver_command={self.deliver_command}",
        )
        return "Query(" + ", ".join(parts) + ")"
39
+
40
+
41
def main():
    """Script entry point placeholder; does nothing and returns ``None``."""
    return None


# Run the placeholder only when this module is executed as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,205 @@
1
+ """
2
+ """
3
+
4
+ from abc import ABC, abstractmethod
5
+ import os
6
+ import json
7
+ import csv
8
+ import requests
9
+
10
+ from .conditions import InCondition
11
+
12
+
13
class Command(ABC):
    """Abstract base for commands that carry an ``action`` name.

    Both ``__init__`` and ``execute`` are declared abstract, so a subclass
    has to define each of them before it can be instantiated.
    """

    @abstractmethod
    def __init__(self, action: str):
        """Record ``action`` on the instance (reached via ``super().__init__``)."""
        self.action = action

    @abstractmethod
    def execute(self, root) -> any:
        """Run the command against ``root``; the base body does nothing."""
        return None
27
+
28
+
29
class GrazeCommand(Command):
    """Command that gathers nodes of one tag type satisfying every condition.

    ``count`` caps the number of nodes returned by ``execute``; a value of
    zero or less means no cap.
    """

    def __init__(self, action: str, count: int, element: str, conditions: list=None, flags: list=None):
        """Store the command settings and back-fill POSITION conditions.

        Any ``InCondition`` targeting ``"POSITION"`` that has no ``query_tag``
        yet is mutated in place to use this command's ``element``.
        """
        super().__init__(action=action)
        self.count = count
        self.element = element
        self.conditions = conditions or []
        self.flags = flags or []

        for condition in self.conditions:
            needs_tag = (
                isinstance(condition, InCondition)
                and condition.target == "POSITION"
                and condition.query_tag is None
            )
            if needs_tag:
                condition.query_tag = self.element

    def _evaluate(self, node, root) -> bool:
        """Return True when ``node`` has the wanted tag and passes all conditions."""
        return node.tag_type == self.element and all(
            condition.evaluate(node, root) for condition in self.conditions
        )

    def execute(self, root) -> list:
        """Collect matching nodes from ``root.preorder_traversal()`` in order.

        Stops early once ``count`` matches have been gathered, when ``count``
        is positive.
        """
        found = []
        limit = self.count
        for candidate in root.preorder_traversal():
            if not self._evaluate(candidate, root):
                continue
            found.append(candidate)
            if 0 < limit <= len(found):
                break
        return found
62
+
63
+
64
class ChurnCommand(Command):
    """Command (action ``"extract"``) that hands extraction settings to a node."""

    def __init__(self, fields: list = None, ignore_children: bool = False, ignore_grandchildren: bool = False):
        """Remember the field list and the two ignore flags."""
        super().__init__(action="extract")
        self.ignore_grandchildren = ignore_grandchildren
        self.ignore_children = ignore_children
        self.fields = fields

    def execute(self, node) -> None:
        """Forward the stored settings to ``node.set_extract_instructions``."""
        settings = (self.fields, self.ignore_children, self.ignore_grandchildren)
        node.set_extract_instructions(*settings)
79
+
80
+
81
class DeliverCommand(Command):
    """Command (action ``"output"``) that writes nodes to a CSV or JSON file.

    The destination is ``os.path.join(filepath, filename)``. ``filepath``
    defaults to the current working directory and ``filename`` to
    ``output.<file_type>``; a filename without an extension gets
    ``.<file_type>`` appended.
    """
    VALID_TYPES = {"csv", "json"}

    def __init__(self, file_type: str, filepath: str = None, filename: str = None):
        """Resolve and store the output type, directory, filename and full path."""
        super().__init__(action="output")
        self.file_type = file_type
        self.filepath = filepath or os.getcwd()
        base, ext = os.path.splitext(filename or f"output.{file_type}")
        self.filename = base + (ext if ext else f".{file_type}")
        self.full_path = os.path.join(self.filepath, self.filename)

    def execute(self, nodes: list) -> str:
        """Write ``nodes`` to ``self.full_path`` and return that path.

        Raises:
            ValueError: if ``file_type`` (compared case-insensitively) is not
                one of ``VALID_TYPES``. Without this check an unsupported type
                would write nothing yet still return a path.
        """
        kind = self.file_type.lower()
        if kind not in self.VALID_TYPES:
            raise ValueError(
                f"Unsupported file type {self.file_type!r}; "
                f"expected one of {sorted(self.VALID_TYPES)}"
            )

        os.makedirs(self.filepath, exist_ok=True)

        if kind == "csv":
            self._to_csv(nodes)
        else:
            self._to_json(nodes)
        return self.full_path

    def _flatten_dict(self, d: dict, parent_key: str = '', sep: str = '.') -> dict:
        """Merge nested dicts of ``d`` into a single-level dict.

        NOTE(review): nested keys are NOT prefixed with their parent key, so
        ``parent_key`` only decides whether the key is stringified and ``sep``
        never affects the result. Keys with the same name at different depths
        overwrite each other. Kept as-is because existing CSV column names
        depend on it; confirm whether prefixing was intended.
        """
        items = {}
        for k, v in d.items():
            new_key = f"{k}" if parent_key else k
            if isinstance(v, dict):
                items.update(self._flatten_dict(v, new_key, sep=sep))
            else:
                items[new_key] = v
        return items

    def _collect_nodes(self, node_dict: dict, all_nodes: list) -> dict:
        """Append a flattened row for ``node_dict`` and each descendant to ``all_nodes``.

        Descendants are appended before their parent. When the input had a
        ``"children"`` key, the row's ``"children"`` column holds the list of
        child ``"id"`` values, or ``None`` if there are no children or none
        of them has an id.

        Returns:
            A shallow copy of ``node_dict`` with ``"children"`` removed (not
            the flattened row); the caller reads the child's ``"id"`` from it.
        """
        node_copy = node_dict.copy()

        had_children = "children" in node_copy
        children = node_copy.pop("children", [])
        child_ids = [
            self._collect_nodes(child, all_nodes).get("id") for child in children
        ]

        flattened = self._flatten_dict(node_copy)
        if had_children:
            if all(cid is None for cid in child_ids):
                # Covers both "no children" and "children without ids".
                flattened["children"] = None
            else:
                flattened["children"] = child_ids

        all_nodes.append(flattened)
        return node_copy

    def _to_csv(self, nodes: list) -> None:
        """Write one CSV row per node (including descendants) to ``self.full_path``.

        Each node is converted with ``node.to_dict()``. The header is the
        union of all row keys in first-seen order, so the column order is
        reproducible between runs.
        """
        all_nodes = []
        for node in nodes:
            self._collect_nodes(node.to_dict(), all_nodes)

        # dict.fromkeys de-duplicates while keeping insertion order; a set
        # would make the column order depend on string hash randomization.
        fieldnames = list(dict.fromkeys(key for row in all_nodes for key in row))

        os.makedirs(self.filepath, exist_ok=True)
        with open(self.full_path, mode='w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_nodes)

    def _to_json(self, nodes: list) -> None:
        """Dump ``[node.to_dict() for node in nodes]`` as indented JSON."""
        nodes_as_dicts = [node.to_dict() for node in nodes]
        with open(self.full_path, mode='w', encoding='utf-8') as jsonfile:
            json.dump(nodes_as_dicts, jsonfile, indent=4)
169
+
170
+
171
class FetchCommand(Command):
    """Command (action ``"visit"``) that retrieves a URL through a getter.

    The getter defaults to ``requests.get`` and can be swapped with
    ``set_getter``. Two fetch commands compare equal when their URLs match.
    """

    def __init__(self, url: str, **kwargs):
        """Store ``url`` and the extra keyword arguments passed to the getter."""
        super().__init__(action="visit")
        self.getter = requests.get
        self.url = url
        self.kwargs = kwargs

    def execute(self):
        """Call the getter with the URL and stored kwargs; return its result.

        The return type is whatever the configured getter produces.
        """
        return self.getter(self.url, **self.kwargs)

    def set_getter(self, getter):
        """Replace the callable used by ``execute`` to retrieve the URL."""
        self.getter = getter

    def __eq__(self, other):
        """Return True when ``other`` is a ``FetchCommand`` with the same URL."""
        return isinstance(other, FetchCommand) and self.url == other.url

    def __hash__(self):
        """Hash on the URL so equal commands hash alike.

        Defining ``__eq__`` alone sets ``__hash__`` to ``None`` and makes
        instances unusable in sets or as dict keys. The hash follows ``url``,
        so do not reassign ``url`` while the command is stored in a hashed
        container.
        """
        return hash(self.url)
196
+
197
+
198
def main():
    """Script entry point placeholder; does nothing and returns ``None``."""
    return None


# Run the placeholder only when this module is executed as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,87 @@
1
+ """
2
+ """
3
+
4
+ from abc import ABC, abstractmethod
5
+
6
+
7
class Condition(ABC):
    """Abstract predicate over a node, with optional negation.

    Subclasses implement ``matches``; callers use ``evaluate``, which applies
    the ``negated`` flag on top of it.
    """

    def __init__(self, negated: bool = False):
        """Remember whether the result of ``matches`` should be inverted."""
        self.negated = negated

    @abstractmethod
    def matches(self, node, root) -> bool:
        """Return whether ``node`` satisfies the condition (subclass hook)."""
        return None

    def evaluate(self, node, root) -> bool:
        """Return ``matches(node, root)``, logically inverted when negated."""
        outcome = self.matches(node, root)
        if self.negated:
            return not outcome
        return outcome
26
+
27
+
28
class IfCondition(Condition):
    """Condition on a key/value pair of a node, limited to one tag type.

    Keys starting with ``"@"`` are checked with ``node.has_html_attribute``;
    all other keys are checked with ``node.has_attribute``.
    """

    def __init__(self, key: str, value: str, negated: bool = False, query_tag: str = None):
        """Store the key/value pair, the negation flag and the required tag."""
        super().__init__(negated)
        self.key = key
        self.value = value
        self.query_tag = query_tag

    def matches(self, node, _) -> bool:
        """Return True when ``node`` has the key/value pair and the query tag.

        The second positional argument (the root) is unused.

        Raises:
            ValueError: if ``query_tag`` has not been set.
        """
        if self.query_tag is None:
            raise ValueError("query_tag is required for IF condition")
        # startswith() instead of key[0]: an empty key is handled as a plain
        # attribute lookup rather than raising IndexError.
        if self.key.startswith("@"):
            return node.has_html_attribute(self.key, self.value) and node.tag_type == self.query_tag
        return node.has_attribute(self.key, self.value) and node.tag_type == self.query_tag

    def __str__(self):
        """Return a readable summary of the condition's fields."""
        return f"IfCondition(key={self.key}, value={self.value}, negated={self.negated}, query_tag={self.query_tag})"
53
+
54
+
55
class InCondition(Condition):
    """Condition on where a node sits: by ordinal position or by ancestor.

    With ``target == "POSITION"`` the node must be the ``value``-th node
    (1-based, in ``root.preorder_traversal()`` order) whose tag equals
    ``query_tag``. Any other target is passed to ``node.is_descendant_of``.
    """

    def __init__(self, target: str, value=None, negated: bool = False, query_tag: str = None):
        """Store the target, the optional value, the negation flag and the tag."""
        super().__init__(negated)
        self.query_tag = query_tag
        self.value = value
        self.target = target

    def matches(self, node, root) -> bool:
        """Return whether ``node`` satisfies the positional or ancestor test.

        Raises:
            ValueError: for a POSITION target when ``root`` or ``query_tag``
                is falsy.
        """
        if self.target != "POSITION":
            return node.is_descendant_of(self.target)

        if not root:
            raise ValueError("Root node is required for POSITION condition")
        if not self.query_tag:
            raise ValueError("query_tag is required for POSITION condition")

        # Number only the nodes sharing the query tag, starting at 1.
        same_tag = (n for n in root.preorder_traversal() if n.tag_type == self.query_tag)
        for ordinal, candidate in enumerate(same_tag, start=1):
            if node == candidate:
                return ordinal == self.value
        return False

    def __str__(self):
        """Return a readable summary of the condition's fields."""
        return f"InCondition(target={self.target}, value={self.value}, negated={self.negated}, query_tag={self.query_tag})"
@@ -0,0 +1,145 @@
1
+ """
2
+ """
3
+
4
+ # IMPORTS
5
+ from html.parser import HTMLParser
6
+ from .node import HTMLNode
7
+
8
+
9
class Gardener(HTMLParser):
    """HTML parser that builds a tree of ``HTMLNode`` objects.

    ``stack`` holds the currently open elements (innermost last), ``root`` is
    the first element seen, and ``tag_counts`` counts start tags per tag name
    so each node can be given a positional retrieval instruction.
    """
    # Elements that never have an end tag; they are never pushed on the stack.
    VOID_TAGS = {"area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr"}
    # open tag -> set of start tags that implicitly close it.
    AUTO_CLOSE = {
        "li": {"li"},
        "p": {"address", "article", "aside", "blockquote", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr", "main", "nav", "ol", "p", "pre", "section", "table", "ul"},
        "dt": {"dt", "dd"},
        "dd": {"dt", "dd"},
        "tr": {"tr"},
        "td": {"td", "th"},
        "th": {"td", "th"}
    }
    # Tags whose text is also copied onto their parent's body.
    INLINE_TAGS = {"b", "i", "strong", "em", "u", "small", "mark", "sub", "sup", "a", "span", "img", "br", "code", "s", "q", "cite"}

    def __init__(self):
        """Initialise the underlying parser and empty tree-building state."""
        super().__init__()
        self.tag_counts = {}
        self.root = None
        self.stack = []

    def _innermost_open(self, default=None):
        """Return the last non-None node on the stack, or ``default``."""
        return next((n for n in reversed(self.stack) if n is not None), default)

    def _auto_close_before(self, new_tag: str):
        """Pop open elements that ``new_tag`` implicitly closes.

        Repeats while the innermost open element lists ``new_tag`` in
        ``AUTO_CLOSE``; stops at the first element that does not.
        """
        while self.stack:
            current_node = self._innermost_open()
            if current_node is None:
                break

            current_tag = current_node.tag_type
            if current_tag in self.AUTO_CLOSE and new_tag in self.AUTO_CLOSE[current_tag]:
                # Pop everything down to and including the element being closed.
                while self.stack:
                    popped = self.stack.pop()
                    if popped is current_node:
                        break
            else:
                break

    def handle_starttag(self, tag_type, html_attributes):
        """Create a node for a start tag and attach it to the tree.

        The first node seen becomes ``root``; later nodes are appended to the
        innermost open element (or to ``root`` when nothing is open). Non-void
        elements are pushed on the stack.
        """
        self._auto_close_before(tag_type)

        node = HTMLNode(raw=self.get_starttag_text(), tag_type=tag_type, html_attributes=dict(html_attributes))
        node.is_inline = tag_type in self.INLINE_TAGS

        # 1-based occurrence count of this tag, used as its retrieval position.
        self.tag_counts[tag_type] = self.tag_counts.get(tag_type, 0) + 1
        node.set_retrieval_instructions(f"SCRAPE 1 {tag_type} IN POSITION={self.tag_counts[tag_type]};")

        if self.root is None:
            self.root = node
        else:
            parent = self._innermost_open(self.root)
            parent.children.append(node)
            node.parent = parent

        if tag_type not in self.VOID_TAGS:
            self.stack.append(node)

    def handle_endtag(self, tag_type):
        """Close the innermost open element named ``tag_type``.

        Everything opened after it is discarded from the stack as well. An
        end tag with no matching open element is ignored.
        """
        for i in reversed(range(len(self.stack))):
            open_node = self.stack[i]
            if open_node is not None and open_node.tag_type == tag_type:
                del self.stack[i:]
                break

    def handle_data(self, data):
        """Append stripped text to the innermost open node's body.

        Whitespace-only data is skipped. Text inside an inline element is
        also appended to its parent's body. Text that arrives before any
        element exists has no node to attach to and is ignored.
        """
        stripped = data.strip()
        if not stripped:
            return

        current = self._innermost_open(self.root)
        if current is None:
            # No element has been opened yet (e.g. a BOM or stray text before
            # the first tag); previously this raised AttributeError.
            return

        # Add text to current node
        if current.body:
            current.body += " " + stripped
        else:
            current.body = stripped
        current.has_data = True

        # Bubble text up if inline
        if getattr(current, "is_inline", False) and current.parent is not None:
            if current.parent.body:
                current.parent.body += " " + stripped
            else:
                current.parent.body = stripped
            current.parent.has_data = True

    def _append_root_tag(self, raw_html: str) -> str:
        """Wrap ``raw_html`` in ``<html>`` and ``<body>`` tags when missing.

        Presence is detected by a case-insensitive substring search for
        ``<html`` / ``<body`` in the original input.

        NOTE(review): the body insertion replaces the literal ``<html>``, so
        an existing ``<html ...>`` tag with attributes (or in upper case) gets
        only the closing ``</body>`` added - confirm whether that matters.
        """
        html_lower = raw_html.lower()

        if "<html" not in html_lower:
            raw_html = f"<html>{raw_html}</html>"

        if "<body" not in html_lower:
            raw_html = raw_html.replace("<html>", "<html><body>", 1)
            raw_html = raw_html.replace("</html>", "</body></html>", 1)
        return raw_html

    def grow_tree(self, raw_html: str) -> "HTMLNode":
        """Parse ``raw_html`` from a clean state and return the root node."""
        self.root = None
        self.stack = []
        self.tag_counts = {}
        self.reset()

        wrapped_html = self._append_root_tag(raw_html)
        self.feed(wrapped_html)
        return self.root

    def get_root(self) -> "HTMLNode":
        """Return the root of the most recently built tree (``None`` if none)."""
        return self.root
136
+
137
+
138
def main():
    """Script entry point placeholder; does nothing and returns ``None``."""
    return None


# Run the placeholder only when this module is executed as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,28 @@
1
+ """
2
+ """
3
+
4
class Goat:
    """Runs a sequence of graze commands against a tree root."""

    def __init__(self):
        """No state is kept on the instance."""

    def feast(self, root, graze_commands) -> list:
        """Execute ``graze_commands`` in order and return the gathered results.

        Commands whose action (case-insensitively) is not ``"select"`` are
        executed on ``root`` and their results are appended. The first
        ``"select"`` command re-bases the search: each node it returns
        becomes the root for the remaining commands, processed recursively,
        and nothing after it runs against the original ``root``.
        """
        gathered = []
        for index, command in enumerate(graze_commands):
            if command.action.lower() != "select":
                gathered.extend(command.execute(root))
                continue

            remaining = graze_commands[index + 1:]
            for rebased_root in command.execute(root):
                gathered.extend(self.feast(rebased_root, remaining))
            break
        return gathered