scrapegoat-core 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,457 @@
1
+ """
2
+ """
3
+
4
+ # IMPORTS
5
+ import re
6
+ from enum import Enum, auto
7
+ from abc import ABC, abstractmethod
8
+
9
+ from .command import GrazeCommand, ChurnCommand, DeliverCommand, FetchCommand
10
+ from .conditions import InCondition, IfCondition
11
+ from .block import GoatspeakBlock, Query
12
+
13
+
14
class TokenType(Enum):
    """Categories a piece of Goatspeak query text can be classified into.

    The numeric values are spelled out explicitly; they are the same
    consecutive integers, starting at 1, that ``auto()`` would assign.
    """

    ACTION = 1        # statement verbs such as "select" or "visit"
    CONDITIONAL = 2   # "if" / "in"
    KEYWORD = 3       # reserved words such as "position"
    OPERATOR = 4      # comparison operators
    NUMBER = 5        # runs of digits
    IDENTIFIER = 6    # names and quoted strings
    FILE_TYPE = 7     # output formats such as "json" or "csv"
    NEGATION = 8      # "not"
    FLAG = 9          # "--name" style options
    SEMICOLON = 10    # statement terminator
    UNKNOWN = 11
28
+
29
+
30
class Token:
    """A classified piece of query text: a category plus the text itself."""

    def __init__(self, type: str, value: str):
        """Store the token's category and text.

        Args:
            type: Category of the token. The annotation says ``str``, but the
                tokenizer in this module passes ``TokenType`` members.
            value: Text carried by the token.
        """
        self.type, self.value = type, value

    def __repr__(self):
        """Return a debugging representation showing the type and value."""
        rendered = f"Token(type={self.type}, value='{self.value}')"
        return rendered
43
+
44
+
45
class Tokenizer:
    """Turn raw Goatspeak query text into a flat list of ``Token`` objects.

    The vocabulary sets created in ``__init__`` drive classification. Lexemes
    are lower-cased before they are looked up, so keywords are matched
    case-insensitively.
    """

    # Quoted strings are listed before the comment alternative so that a "//"
    # inside quotes (for example in a URL) is never treated as a comment.
    _COMMENT_OR_STRING_RE = re.compile(
        r"""
        (?:'[^'\\]*(?:\\.[^'\\]*)*'   |   # single-quoted string
           "[^"\\]*(?:\\.[^"\\]*)*"   |   # double-quoted string
           //.*$                          # line comment
        )
        """,
        flags=re.MULTILINE | re.VERBOSE,
    )

    # Either a whole quoted string or a single newline; used to normalise
    # line breaks without touching the quote pairing.
    _NEWLINE_OR_QUOTED_RE = re.compile(r'"[^"]*"|\'[^\']*\'|\n')

    # One alternative per lexeme shape; anything that matches none of them
    # (whitespace, stray punctuation) is skipped by ``finditer``.
    _LEXEME_RE = re.compile(
        r'(--[A-Za-z0-9_-]+|'
        r'\bSELECT\b|\bSCRAPE\b|\bEXTRACT\b|\bOUTPUT\b|\bVISIT\b|\bIN\b|\bIF\b|'
        r'!=|==|=|;|\n|'
        r'"(?:[^"]*)"|\'(?:[^\']*)\'|'
        r'@?[A-Za-z_][A-Za-z0-9_-]*|'
        r'\d+)',
        flags=re.IGNORECASE,
    )

    def __init__(self):
        """Build the vocabulary sets used to classify lexemes."""
        self.ACTIONS = {"select", "scrape", "extract", "output", "visit"}
        self.CONDITIONALS = {"if", "in"}
        self.KEYWORDS = {"position"}
        # "==" is produced by the lexeme pattern, so it has to be listed here
        # as well; otherwise it would be classified as an identifier.
        self.OPERATORS = {"=", "==", "!="}
        self.NEGATIONS = {"not"}
        self.FILE_TYPES = {"json", "csv"}
        # Left as an empty dict literal; nothing in this class reads it.
        self.FLAGS = {}

    def _preprocess_query(self, query: str) -> str:
        """Strip non-code text from *query* before tokenizing.

        Three passes are applied in order:

        1. every ``[...]`` span is removed (the match may cross lines);
        2. a leading ``!goatspeak`` marker is removed, ignoring case;
        3. ``//`` line comments are removed, except where the ``//`` sits
           inside a quoted string.

        Args:
            query: Raw query text.

        Returns:
            The query text with those spans removed.
        """
        query = re.sub(r'\[.*?\]', '', query, flags=re.DOTALL)
        query = re.sub(r'^\s*!goatspeak\s*', '', query, flags=re.IGNORECASE)

        def replacer(m):
            s = m.group(0)
            # Only remove the match if it is a comment, not quoted text.
            return '' if s.strip().startswith('//') else s

        return self._COMMENT_OR_STRING_RE.sub(replacer, query)

    @staticmethod
    def _flatten_newlines(match) -> str:
        """Replace a bare newline by a space; drop newlines inside a quoted string."""
        text = match.group(0)
        if text == "\n":
            return " "
        return text.replace("\n", "")

    def _split_lexemes(self, query: str) -> "list[str]":
        """Split already-preprocessed query text into raw lexeme strings.

        A newline outside quotes becomes a space, so two words separated only
        by a line break stay separate lexemes instead of fusing into one.
        Newlines inside a quoted string are removed, which keeps a quoted
        value that was wrapped over several lines joined as before.

        Args:
            query: Query text, normally the output of ``_preprocess_query``.

        Returns:
            The matched lexemes in source order; quoted strings keep their
            surrounding quotes.
        """
        flattened = self._NEWLINE_OR_QUOTED_RE.sub(self._flatten_newlines, query)
        return [match.group(0) for match in self._LEXEME_RE.finditer(flattened)]

    def tokenize(self, query: str) -> "list[Token]":
        """Convert query text into a list of classified tokens.

        Args:
            query: Raw query text.

        Returns:
            One ``Token`` per lexeme, in source order.
        """
        cleaned = self._preprocess_query(query)
        return [self._classify_token(lexeme) for lexeme in self._split_lexemes(cleaned)]

    def _classify_token(self, raw_value: str) -> "Token":
        """Map one raw lexeme to a ``Token``.

        Quoted text is always an identifier (with the quotes stripped), even
        when its content equals a keyword. Everything else is compared in
        lower case against the vocabulary sets; the first match wins and the
        fallback is an identifier that keeps its original spelling.

        Args:
            raw_value: A non-empty lexeme as produced by the lexeme pattern.

        Returns:
            The classified token.
        """
        if raw_value[0] in ('"', "'") and raw_value[-1] == raw_value[0]:
            return Token(TokenType.IDENTIFIER, raw_value[1:-1])
        val_lower = raw_value.lower()
        if val_lower.startswith("--"):
            # Flag names are normalised: "--foo-bar" becomes "foo_bar".
            return Token(TokenType.FLAG, val_lower[2:].replace("-", "_"))
        if val_lower in self.ACTIONS:
            return Token(TokenType.ACTION, val_lower)
        if val_lower in self.CONDITIONALS:
            return Token(TokenType.CONDITIONAL, val_lower)
        if val_lower in self.KEYWORDS:
            return Token(TokenType.KEYWORD, val_lower)
        if val_lower in self.OPERATORS:
            return Token(TokenType.OPERATOR, val_lower)
        if val_lower in self.NEGATIONS:
            return Token(TokenType.NEGATION, val_lower)
        if raw_value == ";":
            return Token(TokenType.SEMICOLON, raw_value)
        if val_lower.isdigit():
            return Token(TokenType.NUMBER, val_lower)
        if val_lower in self.FILE_TYPES:
            return Token(TokenType.FILE_TYPE, val_lower)
        return Token(TokenType.IDENTIFIER, raw_value)
120
+
121
+
122
class Parser(ABC):
    """Abstract base class for the statement parsers in this module."""

    @abstractmethod
    def parse(self, tokens: "list[Token]", index) -> tuple:
        """Parse starting at ``tokens[index]``.

        Subclasses must override this method. The base implementation does
        nothing and returns ``None``.

        Args:
            tokens: The full token list being parsed.
            index: Position in *tokens* at which parsing starts.
        """
        return None
130
+
131
+
132
class FlagParser(Parser):
    """Collects the ``--flag [value]`` options that trail a statement."""

    def __init__(self):
        """Stateless; there is nothing to initialise."""

    def parse(self, tokens, index) -> tuple:
        """Read flags from ``tokens[index]`` up to the next semicolon.

        A flag followed by an identifier token takes that identifier's text
        as its value; any other following token leaves the flag set to
        ``True`` and is not consumed.

        Args:
            tokens: The full token list.
            index: Position of the first flag token.

        Returns:
            ``(flags, index)`` where *flags* maps flag names to values and
            *index* points at the terminating semicolon, which is not
            consumed.

        Raises:
            SyntaxError: If a token that is neither a flag nor a semicolon
                appears where a flag is expected.
        """
        flags = {}
        while (token := tokens[index]).type != TokenType.SEMICOLON:
            if token.type != TokenType.FLAG:
                raise SyntaxError(f"Expected flag at token {token}")
            index += 1
            candidate = tokens[index]
            if candidate.type == TokenType.IDENTIFIER:
                flags[token.value] = candidate.value
                index += 1
            else:
                # Bare flag: leave the following token for the next iteration.
                flags[token.value] = True
        return flags, index
159
+
160
+
161
class ConditionParser(Parser):
    """Parses one ``IF`` or ``IN`` clause attached to a scrape/select statement."""

    def __init__(self):
        """Stateless; there is nothing to initialise."""

    def parse(self, tokens, index, element) -> tuple:
        """Parse one optionally negated condition starting at ``tokens[index]``.

        Note that this signature takes an extra *element* argument compared
        with the abstract ``Parser.parse``.

        Args:
            tokens: The full token list.
            index: Position of the condition's first token (``NOT``, ``IF``
                or ``IN``).
            element: Name of the element the enclosing statement targets;
                passed to the condition as ``query_tag``.

        Returns:
            ``(condition, index)`` with *index* pointing just past the
            condition. Returns ``None`` if the conditional word is neither
            ``"if"`` nor ``"in"``.

        Raises:
            SyntaxError: If the token after the optional negation is not a
                conditional, or if the clause itself is malformed.
        """
        negated = tokens[index].type == TokenType.NEGATION
        if negated:
            index += 1

        token = tokens[index]
        if token.type != TokenType.CONDITIONAL:
            raise SyntaxError(f"Expected conditional at {token}")

        handlers = {"if": self._parse_if, "in": self._parse_in}
        handler = handlers.get(token.value)
        if handler is None:
            return None
        return handler(tokens, index, element, negated)

    def _parse_if(self, tokens, index, element, negated) -> tuple:
        """Parse ``IF key`` or ``IF key <op> value``; *index* points at ``IF``.

        Without an operator the condition is built with ``value=None`` and
        the token after the key is left unconsumed.

        Returns:
            ``(IfCondition, index)`` with *index* just past the clause.

        Raises:
            SyntaxError: If the key is not an identifier, or the token after
                the operator is neither an identifier nor a number.
        """
        index += 1
        token = tokens[index]
        if token.type != TokenType.IDENTIFIER:
            raise SyntaxError(f"Expected key after IF at {token}")
        key = token.value

        index += 1
        token = tokens[index]
        if token.type != TokenType.OPERATOR:
            bare = IfCondition(key=key, value=None, negated=negated, query_tag=element)
            return bare, index

        # NOTE(review): "!=" forces negated to True even when a leading NOT
        # already set it, so "NOT ... !=" does not cancel out - confirm intent.
        negated = negated or token.value == "!="

        index += 1
        token = tokens[index]
        if token.type not in {TokenType.IDENTIFIER, TokenType.NUMBER}:
            raise SyntaxError(f"Expected value after IF {key} = at {token}")
        compared = IfCondition(key=key, value=token.value, negated=negated, query_tag=element)
        return compared, index + 1

    def _parse_in(self, tokens, index, element, negated) -> tuple:
        """Parse ``IN target`` or ``IN <keyword> <op> number``; *index* points at ``IN``.

        Any keyword token selects the positional form, whose target is the
        literal string ``"POSITION"`` and whose value is converted to ``int``.

        Returns:
            ``(InCondition, index)`` with *index* just past the clause.

        Raises:
            SyntaxError: If the target is not an identifier, or the
                positional form lacks an operator or a number.
        """
        index += 1
        token = tokens[index]

        if token.type != TokenType.KEYWORD:
            if token.type != TokenType.IDENTIFIER:
                raise SyntaxError(f"Expected element after IN at {token}")
            contained = InCondition(target=token.value, negated=negated, query_tag=element)
            return contained, index + 1

        index += 1
        token = tokens[index]
        if token.type != TokenType.OPERATOR:
            raise SyntaxError(f"Expected '=' after IN POSITION at {token}")
        negated = negated or token.value == "!="

        index += 1
        token = tokens[index]
        if token.type != TokenType.NUMBER:
            raise SyntaxError(f"Expected number after IN POSITION = at {token}")
        positional = InCondition(
            target="POSITION",
            value=int(token.value),
            negated=negated,
            query_tag=element,
        )
        return positional, index + 1
231
+
232
+
233
class ScrapeSelectParser(Parser):
    """Parses ``SCRAPE`` / ``SELECT`` statements into ``GrazeCommand`` objects."""

    def __init__(self, condition_parser: ConditionParser, flag_parser: FlagParser):
        """Keep the collaborating parsers for conditions and flags."""
        self.flag_parser = flag_parser
        self.condition_parser = condition_parser

    def parse(self, tokens, index) -> tuple:
        """Parse ``<action> [count] <element> [conditions...] [flags...] ;``.

        Args:
            tokens: The full token list.
            index: Position of the action token.

        Returns:
            ``(GrazeCommand, index)`` with *index* one past the token that
            ended the statement. ``count`` is 0 when no number was given.

        Raises:
            SyntaxError: If no identifier follows the action (and optional
                count), or a condition or flag is malformed.
        """
        action = tokens[index].value
        index += 1

        # Optional numeric count directly after the action word.
        has_count = tokens[index].type == TokenType.NUMBER
        count = int(tokens[index].value) if has_count else 0
        if has_count:
            index += 1

        if tokens[index].type != TokenType.IDENTIFIER:
            raise SyntaxError(f"Expected element at token {tokens[index]}")
        element = tokens[index].value
        index += 1

        # Conditions run until the flags or the terminating semicolon begin.
        conditions = []
        while tokens[index].type not in (TokenType.SEMICOLON, TokenType.FLAG):
            parsed, index = self.condition_parser.parse(tokens, index, element)
            conditions.append(parsed)

        if tokens[index].type == TokenType.FLAG:
            flags, index = self.flag_parser.parse(tokens, index)
        else:
            flags = {}

        command = GrazeCommand(
            action=action,
            count=count,
            element=element,
            conditions=conditions,
            **flags,
        )
        return command, index + 1
273
+
274
+
275
class ExtractParser(Parser):
    """Parses ``EXTRACT`` statements into ``ChurnCommand`` objects."""

    def __init__(self, flag_parser: FlagParser):
        """Keep the parser used for trailing flags."""
        self.flag_parser = flag_parser

    def parse(self, tokens, index) -> tuple:
        """Parse ``EXTRACT <field>... [flags...] ;``.

        Only identifier tokens are collected as fields; any other token that
        appears before the flags or the semicolon is skipped.

        Args:
            tokens: The full token list.
            index: Position of the ``EXTRACT`` token.

        Returns:
            ``(ChurnCommand, index)`` with *index* one past the token that
            ended the statement.
        """
        index += 1  # step over the action word

        fields = []
        while tokens[index].type not in (TokenType.SEMICOLON, TokenType.FLAG):
            current = tokens[index]
            if current.type == TokenType.IDENTIFIER:
                fields.append(current.value)
            index += 1

        if tokens[index].type == TokenType.FLAG:
            flags, index = self.flag_parser.parse(tokens, index)
        else:
            flags = {}

        command = ChurnCommand(fields=fields, **flags)
        return command, index + 1
303
+
304
+
305
class OutputParser(Parser):
    """Parses ``OUTPUT`` statements into ``DeliverCommand`` objects."""

    def __init__(self, flag_parser: FlagParser):
        """Keep the parser used for trailing flags."""
        self.flag_parser = flag_parser

    def parse(self, tokens, index) -> tuple:
        """Parse ``OUTPUT <file type> [flags...] ;``.

        Args:
            tokens: The full token list.
            index: Position of the ``OUTPUT`` token.

        Returns:
            ``(DeliverCommand, index)`` with *index* one past the token that
            follows the file type and any flags.

        Raises:
            SyntaxError: If the token after ``OUTPUT`` is not a file type.
        """
        index += 1  # step over the action word

        if tokens[index].type != TokenType.FILE_TYPE:
            raise SyntaxError(f"Expected file type at token {tokens[index]}")
        file_type = tokens[index].value
        index += 1

        if tokens[index].type == TokenType.FLAG:
            flags, index = self.flag_parser.parse(tokens, index)
        else:
            flags = {}

        command = DeliverCommand(file_type=file_type, **flags)
        return command, index + 1
331
+
332
+
333
class VisitParser(Parser):
    """Parses ``VISIT`` statements into ``FetchCommand`` objects."""

    def __init__(self, flag_parser: FlagParser):
        """Keep the parser used for trailing flags."""
        self.flag_parser = flag_parser

    def parse(self, tokens, index) -> tuple:
        """Parse ``VISIT <url> [flags...] ;``.

        Args:
            tokens: The full token list.
            index: Position of the ``VISIT`` token.

        Returns:
            ``(FetchCommand, index)`` with *index* one past the token that
            follows the URL and any flags.

        Raises:
            SyntaxError: If the token after ``VISIT`` is not an identifier.
        """
        index += 1  # step over the action word

        if tokens[index].type != TokenType.IDENTIFIER:
            raise SyntaxError(f"Expected URL at token {tokens[index]}")
        url = tokens[index].value
        index += 1

        if tokens[index].type == TokenType.FLAG:
            flags, index = self.flag_parser.parse(tokens, index)
        else:
            flags = {}

        command = FetchCommand(url=url, **flags)
        return command, index + 1
359
+
360
+
361
class Interpeter:
    """Front end that turns Goatspeak source text into ``GoatspeakBlock`` objects.

    NOTE: the class name is misspelled ("Interpeter"). It is kept as-is
    because renaming it would break existing imports.
    """

    def __init__(self):
        """Create the tokenizer and one parser per supported action word."""
        self.tokenizer = Tokenizer()
        self.condition_parser = ConditionParser()
        self.flag_parser = FlagParser()
        # SCRAPE and SELECT share a grammar; each key gets its own parser instance.
        self.action_parsers = {
            "visit": VisitParser(self.flag_parser),
            "scrape": ScrapeSelectParser(self.condition_parser, self.flag_parser),
            "select": ScrapeSelectParser(self.condition_parser, self.flag_parser),
            "extract": ExtractParser(self.flag_parser),
            "output": OutputParser(self.flag_parser),
        }

    def _flush_query(self, instructions, goatspeak_blocks) -> None:
        """Bundle *instructions* into a ``Query`` and attach it to a block.

        The query receives the first ``visit``, ``extract`` and ``output``
        command found (or ``None``) and every ``scrape`` / ``select``
        command. It is appended to the last block; when no block exists yet
        a first one is created from the query's own fetch command.

        Args:
            instructions: Commands that make up one query.
            goatspeak_blocks: Block list, mutated in place.
        """
        fetch_command = next((cmd for cmd in instructions if cmd.action == "visit"), None)
        graze_commands = [cmd for cmd in instructions if cmd.action in ("scrape", "select")]
        churn_command = next((cmd for cmd in instructions if cmd.action == "extract"), None)
        deliver_command = next((cmd for cmd in instructions if cmd.action == "output"), None)

        query = Query(
            graze_commands=graze_commands,
            fetch_command=fetch_command,
            churn_command=churn_command,
            deliver_command=deliver_command,
        )

        if goatspeak_blocks:
            goatspeak_blocks[-1].query_list.append(query)
        else:
            # No block exists until the first query is flushed; indexing
            # goatspeak_blocks[-1] here would raise IndexError.
            goatspeak_blocks.append(GoatspeakBlock(fetch_command=fetch_command, query_list=[query]))

    def _manage_interpreter_state(self, instructions, goatspeak_blocks) -> tuple:
        """Close the current query when the newest instruction starts a new one.

        A boundary is detected when a ``scrape`` / ``extract`` / ``output``
        command is directly followed by a ``scrape`` / ``select`` / ``visit``
        command. Everything before the newest instruction is flushed as one
        query. If the newest instruction is a ``visit``, a new empty block
        is opened for it.

        Args:
            instructions: Commands accumulated for the query in progress,
                including the one just parsed.
            goatspeak_blocks: Block list, mutated in place.

        Returns:
            ``(instructions, goatspeak_blocks)``; after a boundary the
            instruction list contains only the newest instruction.
        """
        starts_new_query = (
            len(instructions) >= 2
            and instructions[-2].action in ("scrape", "extract", "output")
            and instructions[-1].action in ("scrape", "select", "visit")
        )
        if not starts_new_query:
            return instructions, goatspeak_blocks

        next_instruction = instructions[-1]
        self._flush_query(instructions[:-1], goatspeak_blocks)

        if next_instruction.action == "visit":
            goatspeak_blocks.append(GoatspeakBlock(fetch_command=next_instruction, query_list=[]))
        return [next_instruction], goatspeak_blocks

    def interpret(self, query: str) -> "list[GoatspeakBlock]":
        """Tokenize and parse *query* into blocks of queries.

        Args:
            query: Goatspeak source text.

        Returns:
            The blocks built from the statements, in source order. Empty
            when the text contains no statements.

        Raises:
            SyntaxError: If a statement does not start with a known action,
                a statement is malformed, or the text ends in the middle of
                a statement (for example a missing terminating ``;``).
        """
        tokens = self.tokenizer.tokenize(query)
        instructions = []
        goatspeak_blocks = []
        index = 0

        while index < len(tokens):
            token = tokens[index]
            if token.type != TokenType.ACTION:
                raise SyntaxError(f"Expected action at token {token}")

            parser = self.action_parsers.get(token.value)
            if parser is None:
                raise SyntaxError(f"Unknown action '{token.value}' at token {token}")

            try:
                instruction, index = parser.parse(tokens, index)
            except IndexError as err:
                # The parsers index the token list without bounds checks, so
                # running off the end means the statement was cut short.
                # The original error stays attached as the cause.
                raise SyntaxError(
                    "Unexpected end of query; is a statement missing its terminating ';'?"
                ) from err

            instructions.append(instruction)
            instructions, goatspeak_blocks = self._manage_interpreter_state(instructions, goatspeak_blocks)

        # Whatever is still pending forms the final query.
        if instructions:
            self._flush_query(instructions, goatspeak_blocks)
        return goatspeak_blocks
448
+
449
+
450
def main():
    """Placeholder entry point; it currently performs no work."""
    return None


if __name__ == "__main__":
    main()
@@ -0,0 +1,27 @@
1
+ """
2
+ """
3
+
4
class Milkmaid:
    """Runs a churn command over every item in a result list."""

    def __init__(self):
        """Stateless; there is nothing to initialise."""

    def churn(self, results: list, churn_command) -> None:
        """Call ``churn_command.execute`` once per item of *results*, in order.

        Args:
            results: Items to process.
            churn_command: Object exposing an ``execute(item)`` method; it is
                not touched when *results* is empty.
        """
        for item in results:
            churn_command.execute(item)
18
+
19
+
20
def main():
    """Placeholder entry point; it currently performs no work."""
    return None


if __name__ == "__main__":
    main()
@@ -0,0 +1,32 @@
1
+ """
2
+ """
3
+
4
class Milkman:
    """Hands results to a deliver command and reads text files back in."""

    def __init__(self):
        """Stateless; there is nothing to initialise."""

    def deliver(self, results: list, deliver_command) -> None:
        """Pass the whole *results* list to ``deliver_command.execute`` in one call.

        Args:
            results: Items to deliver.
            deliver_command: Object exposing an ``execute(results)`` method.
        """
        deliver_command.execute(results)

    def receive(self, filepath) -> str:
        """Return the entire contents of *filepath*, decoded as UTF-8.

        Args:
            filepath: Path of the file to read.

        Returns:
            The file's text.
        """
        with open(filepath, "r", encoding="utf-8") as handle:
            contents = handle.read()
        return contents
23
+
24
+
25
def main():
    """Placeholder entry point; it currently performs no work."""
    return None


if __name__ == "__main__":
    main()